<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#参考段落检查" data-toc-modified-id="参考段落检查-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>参考段落检查</a></span></li></ul></div>

In [1]:
import os
import time
import json
import torch
import numpy
import pandas
import logging

from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, BCELoss
from torch.optim import Adam, lr_scheduler
from sklearn.metrics import auc, confusion_matrix, accuracy_score, roc_auc_score

from setting import *
from config import QAModelConfig, DatasetConfig

from src.dataset import generate_dataloader, Dataset
from src.evaluation_tools import evaluate_qa_model_choice, evaluate_qa_model_judgment, evaluate_classifier
from src.plot_tools import plot_roc_curve, plot_pr_curve
from src.qa_model import BaseChoiceModel, BaseJudgmentModel, ReferenceChoiceModel, ReferenceJudgmentModel
from src.torch_tools import save_checkpoint, load_checkpoint
from src.utils import initialize_logger, terminate_logger, load_args, save_args


# 参考段落检查

In [2]:
# --- Arguments ---------------------------------------------------------------
# Load the argument object through load_args using DatasetConfig, then override
# the fields needed for this reference-paragraph check.
# NOTE(review): the saved output of this cell contains an argparse
# "unrecognized arguments: -f ..." message; presumably load_args parses
# sys.argv, which carries the kernel's own flags under Jupyter -- confirm.
args = load_args(Config=DatasetConfig)

# Turn reference usage on and select 'tfidf' as the retrieval model name.
args.use_reference = True
args.retrieval_model_name = 'tfidf'

# Same batch size (2) for the train / valid / test settings.
args.train_batch_size = args.valid_batch_size = args.test_batch_size = 2

# Leave both embedding settings unset.
args.word_embedding = args.document_embedding = None

# --- Dataset -----------------------------------------------------------------
mode, pipeline = 'train', 'choice'
for_debug = do_export = False
dataset = Dataset(
    args=args,
    mode=mode,
    do_export=do_export,
    pipeline=pipeline,
    for_debug=for_debug,
)

usage: -- [-h] [--filter_stopword FILTER_STOPWORD]
          [--retrieval_model_name RETRIEVAL_MODEL_NAME]
          [--num_top_subject NUM_TOP_SUBJECT]
          [--num_best_per_subject NUM_BEST_PER_SUBJECT]
          [--use_reference USE_REFERENCE] [--use_userdict USE_USERDICT]
          [--train_batch_size TRAIN_BATCH_SIZE]
          [--valid_batch_size VALID_BATCH_SIZE]
          [--test_batch_size TEST_BATCH_SIZE]
          [--max_reference_length MAX_REFERENCE_LENGTH]
          [--max_statement_length MAX_STATEMENT_LENGTH]
          [--max_option_length MAX_OPTION_LENGTH]
          [--word_embedding WORD_EMBEDDING]
          [--document_embedding DOCUMENT_EMBEDDING] [--num_epoch NUM_EPOCH]
          [--lr_multiplier LR_MULTIPLIER] [--learning_rate LEARNING_RATE]
          [--weight_decay WEIGHT_DECAY] [--do_valid DO_VALID]
          [--do_valid_plot DO_VALID_PLOT] [--num_best NUM_BEST]
          [--num_workers NUM_WORKERS]
--: error: unrecognized arguments: -f C:\Users\caoyang\Ap

Function `build_similarity` runtime is 2.9344797134399414 seconds.
正在导出dataset
Function `build_similarity` runtime is 0.011523962020874023 seconds.
Function `build_similarity` runtime is 0.33296775817871094 seconds.
Function `build_similarity` runtime is 0.25089430809020996 seconds.
Function `build_similarity` runtime is 0.04689383506774902 seconds.
Function `build_similarity` runtime is 0.1567080020904541 seconds.
Function `build_similarity` runtime is 0.32657551765441895 seconds.
Function `build_similarity` runtime is 0.1092379093170166 seconds.
Function `build_similarity` runtime is 0.10982489585876465 seconds.
Function `build_similarity` runtime is 0.10971570014953613 seconds.
Function `build_similarity` runtime is 0.1102292537689209 seconds.
Function `build_similarity` runtime is 0.2752554416656494 seconds.
Function `build_similarity` runtime is 0.3767549991607666 seconds.
Function `build_similarity` runtime is 0.062131404876708984 seconds.
Function `build_similarity` runtime is 0



Function `choice_pipeline` runtime is 392.9249093532562 seconds.


In [5]:
# Display the 'tfidf' entry stored under the '宪法' subject's 'summary' in
# GENSIM_RETRIEVAL_MODEL_SUBJECT_SUMMARY.
# NOTE(review): the constant is not defined in this notebook; presumably it is
# brought in by `from setting import *` -- confirm.
GENSIM_RETRIEVAL_MODEL_SUBJECT_SUMMARY['宪法']['summary']['tfidf']

{'corpus': 'model\\retrieval_model\\gensim\\subject\\宪法\\reference_tfidf_corpus.cps',
 'model': 'model\\retrieval_model\\gensim\\subject\\宪法\\reference_tfidf.m',
 'dictionary': 'model\\retrieval_model\\gensim\\subject\\宪法\\reference_dictionary.dtn',
 'build_function': 'GensimRetrievalModel.build_tfidf_model',
 'class': 'gensim.models.TfidfModel',
 'sequence': ['tfidf']}

In [4]:
import gensim
from gensim.corpora import MmCorpus, Dictionary
from gensim.similarities import Similarity

In [8]:
# Look up the 'tfidf' summary entry for the '宪法' subject once; its 'model',
# 'corpus' and 'dictionary' values are the paths passed to the loaders below.
constitution_tfidf = GENSIM_RETRIEVAL_MODEL_SUBJECT_SUMMARY['宪法']['summary']['tfidf']

# Load the persisted TF-IDF model, the Matrix Market corpus and the dictionary.
model = gensim.models.TfidfModel.load(constitution_tfidf['model'])
corpus = MmCorpus(constitution_tfidf['corpus'])
dictionary = Dictionary.load(constitution_tfidf['dictionary'])

In [40]:
# Build a similarity index over the loaded corpus.
# - 'gensim_temp' is passed as the first positional argument (the output
#   prefix gensim uses for the index shard files).
# - The feature count is taken from the size of the loaded dictionary.
# - num_best=None: no top-N truncation is requested, so a query yields one
#   score per indexed document.
similarity = Similarity(
    'gensim_temp',
    corpus,
    num_features=len(dictionary),
    num_best=None,
)

In [37]:
# Tokenised query used for the similarity check below.
query_tokens = [
    '下列',
    '有关',
    '国体',
    '和',
    '政体',
    '说法',
    '正确',
    '的',
    '是',
    '?',
]
# Alternative (longer) query, kept disabled:
# query_tokens = ['公民', '行使', '集会', '、', '游行', '、', '示威', '权利', '时应', '向', '主管机关', '提出申请', '并', '获得', '许可', '。', '下列', '选项', '中', '哪些', '属于', '依法', '不予', '许可', '的', '情形', '?']

In [41]:
# Step 1: convert the query tokens to a bag-of-words vector with the dictionary.
query_bow = dictionary.doc2bow(query_tokens)

# Step 2: pass the bag-of-words vector through the loaded TF-IDF model.
query_corpus = model[query_bow]

# Step 3: query the similarity index with the transformed vector.
result = similarity[query_corpus]

In [34]:
# Count the non-zero entries in `result`.
# NOTE(review): this cell's execution count (34) is lower than that of the
# cells that build `similarity` (40) and `result` (41) above, so the saved
# value may come from an earlier state of `result` -- confirm on a fresh run.
numpy.count_nonzero(result)

38

In [43]:
# Print the raw `result` of the similarity query (the saved output shows a
# mostly-zero array of scores).
print(result)

[0.         0.         0.         0.         0.         0.
 0.         0.         0.0929906  0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.04905723 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

In [39]:
# Display `result`.
# NOTE(review): the saved output for this cell is a list of (index, score)
# pairs, whereas the index built in this notebook uses num_best=None and the
# printed `result` is a dense array. This output presumably comes from an
# earlier run (execution count 39) with num_best set -- confirm and re-run.
result

[(116, 0.09448226541280746),
 (8, 0.09299059957265854),
 (180, 0.07245595753192902),
 (137, 0.06240452080965042),
 (876, 0.06101685389876366),
 (604, 0.052845943719148636),
 (697, 0.05002279579639435),
 (40, 0.049057234078645706),
 (581, 0.04327135533094406)]