# 问答系统

读取文件，并把内容分别写到两个list里

In [1]:
import json
def read_corpus(filepath):
    with open(filepath) as f:
            data = json.load(f)
    qlist = list()
    alist = list() 
    for item in data['data']:
        for para in item['paragraphs']:
            for qa in para['qas']:  
                qlist.append(qa['question'])
                # 部分answers的list为空，所以会引发IndexError
                try:
                    alist.append(qa['answers'][0]['text'])
                except IndexError:
                    qlist.pop()
    assert len(qlist) == len(alist)  # 确保长度一样
    return qlist, alist

In [9]:
qlist, alist = read_corpus('../data/train-v2.0.json')
print("问答数量：%d" % len(qlist))
print(qlist[-4:])
print(alist[:4])

问答数量：86821
['What was Yangon previously known as?', 'With what Belorussian city does Kathmandu have a relationship?', 'In what year did Kathmandu create its initial international relationship?', 'What is KMC an initialism of?']
['in the late 1990s', 'singing and dancing', '2003', 'Houston, Texas']


In [3]:
from collections import Counter
import matplotlib.pyplot as plt
qlist, alist = read_corpus('../data/train-v2.0.json')
word_cut = Counter()
for item in qlist:
    word_cut.update(item.strip(".?!").split())
value_sort = sorted(word_cut.values(), reverse=True)
plt.subplot(221)
plt.plot(value_sort)
plt.subplot(222)
plt.plot(value_sort[:2000])
plt.subplot(223)
plt.plot(value_sort[:200])
plt.subplot(224)
plt.plot(value_sort[:20])
plt.show()

merge = dict(zip(word_cut.values(),word_cut.keys()))
print([[merge[v],v] for v in value_sort[:20]])

<Figure size 640x480 with 4 Axes>

[['the', 60960], ['What', 36995], ['of', 33969], ['in', 21767], ['to', 18417], ['was', 17063], ['is', 16197], ['did', 15634], ['what', 13187], ['a', 10753], ['How', 8023], ['Who', 8009], ['and', 7229], ['for', 7175], ['many', 5497], ['are', 5454], ['When', 5367], ['that', 4435], ['were', 4428], ['does', 4331]]


预文本处理

In [24]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
import math
from sklearn.feature_extraction.text import TfidfVectorizer 

sw = set(stopwords.words('english'))
ps = PorterStemmer()
def text_process(text):
    seg = list()
    for word in word_tokenize(text):
         # 小写化、词干提取
        word = ps.stem(word.lower())
          # 数值归一
         # isdigit:检测字符串是否只由数字组成
        word = "#number" if word.isdigit() else word
         # 去停用词
        if len(word)>1 and word not in sw:
            seg.append(word)
    return seg
qlist, alist = read_corpus('../data/train-v2.0.json')
word_cut = Counter()
qlist_seg = list()
for text in qlist:
    seg = text_process(text)
    qlist_seg.append(seg)
    word_cut.update(seg)
# print(word_cut.values()) 
# python sorted函数是对可迭代对象的排序，参数有cmp,key,reverse.
# cmp:比较的函数，有两个参数，大于返回1，小于返回-1，等于返回0。
# key:用来比较进行排序的元素。
# reverse：排序的规则，reverse = True是降序
value_sort = sorted(word_cut.values(), reverse=True)
min_tf = value_sort[int(math.exp(0.99*math.log(len(word_cut))))]
for cur in range(len(qlist_seg)):
    qlist_seg[cur] = [word for word in qlist_seg[cur] if word_cut[word] >min_tf]
# for in if 连击推导式隐式的将link由str转变为了list,相当于：
# for word in qlist_seg[cur]:
#     if word_cut[word] > min_tf:
#         qlist_seg[cur] = word
print(qlist_seg[:10])
# 把qlist中的每一个问题字符串转换成tf-idf向量，计算其相似度问题
vectorizer = TfidfVectorizer() # 定义一个tf-idf的vectorizer
# 结果存放在X矩阵
X = vectorizer.fit_transform([' '.join(seg) for seg in qlist_seg])
def sparsity(X):
#     n=nnz(X)返回矩阵X中的非零元素的数目
    return 1.0-X.nnz/float(X.shape[0]*X.shape[1])
print("打印稀疏度：",sparsity(X))

[['beyonc', 'start', 'becom', 'popular'], ['area', 'beyonc', 'compet', 'wa', 'grow'], ['beyonc', 'leav', 'destini', "'s", 'child', 'becom', 'solo', 'singer'], ['citi', 'state', 'beyonc', 'grow'], ['decad', 'beyonc', 'becom', 'famou'], ['group', 'wa', 'lead', 'singer'], ['album', 'made', 'worldwid', 'known', 'artist'], ['manag', 'destini', "'s", 'child', 'group'], ['beyoncé', 'rise', 'fame'], ['role', 'beyoncé', 'destini', "'s", 'child']]
打印稀疏度： 0.9996166194981876


对于用户的输入问题，找到相似度最高的TOP5问题，并把5个潜在的答案做返回

In [26]:
from queue import PriorityQueue
def top5results(question):
#     给定用户输入的问题 input_q, 返回最有可能的TOP 5问题。这里面需要做到以下几点：
#     1. 对于用户的输入 input_q 首先做一系列的预处理，然后再转换成tf-idf向量（利用上面的vectorizer)
#     2. 计算跟每个库里的问题之间的相似度
#     3. 找出相似度最高的top5问题的答案
    q_vector = vectorizer.transform([' '.join(text_process(question))])
#     print(q_vector.T * X)
    # 计算余弦相似度，tfidf默认l2范数；矩阵乘法
    sim = (X * q_vector.T).toarray()
#     print(sim)
    pq = PriorityQueue()
    for cur in range(sim.shape[0]):
        pq.put((sim[cur][0],cur))
        if len(pq.queue) > 5:
            pq.get()
    
    pq_rank = sorted(pq.queue,reverse = True,key = lambda x:x[0])
#     print([x[1] for x in pq_rank])
    top_idxs = [x[1] for x in pq_rank]  # top_idxs存放相似度最高的（存在qlist里的）问题的下表 
#     print(top_idxs)
    return [alist[i] for i in top_idxs]  # 返回相似度最高的问题对应的答案，作为TOP5答案    
print(top5results("Which airport was shut down?"))   
# print(top5results("Which airport is closed?"))

['Chengdu Shuangliu International Airport', 'Chengdu Shuangliu International Airport', 'aerodrome with facilities for flights to take off and land', 'newspapers', 'various gaming sites']


# 倒排表
上面的算法，一个最大的缺点是每一个用户问题都需要跟库里的所有的问题都计算相似度。假设我们库里的问题非常多，这将是效率非常低的方法。 这里面一个方案是通过倒排表的方式，先从库里面找到跟当前的输入类似的问题描述。然后针对于这些candidates问题再做余弦相似度的计算。这样会节省大量的时间

In [27]:
from collections import defaultdict
# 制定一个简单的倒排表
invert = defaultdict(set)
for cur in range(len(qlist_seg)):
    for word in qlist_seg[cur]:
        invert[word].add(cur)
        
def topresults(question):
    seg = text_process(question)
    candidates = set()
    for word in seg:
         # 取所有包含任意一个词的文档的并集
        candidates = candidates | invert[word]
    candidates = list(candidates)
    q_vector = vectorizer.transform([' '.join(seg)])
    sim = (X[candidates] * q_vector.T).toarray()
    
    pq = PriorityQueue()
    for cur in range(sim.shape[0]):
        pq.put((sim[cur][0],candidates[cur]))
        if len(pq.queue) > 5:
            pq.get()
    pq_rank = sorted(pq.queue,reverse = True,key = lambda x:x[0])
#     print([x[0] for x in pq_rank])
    top_idxs = [x[1] for x in pq_rank]  # top_idxs存放相似度最高的（存在qlist里的）问题的下表 
    return [alist[i] for i in top_idxs]  # 返回相似度最高的问题对应的答案，作为TOP5答案    

In [28]:
print(topresults("Which airport was shut down?"))  
print(top5results("Which airport was shut down?"))
print(topresults("Which government stopped aid after Hurricane Nargis?"))  
print(top5results("Which government stopped aid after Hurricane Nargis?"))

['Chengdu Shuangliu International Airport', 'Chengdu Shuangliu International Airport', 'aerodrome with facilities for flights to take off and land', 'newspapers', 'various gaming sites']
['Chengdu Shuangliu International Airport', 'Chengdu Shuangliu International Airport', 'aerodrome with facilities for flights to take off and land', 'newspapers', 'various gaming sites']
['Myanmar', 'Isabel', 'foreign aid', 'Soviet Union and China', '10 days']
['Myanmar', 'Isabel', 'foreign aid', 'Soviet Union and China', '10 days']


# 基于词向量的文本表示
上面所用到的方法论是基于词袋模型（bag-of-words model）。这样的方法论有两个主要的问题：1. 无法计算词语之间的相似度 2. 稀疏度很高。 接下来我们采用词向量作为文本的表示，下载训练好的d=100(100维)的词向量glove.6B.zip。

In [30]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np
# 将GloVe转为word2vec 介绍：将词汇转化为机器能够读懂的向量
_ = glove2word2vec('../data/glove.6B/glove.6B.100d.txt', '../data/glove.6B/glove2word2vec.6B.100d.txt')
model = KeyedVectors.load_word2vec_format('../data/glove.6B/glove2word2vec.6B.100d.txt')
print(model)
def sentence_vectorizer(seg):
#     将分词数据转为句向量。
#     seg: 分词后的数据    
    vector = np.zeros((1,100))
    size = len(seg)
    for word in seg:
        try:
            vector += model.wv[word]
        except KeyError:
            size -= 1
    return vector/size
X = np.zeros((len(qlist_seg), 100))
for cur in range(X.shape[0]):
    X[cur] = sentence_vectorizer(qlist_seg[cur])
# 计算X每一行的l2范数
Xnorm2 = np.linalg.norm(X, axis=1, keepdims=True)
X = X / Xnorm2

def top5results_vector(question):
    seg = text_process(question)
    candidates = set()
    for word in seg:
        candidates = candidates | invert[word]
    candidates = list(candidates)
    q_vector = sentence_vectorizer(seg)
    qnorm2 = np.linalg.norm(q_vector, axis=1, keepdims=True)
    q_vector = q_vector / qnorm2
    sim = (X[candidates] * q_vector.T)
    # 使用优先队列找出top5
    pq = PriorityQueue()
    for cur in range(sim.shape[0]):
        pq.put((sim[cur][0], candidates[cur]))
        if len(pq.queue) > 5:
            pq.get()    
    pq_rank = sorted(pq.queue,reverse = True,key = lambda x:x[0])
    top_idxs = [x[1] for x in pq_rank]  # top_idxs存放相似度最高的（存在qlist里的）问题的下表 
    return [alist[i] for i in top_idxs]  # 返回相似度最高的问题对应的答案，作为TOP5答案                

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000000000EED9DD8>


  from ipykernel import kernelapp as app


In [31]:
# print(top5results("Which airport was shut down?"))
print(top5results_vector("Which airport was shut down?"))

  from ipykernel import kernelapp as app


ValueError: operands could not be broadcast together with shapes (17026,100) (100,1) 