In [75]:
# -*- coding: UTF-8 -*-
import time
import jieba
import os
from gensim.models import word2vec
from gensim import models
import numpy as np
from scipy import spatial

In [76]:
# jieba segmentation modes:
#   precise mode       - cuts the sentence as accurately as possible; suited to
#                        text analysis (cut_all=False)
#   full mode          - scans out every word that can be formed from the
#                        sentence; fast (cut_all=True)
#   search-engine mode - re-cuts long words on top of precise mode to raise
#                        recall; suited to search-engine tokenization
#                        (jieba.cut_for_search(Content))
def jiebaCut(s):
    """Segment `s` with jieba in full mode and drop stop words.

    The token stream produced by `jieba.cut(s, cut_all=True)` is handed to
    `removeStopWords`, whose return value is returned unchanged.
    """
    # Full mode is the mode in use; jieba.cut_for_search(s) is the
    # search-engine alternative mentioned above.
    return removeStopWords(jieba.cut(s, cut_all=True))

# remove stopwords
def removeStopWords(words):
    """Return, in order, the items of `words` that are not in `stopWordsSet`.

    `words` may be any iterable (it is consumed once); `stopWordsSet` is the
    module-level collection looked up at call time. The result is a new list.
    """
    return [token for token in words if token not in stopWordsSet]

# define all state
def state(s, flag):
    """Classify one stripped input line and return ``(flag, nextline)``.

    A line whose first character is 'C', 'Q' or 'A' is a section marker: the
    returned flag becomes 1, 2 or 3 respectively and ``nextline`` is 0.
    Any other line (including ``None`` or the empty string) leaves ``flag``
    as passed in and yields ``nextline`` 1, i.e. the line is content to be
    tokenized by the caller.
    """
    if s is None or s == "":
        return flag, 1

    # (marker character, state code) pairs, checked in the original order.
    for marker, code in (('C', 1), ('Q', 2), ('A', 3)):
        if s[0] == marker:
            return code, 0

    # Not a marker: stay in the current state and treat the line as content.
    return flag, 1

In [77]:
def _normalizeAnswer(raw):
    """Reduce an answer-key line such as '（D）', '(D)' or ' D ' to its bare text.

    Surrounding whitespace and full-width / half-width parentheses are removed
    so that the key can be compared with the bare letter returned by the
    predictor.
    """
    return raw.strip().strip('（）()').strip()


def main():
    """Run the whole CQA evaluation and report the accuracy.

    Reads 'CQA.txt' (UTF-8) from the current directory line by line. `state`
    decides which section a line belongs to:

    * flag 1 - corpus text: each content line is tokenized with `jiebaCut`
      and, when it yields any token, stored as one sentence in `cList`;
    * flag 2 - question text: tokens are appended to the flat `qList`;
    * flag 3 - answer options: the first three characters of the line (the
      option label, e.g. '（B）') are dropped, the rest is tokenized and stored
      as one entry of `aList`.

    After four answer-option lines have been read, the next line is taken as
    the correct answer; `word2VecSum` is asked for a prediction, the pair is
    appended to 'wiki_true_result.txt', and the per-record state is reset.
    Finally the totals are appended to 'result.txt' and printed.

    Returns None. An input with no complete record reports an accuracy of 0
    instead of raising ZeroDivisionError.
    """
    sTime = time.time()
    print("Start process CQA dataset")
    cNum, accuracy = 0, 0
    # Explicit UTF-8: the dataset is Chinese text, so relying on the locale's
    # default encoding would make the parse platform-dependent.
    with open('CQA.txt', 'r', encoding='utf-8') as file:
        flag, end = 0, 0
        cList, qList, aList = [], [], []
        for line in file:
            s = line.strip()
            flag, nextline = state(s, flag)
            # Four options collected: this line is the answer key.
            if end == 4:
                ans = s
                print("Corpus: %d" % cNum)
                guessAns = word2VecSum(cList, qList, aList, cNum)
                # NOTE(review): per-record results go to 'wiki_true_result.txt'
                # while the summary below goes to 'result.txt' - confirm that
                # the two different file names are intended.
                with open('wiki_true_result.txt', 'a', encoding='utf-8') as res:
                    res.write("\nCorpus :" + str(cNum))
                    res.write("\nCorrect answer is: " + ans)
                    res.write("\nPredict answer is: " + guessAns)
                    res.write('\n')
                # Compare on the bare letter so a key written as '（D）' still
                # matches the predicted 'D'.
                if guessAns == _normalizeAnswer(ans):
                    accuracy += 1
                print("====== Final result ======")
                print("Correct answer is: %s." % (ans))
                print("Predict answer is: %s.\n" % (guessAns))
                # Reset everything that belongs to the finished record.
                cList, qList, aList = [], [], []
                flag, end = 0, 0
                cNum += 1
                continue
            # Section-marker line: nothing to tokenize.
            if nextline != 1:
                continue
            # on state 1, process Corpus
            if flag == 1:
                sentence = list(jiebaCut(s))
                if sentence:
                    cList.append(sentence)
            # on state 2, process Question
            elif flag == 2:
                qList.extend(jiebaCut(s))
            # on state 3, process Answer
            elif flag == 3:
                end += 1
                # example: "（B） 吃飯比讀書更為重要" -> drop the 3-character label
                optionText = s[3:].strip()
                aList.append(list(jiebaCut(optionText)))

    # Guard the empty-dataset case: there is nothing to divide by.
    accuracyPct = accuracy / cNum * 100 if cNum else 0.0
    with open('result.txt', 'a', encoding='utf-8') as res:
        res.write("\nTotal corpus number :" + str(cNum))
        res.write("\nAccuracy is :" + str(accuracyPct))
    print("\nTotal corpus numbers: %d" % cNum)
    print("Accuracy is %.3f percent" % (accuracyPct))
    print("Processing all CQA dataset corpus took %.2fs" % (time.time() - sTime))
        

In [78]:

def _sumWordVectors(words, vectors, target):
    """Add the vector of every known word in `words` onto `target` in place.

    `vectors` is any mapping-like object that returns a vector for a known
    word and raises KeyError for an unknown one. Returns ``(found, missing)``.
    """
    found, missing = 0, 0
    for w in words:
        try:
            vec = vectors[w]
        except KeyError:
            missing += 1
            continue
        # Whole-vector add; `target` is a float64 row, so the accumulation is
        # the same as adding component by component.
        target += vec
        found += 1
    return found, missing


def word2VecSum(cList, qList, aList, cNum):
    """Turn one CQA record into summed word vectors and return the predicted answer.

    Parameters
    ----------
    cList : list of token lists, one per corpus sentence.
    qList : flat list of question tokens.
    aList : list of token lists, one per answer option.
    cNum  : record index; only forwarded to `similarity`.

    Every sentence / the question / every option is represented by the sum of
    the vectors of its in-vocabulary tokens (out-of-vocabulary tokens are
    counted and skipped, so a row may stay all-zero). The three arrays are
    passed to `similarity`, whose return value is returned.

    Uses the module-level `model`. Vectors are read from `model.wv` when that
    attribute exists: indexing the model object directly is deprecated in
    gensim 1.x-3.x and removed in 4.0. The vector width comes from
    `model.vector_size`, falling back to the previously hard-coded 250.
    """
    sTime = time.time()
    print("====== Start process words vector sum ======")
    vectors = getattr(model, 'wv', model)
    dim = getattr(model, 'vector_size', 250)

    nc = np.zeros((len(cList), dim), dtype=float)
    nq = np.zeros(dim, dtype=float)
    na = np.zeros((len(aList), dim), dtype=float)
    count, notExist = 0, 0

    # (token list, destination row) for corpus sentences, question, options.
    jobs = [(sentence, nc[idx]) for idx, sentence in enumerate(cList)]
    jobs.append((qList, nq))
    jobs.extend((option, na[idx]) for idx, option in enumerate(aList))

    for words, row in jobs:
        found, missing = _sumWordVectors(words, vectors, row)
        count += found
        notExist += missing

    print("This corpus has total %d split words." % (count))
    print("This corpus has %d words not in word2vec model." % (notExist))
    print("Process all corpus content took %.2fs." % (time.time() - sTime))
    # go to final step, calculate similarity
    guessAns = similarity(nc, nq, na, cNum)
    return guessAns
    

In [79]:
def similarity(nc, nq, na, cNum):
    """Pick the answer option closest to the corpus sentence closest to the question.

    Parameters
    ----------
    nc : 2-D array, one summed word vector per corpus sentence.
    nq : 1-D array, summed word vector of the question.
    na : 2-D array, one summed word vector per answer option.
    cNum : record index (unused here; kept for the caller's signature).

    Step 1 finds the corpus row with the highest cosine similarity to `nq`;
    step 2 finds the answer row with the highest cosine similarity to that
    corpus row. Only similarities strictly above 0 are considered. Returns
    the option letter 'A'..'D'; when nothing qualifies the first option 'A'
    is returned.

    All-zero vectors (text with no in-vocabulary word) are skipped: cosine
    similarity is undefined for them, and relying on what SciPy returns in
    that case is fragile. If no corpus sentence qualifies, the answer
    comparison is skipped instead of calling SciPy with a scalar placeholder.
    """
    sTime = time.time()
    l = ['A', 'B', 'C', 'D']
    print("====== Start process vector similarity ======")
    # highest corpus/answer similarity
    h_c_Sim, h_a_Sim, ans = 0, 0, 0
    # Best-matching corpus vector; None until one beats the 0 threshold.
    highCorpus = None

    # calculate the most similar corpus and question
    if np.any(nq):
        for c in nc:
            if not np.any(c):
                continue
            cosSim = 1 - spatial.distance.cosine(c, nq)
            if cosSim > h_c_Sim:
                h_c_Sim = cosSim
                # record highest similarity corpus
                highCorpus = c

    # calculate the most similar corpus and answer
    if highCorpus is not None:
        for i, a in enumerate(na):
            if not np.any(a):
                continue
            cosSim = 1 - spatial.distance.cosine(a, highCorpus)
            if cosSim > h_a_Sim:
                h_a_Sim = cosSim
                ans = i

    print("The best match answer to this CQA is %s." % (l[ans]))
    print("The best match answer similarity to this CQA is %.2f." % (h_a_Sim))
    print("Process all similarity calculation took %.2fs.\n" % (time.time() - sTime))
    return l[ans]

In [80]:
# ====== initial setting ======
# Module-level setup: defines the globals used by the functions above
# (`stopWordsSet`, `model`) and then runs `main()`. All resources are looked
# up relative to the current working directory.

print("Start loading initial setting!")
# jieba setting
print("Start loading jieba dictionary!")
relativePath = os.getcwd()
jieba.set_dictionary(os.path.join(relativePath, 'jieba_setting', 'dict.txt.big'))
# add user dictionary to improve jieba cut precision
# jieba.load_userdict(relativePath + '/jieba_setting/yourfile.txt')

# stopwords setting
print("Start add stopwords!")
stopWordsSet = set()
# Explicit UTF-8: the stop-word list is read as text and compared with jieba
# tokens, so it must not depend on the locale's default encoding.
with open(os.path.join(relativePath, 'jieba_setting', 'stopwords.txt'), 'r', encoding='utf-8') as stop:
    # Only the newline is stripped, so entries keep any other whitespace.
    stopWordsSet.update(line.strip('\n') for line in stop)

# load word2vec model
print("Start loading word2vec model!")
sTime = time.time()
model = models.Word2Vec.load(os.path.join(relativePath, 'wiki', 'python', 'word2vec.model'))
print("Load word2vec model success! took %.2fs" % (time.time()-sTime))

# ====== initial setting ======

if __name__ == "__main__":
    main()

Start loading initial setting!
Start loading jieba dictionary!
Start add stopwords!
Start loading word2vec model!


Building prefix dict from /home/ethan/pythonwork/ipynotebook/MOST/jieba_setting/dict.txt.big ...
Loading model from cache /tmp/jieba.ub845031c377e450e44c5e52006afa1e4.cache


Load word2vec model success! took 1.13s
Start process CQA dataset


Loading model cost 1.008 seconds.
Prefix dict has been built succesfully.
  from ipykernel import kernelapp as app


Corpus: 0
This corpus has total 150 split words.
This corpus has 80 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.79.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: A.

Corpus: 1
This corpus has total 142 split words.
This corpus has 83 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.71.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 2
This corpus has total 153 split words.
This corpus has 80 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.79.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 3
This corpus has total

  dist = 1.0 - uv / np.sqrt(uu * vv)


This corpus has total 370 split words.
This corpus has 179 words not in word2vec model.
Process all corpus content took 0.05s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.90.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: B.

Corpus: 18
This corpus has total 396 split words.
This corpus has 174 words not in word2vec model.
Process all corpus content took 0.05s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.90.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: B.

Corpus: 19
This corpus has total 386 split words.
This corpus has 173 words not in word2vec model.
Process all corpus content took 0.05s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.83.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 20
This corpus has total 394

This corpus has total 253 split words.
This corpus has 93 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.87.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 41
This corpus has total 241 split words.
This corpus has 105 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: B.

Corpus: 42
This corpus has total 251 split words.
This corpus has 94 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.86.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 43
This corpus has total 227 s

Correct answer is: C.
Predict answer is: D.

Corpus: 61
This corpus has total 258 split words.
This corpus has 105 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.76.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 62
This corpus has total 477 split words.
This corpus has 206 words not in word2vec model.
Process all corpus content took 0.06s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.56.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 63
This corpus has total 220 split words.
This corpus has 88 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.80.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Pred

This corpus has total 144 split words.
This corpus has 66 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.90.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: C.

Corpus: 84
This corpus has total 160 split words.
This corpus has 58 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.87.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: A.

Corpus: 85
This corpus has total 169 split words.
This corpus has 60 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 86
This corpus has total 33 spl

Correct answer is: B.
Predict answer is: A.

Corpus: 118
This corpus has total 74 split words.
This corpus has 32 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.88.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: A.

Corpus: 119
This corpus has total 86 split words.
This corpus has 32 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.81.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: B.

Corpus: 120
This corpus has total 67 split words.
This corpus has 30 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.46.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predic

This corpus has total 121 split words.
This corpus has 41 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.83.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: B.

Corpus: 144
This corpus has total 105 split words.
This corpus has 39 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.74.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 145
This corpus has total 98 split words.
This corpus has 35 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.73.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 146
This corpus has total 89 s

This corpus has total 241 split words.
This corpus has 162 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.75.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: A.

Corpus: 164
This corpus has total 250 split words.
This corpus has 168 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.77.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 165
This corpus has total 216 split words.
This corpus has 86 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.57.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: A.

Corpus: 166
This corpus has total 3

This corpus has total 234 split words.
This corpus has 61 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.86.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 186
This corpus has total 240 split words.
This corpus has 62 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 187
This corpus has total 206 split words.
This corpus has 95 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: A.

Corpus: 188
This corpus has total 192

This corpus has total 72 split words.
This corpus has 30 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.77.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 216
This corpus has total 76 split words.
This corpus has 23 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.80.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 217
This corpus has total 55 split words.
This corpus has 18 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 218
This corpus has total 59 spl

This corpus has total 108 split words.
This corpus has 28 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.81.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: B.

Corpus: 235
This corpus has total 100 split words.
This corpus has 39 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 236
This corpus has total 79 split words.
This corpus has 38 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: C.

Corpus: 237
This corpus has total 133 

The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 257
This corpus has total 79 split words.
This corpus has 23 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: C.

Corpus: 258
This corpus has total 210 split words.
This corpus has 70 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.88.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 259
This corpus has total 202 split words.
This corpus has 71 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer simi

Correct answer is: B.
Predict answer is: D.

Corpus: 279
This corpus has total 297 split words.
This corpus has 114 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.69.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: C.

Corpus: 280
This corpus has total 290 split words.
This corpus has 117 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.52.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 281
This corpus has total 285 split words.
This corpus has 124 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.64.
Process all similarity calculation took 0.00s.

Correct answer is: （D）

This corpus has total 300 split words.
This corpus has 95 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 302
This corpus has total 299 split words.
This corpus has 99 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 303
This corpus has total 283 split words.
This corpus has 98 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.72.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: B.

Corpus: 304
This corpus has total 302

This corpus has total 87 split words.
This corpus has 35 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.73.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 330
This corpus has total 59 split words.
This corpus has 28 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 331
This corpus has total 71 split words.
This corpus has 32 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.79.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: B.

Corpus: 332
This corpus has total 70 spl

This corpus has total 229 split words.
This corpus has 78 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 350
This corpus has total 215 split words.
This corpus has 77 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.81.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: D.

Corpus: 351
This corpus has total 217 split words.
This corpus has 78 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.80.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 352
This corpus has total 308

This corpus has total 297 split words.
This corpus has 87 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.83.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 374
This corpus has total 301 split words.
This corpus has 86 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 375
This corpus has total 304 split words.
This corpus has 86 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 376
This corpus has total 263

Correct answer is: B.
Predict answer is: B.

Corpus: 392
This corpus has total 154 split words.
This corpus has 53 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.88.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 393
This corpus has total 90 split words.
This corpus has 51 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.47.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: A.

Corpus: 394
This corpus has total 124 split words.
This corpus has 39 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Pred