In [1]:
# -*- coding: UTF-8 -*-
import time
import jieba
import os
from gensim.models import word2vec
from gensim import models
import numpy as np
from scipy import spatial

In [2]:
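# Accurate mode: cuts the sentence as precisely as possible; better suited to text analysis. cut_all=False
# Full mode: scans out every word that can be formed from the sentence; fast. cut_all=True
# Search-engine mode: re-splits long words on top of accurate mode to raise recall; suited to search-engine tokenization. jieba.cut_for_search(Content)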
# call jieba api
def jiebaCut(s):
    """Tokenize ``s`` with jieba in full mode and drop the stop words.

    Args:
        s: Text to segment.

    Returns:
        list: The tokens produced by ``jieba.cut(s, cut_all=True)`` that
        survive ``removeStopWords``.
    """
    # Full mode (cut_all=True) emits every word jieba can find in the text.
    return removeStopWords(jieba.cut(s, cut_all=True))

# remove stopwords
def removeStopWords(words):
    """Return the items of ``words`` that are not in ``stopWordsSet``.

    Args:
        words: Any iterable of tokens (a generator is fine; it is consumed once).

    Returns:
        list: The kept tokens, in their original order.
    """
    return [token for token in words if token not in stopWordsSet]

# define all state 
def state(s, flag):
    """Work out which section of a CQA record the line ``s`` belongs to.

    A line whose first character is 'C', 'Q' or 'A' is a section marker and
    switches the section to 1, 2 or 3 respectively. Any other line (including
    an empty one or ``None``) leaves the section unchanged.

    Args:
        s: The current line, or ``None``.
        flag: The section that was active before this line.

    Returns:
        tuple: ``(flag, nextline)``. ``nextline`` is 0 when ``s`` was a section
        marker and 1 otherwise.
    """
    sectionByMarker = {'C': 1, 'Q': 2, 'A': 3}

    # Nothing to inspect: stay in the current section.
    if s is None or s == "":
        return flag, 1

    first = s[0]
    if first in sectionByMarker:
        return sectionByMarker[first], 0

    # Ordinary content line of the current section.
    return flag, 1

In [3]:
def main():
    """Evaluate the word2vec answer picker on every record in ``CQA.txt``.

    The file is read line by line. ``state`` switches the parser between the
    corpus (1), question (2) and answer (3) sections whenever a line starts
    with 'C', 'Q' or 'A'; those marker lines are skipped. Every other line is
    tokenized with ``jiebaCut`` and stored in the section that is currently
    active. Once four answer options have been collected, the next line is
    taken as the correct answer, ``word2VecSum`` is asked for a prediction,
    and both are appended to ``wiki_true_result.txt`` and printed.

    After the file is consumed, the record count and the accuracy are appended
    to the result file and printed. Accuracy is reported as 0.0 when no
    complete record was found (instead of dividing by zero).
    """
    sTime = time.time()
    result = 'wiki_true_result.txt'
    optionsPerRecord = 4
    print("Start process CQA dataset")
    cNum, accuracy = 0, 0
    with open('CQA.txt', 'r') as src:
        flag, end = 0, 0
        cList, qList, aList = [], [], []
        # Iterate the file lazily rather than loading every line up front.
        for line in src:
            s = line.strip()
            flag, nextline = state(s, flag)
            # All options were read, so this line holds the correct answer.
            if end == optionsPerRecord:
                ans = s
                print("Corpus: %d" % cNum)
                guessAns = word2VecSum(cList, qList, aList, cNum)
                with open(result, 'a') as res:
                    res.write("\nCorpus :" + str(cNum))
                    res.write("\nCorrect answer is: " + ans)
                    res.write("\nPredict answer is: " + guessAns)
                    res.write('\n')
                if guessAns == ans:
                    accuracy += 1
                print("====== Final result ======")
                print("Correct answer is: %s." % (ans))
                print("Predict answer is: %s.\n" % (guessAns))
                # Start the next record from a clean slate.
                cList, qList, aList = [], [], []
                flag, end = 0, 0
                cNum += 1
                continue
            # Section marker line: nothing to tokenize.
            if nextline != 1:
                continue
            if flag == 1:
                # Corpus section: each non-empty line becomes one token list.
                cutRes = jiebaCut(s)
                if cutRes:
                    cList.append(list(cutRes))
            elif flag == 2:
                # Question section: all lines are merged into one token list.
                qList.extend(jiebaCut(s))
            elif flag == 3:
                # Answer section: one token list per option. The first three
                # characters are dropped, e.g. the "（B）" label in
                # "（B） 吃飯比讀書更為重要".
                end += 1
                aList.append(list(jiebaCut(s[3:].strip())))

    # Guard against an empty or malformed dataset: no records, no division.
    accuracyPct = accuracy/cNum*100 if cNum else 0.0
    with open(result, 'a') as res:
        res.write("\nTotal corpus number :" + str(cNum))
        res.write("\nAccuracy is :" + str(accuracyPct))
    print("\nTotal corpus numbers: %d" % cNum)
    print("Accuracy is %.3f percent" % (accuracyPct))
    print("Processing all CQA dataset corpus took %.2fs" % (time.time()- sTime))
        

In [4]:

def _sumWordVectors(words, vectors, dim):
    """Sum the embeddings of ``words``; return ``(total, found, missing)``.

    ``vectors`` is any mapping-like object that raises ``KeyError`` for an
    unknown word. Only the first ``dim`` components of each embedding are used.
    """
    total = np.zeros(dim, dtype=float)
    found, missing = 0, 0
    for w in words:
        try:
            vec = vectors[w]
        except KeyError:
            # Word is not in the embedding vocabulary: count it and move on.
            missing += 1
            continue
        total += np.asarray(vec, dtype=float)[:dim]
        found += 1
    return total, found, missing


def word2VecSum(cList, qList, aList, cNum, dim=None):
    """Turn one CQA record into summed word vectors and predict its answer.

    Each corpus line, the question, and each answer option is represented by
    the sum of the embeddings of its tokens. Tokens missing from the model are
    skipped and counted. The resulting arrays are handed to ``similarity``.

    Args:
        cList: List of token lists, one per corpus line.
        qList: Token list of the question.
        aList: List of token lists, one per answer option.
        cNum: Index of the record; passed through to ``similarity``.
        dim: Embedding size. When ``None`` it is read from the model's
            ``vector_size`` attribute, falling back to 250 if the model does
            not expose one.

    Returns:
        Whatever ``similarity`` returns for this record (the predicted label).
    """
    sTime = time.time()
    print("====== Start process words vector sum ======")

    # Newer gensim releases expose the vectors on ``model.wv`` and no longer
    # allow ``model[word]``; older ones accept indexing on the model itself.
    vectors = getattr(model, 'wv', model)
    if dim is None:
        dim = (getattr(vectors, 'vector_size', None)
               or getattr(model, 'vector_size', None)
               or 250)

    nc = np.zeros((len(cList), dim), dtype=float)
    na = np.zeros((len(aList), dim), dtype=float)
    count, notExist = 0, 0

    # One summed vector per corpus line.
    for ind, c in enumerate(cList):
        nc[ind], found, missing = _sumWordVectors(c, vectors, dim)
        count += found
        notExist += missing

    # A single summed vector for the question.
    nq, found, missing = _sumWordVectors(qList, vectors, dim)
    count += found
    notExist += missing

    # One summed vector per answer option.
    for ind, a in enumerate(aList):
        na[ind], found, missing = _sumWordVectors(a, vectors, dim)
        count += found
        notExist += missing

    print("This corpus has total %d split words." % (count))
    print("This corpus has %d words not in word2vec model." % (notExist))
    print("Process all corpus content took %.2fs." % (time.time()- sTime))
    # go to final step, calculate similarity
    guessAns = similarity(nc, nq, na, cNum)
    return guessAns
    

In [5]:
def _cosineSim(u, v):
    """Cosine similarity of ``u`` and ``v``, or ``None`` when it is undefined.

    The value is undefined when either vector is all zeros (zero norm) or when
    the computation yields NaN.
    """
    if not np.any(u) or not np.any(v):
        return None
    sim = 1 - spatial.distance.cosine(u, v)
    return None if np.isnan(sim) else sim


def _bestMatch(candidates, target):
    """Return ``(index, similarity)`` of the candidate most similar to ``target``.

    Candidates with an undefined similarity are ignored. On ties the earliest
    candidate wins. Returns ``(None, None)`` when no candidate is comparable.
    """
    bestIdx, bestSim = None, None
    for idx, vec in enumerate(candidates):
        sim = _cosineSim(vec, target)
        if sim is not None and (bestSim is None or sim > bestSim):
            bestIdx, bestSim = idx, sim
    return bestIdx, bestSim


def similarity(nc, nq, na, cNum):
    """Pick the answer option that best matches the most relevant corpus line.

    First the corpus vector with the highest cosine similarity to the question
    vector is selected; then the answer vector with the highest cosine
    similarity to that corpus vector is chosen. The best candidate wins even
    when every similarity is negative. Zero vectors (no known words) are
    skipped instead of producing NaN.

    Args:
        nc: Sequence of corpus-line vectors.
        nq: Question vector.
        na: Sequence of answer-option vectors (at most four are labelled).
        cNum: Index of the record; not used in the computation.

    Returns:
        str: One of 'A', 'B', 'C', 'D'. Falls back to 'A' when no corpus line
        or no answer option can be compared.
    """
    sTime = time.time()
    l = ['A','B','C','D']
    print("====== Start process vector similarity ======")

    # Defaults used when nothing is comparable (e.g. every vector is zero).
    ans, h_a_Sim = 0, 0.0

    # calculate the most similar corpus and question
    corpusIdx, _ = _bestMatch(nc, nq)
    if corpusIdx is not None:
        # calculate the most similar corpus and answer
        answerIdx, answerSim = _bestMatch(na, nc[corpusIdx])
        if answerIdx is not None:
            ans, h_a_Sim = answerIdx, answerSim

    print("The best match answer to this CQA is %s." %(l[ans]))
    print("The best match answer similarity to this CQA is %.2f." %(h_a_Sim))
    print("Process all similarity calculation took %.2fs.\n" % (time.time()- sTime))
    return l[ans]

In [6]:
# ====== initial setting: jieba dictionary, stop words, word2vec model ======

print("Start loading initial setting!")

# All resources are looked up relative to the current working directory.
relativePath = os.getcwd()

# jieba: swap in the dictionary file shipped under jieba_setting/.
print("Start loading jieba dictionary!")
jieba.set_dictionary(relativePath + '/jieba_setting/dict.txt.big')
# Optional: a user dictionary can be loaded to improve segmentation precision.
# jieba.load_userdict(relativePath + '/jieba_setting/yourfile.txt')

# Stop words: one entry per line; only the trailing newline is removed.
print("Start add stopwords!")
stopWordsSet = set()
with open(relativePath + '/jieba_setting/stopwords.txt', 'r') as stopFile:
    stopWordsSet.update(line.strip('\n') for line in stopFile)

# word2vec: load the trained model and report how long it took.
print("Start loading word2vec model!")
sTime = time.time()
model = models.Word2Vec.load(relativePath + '/wiki/python/word2vec.model')
print("Load word2vec model success! took %.2fs" % (time.time()-sTime))

# ====== end of initial setting ======

if __name__ == "__main__":
    main()

Start loading initial setting!
Start loading jieba dictionary!
Start add stopwords!
Start loading word2vec model!


Building prefix dict from /home/ethan/pythonwork/ipynotebook/MOST/jieba_setting/dict.txt.big ...


Load word2vec model success! took 26.95s
Start process CQA dataset


Dumping model to file cache /tmp/jieba.ub845031c377e450e44c5e52006afa1e4.cache
Loading model cost 1.709 seconds.
Prefix dict has been built succesfully.
  from ipykernel import kernelapp as app


Corpus: 0
This corpus has total 146 split words.
This corpus has 2 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.79.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: C.

Corpus: 1
This corpus has total 141 split words.
This corpus has 2 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.64.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 2
This corpus has total 150 split words.
This corpus has 2 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 3
This corpus has total 22

  dist = 1.0 - uv / np.sqrt(uu * vv)


This corpus has total 369 split words.
This corpus has 2 words not in word2vec model.
Process all corpus content took 0.05s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.80.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: B.

Corpus: 18
This corpus has total 406 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.05s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.91.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: B.

Corpus: 19
This corpus has total 395 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.05s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 20
This corpus has total 405 split

This corpus has total 256 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.87.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 41
This corpus has total 249 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.75.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: A.

Corpus: 42
This corpus has total 250 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.88.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: C.

Corpus: 43
This corpus has total 226 split

Correct answer is: D.
Predict answer is: C.

Corpus: 59
This corpus has total 264 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 60
This corpus has total 260 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: B.

Corpus: 61
This corpus has total 251 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.74.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict a

This corpus has total 186 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.89.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 89
This corpus has total 203 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 90
This corpus has total 201 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: D.

Corpus: 91
This corpus has total 212 split

This corpus has total 74 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 123
This corpus has total 67 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.81.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: A.

Corpus: 124
This corpus has total 83 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.83.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: D.

Corpus: 125
This corpus has total 73 split 

This corpus has total 112 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.74.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 146
This corpus has total 98 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.64.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: A.

Corpus: 147
This corpus has total 89 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.52.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: A.

Corpus: 148
This corpus has total 117 spli

This corpus has total 281 split words.
This corpus has 3 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.81.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: A.

Corpus: 172
This corpus has total 262 split words.
This corpus has 3 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.72.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 173
This corpus has total 267 split words.
This corpus has 3 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.74.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: C.

Corpus: 174
This corpus has total 200 sp

This corpus has total 255 split words.
This corpus has 6 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.75.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: C.

Corpus: 193
This corpus has total 247 split words.
This corpus has 7 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 194
This corpus has total 253 split words.
This corpus has 6 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: C.

Corpus: 195
This corpus has total 352 sp

Correct answer is: B.
Predict answer is: D.

Corpus: 214
This corpus has total 117 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 215
This corpus has total 71 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.77.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: B.

Corpus: 216
This corpus has total 69 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict 

Correct answer is: A.
Predict answer is: A.

Corpus: 233
This corpus has total 76 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.79.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 234
This corpus has total 60 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: D.

Corpus: 235
This corpus has total 102 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.80.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict 

This corpus has total 109 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.88.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: D.

Corpus: 255
This corpus has total 111 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: B.

Corpus: 256
This corpus has total 121 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.81.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: D.

Corpus: 257
This corpus has total 80 spl

This corpus has total 206 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.72.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: B.

Corpus: 274
This corpus has total 212 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.69.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 275
This corpus has total 285 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.78.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: B.

Corpus: 276
This corpus has total 281 sp

Correct answer is: D.
Predict answer is: D.

Corpus: 297
This corpus has total 267 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.82.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: A.

Corpus: 298
This corpus has total 269 split words.
This corpus has 2 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.74.
Process all similarity calculation took 0.00s.

Correct answer is: A.
Predict answer is: C.

Corpus: 299
This corpus has total 280 split words.
This corpus has 2 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.83.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predic

This corpus has total 54 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.77.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: C.

Corpus: 329
This corpus has total 60 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.83.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: A.

Corpus: 330
This corpus has total 82 split words.
This corpus has 1 words not in word2vec model.
Process all corpus content took 0.01s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.71.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 331
This corpus has total 67 split 

This corpus has total 266 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.85.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predict answer is: D.

Corpus: 349
This corpus has total 251 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is D.
The best match answer similarity to this CQA is 0.65.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: D.

Corpus: 350
This corpus has total 224 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is B.
The best match answer similarity to this CQA is 0.86.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: B.

Corpus: 351
This corpus has total 213 sp

Correct answer is: B.
Predict answer is: D.

Corpus: 369
This corpus has total 184 split words.
This corpus has 9 words not in word2vec model.
Process all corpus content took 0.02s.
The best match answer to this CQA is A.
The best match answer similarity to this CQA is 0.00.
Process all similarity calculation took 0.00s.

Correct answer is: C.
Predict answer is: A.

Corpus: 370
This corpus has total 197 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.03s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.84.
Process all similarity calculation took 0.00s.

Correct answer is: B.
Predict answer is: C.

Corpus: 371
This corpus has total 199 split words.
This corpus has 0 words not in word2vec model.
Process all corpus content took 0.04s.
The best match answer to this CQA is C.
The best match answer similarity to this CQA is 0.83.
Process all similarity calculation took 0.00s.

Correct answer is: D.
Predic