In [32]:
from pprint import pprint
from Parser import Parser
import util
from tfidf import *
import glob, os
import math
import numpy as np
import jieba
from numpy.linalg import norm
from tqdm import tqdm
from __future__ import division, unicode_literals
from textblob import TextBlob as tb

class VectorSpace:
    """ A algebraic model for representing text documents as vectors of identifiers. 
    A document is represented as a vector. Each dimension of the vector corresponds to a 
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """

    #Collection of document term vectors
    documentVectors = []

    #Mapping of vector index to keyword
    vectorKeywordIndex = []
    #vectorKeywordIndex_chi = []

    #Tidies terms
    parser=None
    
    def __init__(self, documents=[], vectorMode = 'tf'):
        self.documentVectors=[]
        self.parser = Parser()
        self.BlobList = self.getBlobList(documents)
        self.vectorMode = vectorMode
        if(len(documents)>0):
            self.build(documents)
    
    def getBlobList(self, documents): 
        bloblist = []
        for doc in documents:
            wordList = self.parser.tokenise(doc)
            wordList = self.parser.removeStopWords(wordList)
            bloblist.append(tb(" ".join(wordList)))
        return bloblist
            
    def build(self,documents):
        """ Create the vector space for the passed document strings """
        self.vectorKeywordIndex = self.getVectorKeywordIndex(documents)
        #print(self.vectorKeywordIndex_chi)
        #self.vectorKeywordIndex_chi = self.getVectorKeywordIndex_chi(documents)
        #print(self.vectorKeywordIndex_chi)
        #self.vectorKeywordIndex = self.vectorKeywordIndex.update(self.vectorKeywordIndex_chi)
        self.documentVectors = [self.makeVector(document, self.vectorMode) for document in tqdm(documents)]
        
    
    def getVectorKeywordIndex_chi(self, documentList):
        
        pass

    def getVectorKeywordIndex(self, documentList):
        """ create the keyword associated to the position of the elements within the document vectors """

        #Mapped documents into a single word string	
        vocabularyString = " ".join(documentList)
        print('把所有document連載一起 ',vocabularyString)
        vocabularyList = self.parser.tokenise(vocabularyString) #斷詞
        print('斷詞', vocabularyList)
        #Remove common words which have no search value
        vocabularyList = self.parser.removeStopWords(vocabularyList) # 去掉stop words
        print('remove stop word', vocabularyList)
        uniqueVocabularyList = util.removeDuplicates(vocabularyList) # 去掉重複詞
        print('去掉重複詞', uniqueVocabularyList)
        
        vectorIndex={}
        offset=0
        #Associate a position with the keywords which maps to the dimension on the vector used to represent this word
        for word in uniqueVocabularyList:
            vectorIndex[word]=offset
            offset+=1
        return vectorIndex  #(keyword:position)


    def makeVector(self, wordString, mode):
        """ @pre: unique(vectorIndex) """
        #Initialise vector with 0's
        vector = [0] * len(self.vectorKeywordIndex)
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)
        tbString = tb(" ".join(wordList))
        if mode == 'tf':
            for word in list(set(wordList)):
                vector[self.vectorKeywordIndex[word]] = tf(word, tbString) #Use simple Term Count Model
            return vector 
        
        if mode == 'tf-idf':
            #print('bloblist:', self.BlobList)
            for word in list(set(wordList)):
                vector[self.vectorKeywordIndex[word]] =  tfidf(word, tbString , self.BlobList) 
            return vector
        
    def buildQueryVector(self, termList):
        """ convert query string into a term vector """
        query = self.makeVector(" ".join(termList), self.vectorMode)
        return query

    def related(self,documentId):
        """ find documents that are related to the document indexed by passed Id within the document Vectors"""
        ratings = [util.cosine(self.documentVectors[documentId], documentVector) for documentVector in self.documentVectors]
        #ratings.sort(reverse=True)
        return ratings
    
    def search(self,searchList, mode = 'cos'):
        """ search for documents that match based on a list of terms """
        if type(searchList[0]) == str:
            queryVector = self.buildQueryVector(searchList)
        else:
            queryVector = searchList
        
        if mode == 'cos':
            ratings = [util.cosine(queryVector, documentVector) for documentVector in tqdm(self.documentVectors)]
        #ratings.sort(reverse=True)
            return ratings
        if mode == 'eucli':
            ratings = [util.Euclidean(queryVector, documentVector) for documentVector in tqdm(self.documentVectors)]
            return ratings

    def printresult(self,searchlist, files,n,mode='cos'):
        scoreList = self.search(searchlist, mode = mode)
        for i in np.flip(np.argsort(scoreList))[:n]:
            print( 'document:' , files[i],'score:', scoreList[i])
        return np.flip(np.argsort(scoreList))[:n]
            
    def getFeedbackVector(self, searchList, top10results ,mode = 'cos'):
        queryVector = self.buildQueryVector(searchList)
        feedbackVector = self.buildQueryVector(top10results)
        queryArray = np.array(queryVector)
        feedbackArray = np.array(feedbackVector)
        newQueryVector = list(queryArray + 0.5 * feedbackArray)
        return newQueryVector

In [33]:
#test data

documents = ["The cat cat in the hat disabled",
                 "A cat is a fine pet ponies.",
                 "Dogs and cats make good pets.",
                 "I haven't got a hat."]
            #"烏克蘭總統是澤倫斯基。", 
            #"澤倫斯基是親西方國家的烏克蘭總統，這讓普丁很不爽"]

vectorSpace = VectorSpace(documents, 'tf-idf')  # vectorSpace(documents, vectorMode = 'tf' or 'tf-idf') (default is 'tf')


#print(vectorSpace.vectorKeywordIndex)

#print(vectorSpace.documentVectors)

#print(vectorSpace.related(1))

#print(vectorSpace.search(["cat"]))   

#vectorSpace.printresult(["cat"])
#vectorSpace.printresult(["cat"], mode = 'eucli')  # mode = 'cos' or 'eucli' (default is 'cos')

###################################################

把所有document連載一起  The cat cat in the hat disabled A cat is a fine pet ponies. Dogs and cats make good pets. I haven't got a hat.
斷詞 ['the', 'cat', 'cat', 'in', 'the', 'hat', 'disabl', 'a', 'cat', 'is', 'a', 'fine', 'pet', 'poni', 'dog', 'and', 'cat', 'make', 'good', 'pet', 'i', "haven't", 'got', 'a', 'hat']
remove stop word ['cat', 'cat', 'hat', 'disabl', 'cat', 'fine', 'pet', 'poni', 'dog', 'cat', 'make', 'good', 'pet', 'hat']
去掉重複詞 {'make', 'hat', 'pet', 'poni', 'cat', 'disabl', 'dog', 'fine', 'good'}


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1914.11it/s]


In [77]:
ChiStop = []
with open('ChineseStopwords.txt', encoding="utf-8") as f:
    lines =f.readlines()
    stop = ' '.join(lines)
    ChiStop = stop.replace("\n", "").split()  
print(ChiStop)    
'''
lines = f.readlines()
doc = ' '.join(lines)
doc1 = doc.replace("\n", "")
documents_chi.append(doc1)'''

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '--', '.', '..', '...', '......', '...................', './', '.一', '.數', '.日', '/', '//', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '://', '::', ';', '<', '=', '>', '>>', '?', '@', 'A', 'Lex', '[', ']', '^', '_', '`', 'exp', 'sub', 'sup', '|', '}', '~', '~~~~', '·', '×', '×××', 'Δ', 'Ψ', 'γ', 'μ', 'φ', 'φ．', 'В', '—', '——', '———', '‘', '’', '’‘', '“', '”', '”。', '…', '……', '…………………………………………………③', '′∈', '′｜', '℃', 'Ⅲ', '↑', '→', '∈［', '∪φ∈', '≈', '①', '②', '②ｃ', '③', '③］', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '──', '■', '▲', '、', '。', '〈', '〉', '《', '》', '》）。', '」', '『', '』', '【', '】', '〔', '〕', '〕〔', '㈧', '一', '一.', '一一', '一下', '一個', '一些', '一何', '一切', '一則', '一則通過', '一天', '一定', '一方面', '一旦', '一時', '一來', '一樣', '一次', '一片', '一番', '一直', '一致', '一般', '一起', '一轉眼', '一邊', '一面', '七', '萬一', '三', '三天兩頭', '三番兩次', '三番五次', '上', '上下', '上升', '上去', '上來', '上述', '上面', '下', '下列', '下去', '下來', '以下', '不', '不一', '不下', '不久', '不了', '不亦樂

'\nlines = f.readlines()\ndoc = \' \'.join(lines)\ndoc1 = doc.replace("\n", "")\ndocuments_chi.append(doc1)'

In [46]:
documents_chi = ["烏克蘭總統是澤倫斯基。", 
            "澤倫斯基是親西方國家的烏克蘭總統，這讓普丁很不爽。"]
vocabularyString = " ".join(documents_chi)
print(vocabularyString)
vocabularyList = jieba.lcut_for_search(vocabularyString)
    #vocabularyList.append(word)
print(vocabularyList)

烏克蘭總統是澤倫斯基。 澤倫斯基是親西方國家的烏克蘭總統，這讓普丁很不爽。
['烏克蘭', '總統', '是澤倫斯基', '。', ' ', '澤倫斯', '基是親', '西方', '國家', '的', '烏克蘭', '總統', '，', '這讓', '普丁', '很', '不爽', '。']


In [80]:
vocabularyList  = [word for word in vocabularyList if word not in ChiStop ]
print(vocabularyList)
uniqueVocabularyList = set(vocabularyList)
print(uniqueVocabularyList)

['烏克蘭', '總統', '是澤倫斯基', ' ', '澤倫斯', '基是親', '西方', '國家', '烏克蘭', '總統', '，', '這讓', '普丁', '不爽']
{'總統', '是澤倫斯基', '，', '這讓', '基是親', '烏克蘭', '西方', '國家', '不爽', '澤倫斯', ' ', '普丁'}


In [81]:
vectorIndex={}
offset=0
#Associate a position with the keywords which maps to the dimension on the vector used to represent this word
for word in uniqueVocabularyList:
    vectorIndex[word]=offset
    offset+=1

In [82]:
print(vectorIndex)

{'總統': 0, '是澤倫斯基': 1, '，': 2, '這讓': 3, '基是親': 4, '烏克蘭': 5, '西方': 6, '國家': 7, '不爽': 8, '澤倫斯': 9, ' ': 10, '普丁': 11}


In [15]:
vocabularyString = " ".join(documents[4:])
print(vocabularyString)

烏克蘭總統是澤倫斯基。 澤倫斯基是親西方國家的烏克蘭總統，這讓普丁很不爽


In [13]:
vectorSpace = VectorSpace(documents, 'tf-idf')

[]
None


TypeError: 'NoneType' object is not iterable

In [3]:
# Q3
documents_chi = [] #0-499是英文檔案 ， 500-1500是中文檔案
files_chi = []
for file in os.listdir("./News/News"):
    if file.endswith(".txt"):
        filename_chi = os.path.join("./News/News", file)
        files_chi.append(file[:-4])
        with open(filename_chi, encoding="utf-8") as f:
            lines = f.readlines()
            doc = ' '.join(lines)
            doc1 = doc.replace("\n", "")
            documents_chi.append(doc1)

In [6]:
# Q3
documents_chi = [] #0-499是英文檔案 ， 500-1500是中文檔案
files_chi = []
for file in os.listdir("./News/News"):
    if file.endswith(".txt"):
        filename_chi = os.path.join("./News/News", file)
        files_chi.append(file[:-4])
        with open(filename_chi, encoding="utf-8") as f:
            lines = f.readlines()
            doc = ' '.join(lines)
            doc1 = doc.replace("\n", "")
            documents_chi.append(doc1)

In [16]:
print(len(documents_chi[500:1500]))

1000
