In [1]:
from pprint import pprint
from Parser import Parser
import util
from tfidf import *
import glob, os
import math
import numpy as np
from numpy.linalg import norm
from tqdm import tqdm
from __future__ import division, unicode_literals
from textblob import TextBlob as tb

class VectorSpace:
    """ A algebraic model for representing text documents as vectors of identifiers. 
    A document is represented as a vector. Each dimension of the vector corresponds to a 
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """

    #Collection of document term vectors
    documentVectors = []

    #Mapping of vector index to keyword
    vectorKeywordIndex=[]

    #Tidies terms
    parser=None
    
    def __init__(self, documents=[], vectorMode = 'tf'):
        self.documentVectors=[]
        self.parser = Parser()
        self.BlobList = self.getBlobList(documents)
        self.vectorMode = vectorMode
        if(len(documents)>0):
            self.build(documents)
    
    def getBlobList(self, documents): 
        bloblist = []
        for doc in documents:
            wordList = self.parser.tokenise(doc)
            wordList = self.parser.removeStopWords(wordList)
            bloblist.append(tb(" ".join(wordList)))
        return bloblist
            
    def build(self,documents):
        """ Create the vector space for the passed document strings """
        self.vectorKeywordIndex = self.getVectorKeywordIndex(documents)
        self.documentVectors = [self.makeVector(document, self.vectorMode) for document in tqdm(documents)]

        #print(self.vectorKeywordIndex)
        #print(self.documentVectors)


    def getVectorKeywordIndex(self, documentList):
        """ create the keyword associated to the position of the elements within the document vectors """

        #Mapped documents into a single word string	
        vocabularyString = " ".join(documentList)

        vocabularyList = self.parser.tokenise(vocabularyString)
        #Remove common words which have no search value
        vocabularyList = self.parser.removeStopWords(vocabularyList)
        uniqueVocabularyList = util.removeDuplicates(vocabularyList)
        
        vectorIndex={}
        offset=0
        #Associate a position with the keywords which maps to the dimension on the vector used to represent this word
        for word in uniqueVocabularyList:
            vectorIndex[word]=offset
            offset+=1
        return vectorIndex  #(keyword:position)


    def makeVector(self, wordString, mode):
        """ @pre: unique(vectorIndex) """
        #Initialise vector with 0's
        vector = [0] * len(self.vectorKeywordIndex)
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)
        tbString = tb(" ".join(wordList))
        #print(wordList)
        if mode == 'tf':
            for word in list(set(wordList)):
                vector[self.vectorKeywordIndex[word]] = tf(word, tbString) #Use simple Term Count Model
            return vector 
        
        if mode == 'tf-idf':
            #print('bloblist:', self.BlobList)
            for word in list(set(wordList)):
                #print('word',word)
                vector[self.vectorKeywordIndex[word]] =  tfidf(word, tbString , self.BlobList) 
                #print('word:',word, 'idf:', idf(word, self.BlobList),  )
                #print('word:', word, 'tf:', tf(word, tbString))
            return vector
        
    def buildQueryVector(self, termList):
        """ convert query string into a term vector """
        #print(termList)
        #print(self.vectorMode)
        query = self.makeVector(" ".join(termList), self.vectorMode)
        return query

    def related(self,documentId):
        """ find documents that are related to the document indexed by passed Id within the document Vectors"""
        ratings = [util.cosine(self.documentVectors[documentId], documentVector) for documentVector in self.documentVectors]
        #ratings.sort(reverse=True)
        return ratings
    
    def search(self,searchList, mode = 'cos'):
        """ search for documents that match based on a list of terms """
        #print(searchList)
        if type(searchList[0]) == str:
            queryVector = self.buildQueryVector(searchList)
            #print(queryVector)
        else:
            queryVector = searchList
        
        if mode == 'cos':
            ratings = [util.cosine(queryVector, documentVector) for documentVector in tqdm(self.documentVectors)]
        #ratings.sort(reverse=True)
            return ratings
        if mode == 'eucli':
            ratings = [util.Euclidean(queryVector, documentVector) for documentVector in tqdm(self.documentVectors)]
            return ratings

    def printresult(self,searchlist, files,n, mode='cos'):
        scoreList = self.search(searchlist, mode = mode)
        if self.vectorMode == 'tf' and mode == 'cos':
            print('Term Frequency (TF) Weighting + Cosine Similarity\n')
        elif self.vectorMode == 'tf' and mode == 'eucli':
            print('Term Frequency (TF) Weighting + Euclidean Distance\n')
        elif self.vectorMode == 'tf-idf' and mode == 'cos':
            print('TF-IDF Weighting + Cosine Similarity\n')
        elif self.vectorMode == 'tf-idf' and mode == 'eucli':
            print('TF-IDF Weighting + Euclidean Distance\n')
        print( 'NewsID' ,'         ','score')
        print('----------','     ','--------')
        for i in np.flip(np.argsort(scoreList))[:n]:
            print(files[i],'     ' ,scoreList[i])
        return np.flip(np.argsort(scoreList))[:n]
            
    def getFeedbackVector(self, searchList, top10results ,mode = 'cos'):
        queryVector = self.buildQueryVector(searchList)
        feedbackVector = self.buildQueryVector(top10results)
        queryArray = np.array(queryVector)
        feedbackArray = np.array(feedbackVector)
        newQueryVector = list(queryArray + 0.5 * feedbackArray)
        return newQueryVector

In [2]:
if __name__ == '__main__':
    documents = []
    files = []
    for file in os.listdir("./EnglishNews/EnglishNews"):
        if file.endswith(".txt"):
            filename = os.path.join("./EnglishNews/EnglishNews", file)
            files.append(file[:-4])
            with open(filename, encoding="utf-8") as f:
                lines = f.readlines()
                doc = ' '.join(lines)
                doc1 = doc.replace("\n", "")
                documents.append(doc1)
                
    query = ["Trump Biden Taiwan China"]
    
    vectorSpace_tf = VectorSpace(documents, 'tf')
    
    rank_cos = vectorSpace_tf.printresult(query,files,10,'cos')  #60% 前30: 100% 
    rank_euc = vectorSpace_tf.printresult(query,files, 10,mode = 'eucli')  #50% 前30: 80%
    '''
    vectorSpace_tfidf = VectorSpace(documents,'tf-idf')
    
    rank_cos_tfidf = vectorSpace_tfidf.printresult(query,files, 10,'cos')#70% #前30個100%
    rank_euc_tfidf = vectorSpace_tfidf.printresult(query,files, 10,'eucli')#40% #前30個40%
    '''

    # Q2
    '''
    feedBack = []
    for i in rank_cos_tfidf[:10]:
        feedBack.append(documents[i])
        feedback = [" ".join(feedBack)]

    print('TF-IDF Weighting + Cosine Similarity') # 70%
    newquery = vectorSpace_tfidf.getFeedbackVector(query, feedback, 'cos')
    vectorSpace_tfidf.printresult(newquery,files, 10,'cos')    
    '''

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:10<00:00, 667.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7034/7034 [01:02<00:00, 111.68it/s]


Term Frequency (TF) Weighting + Cosine Similarity

NewsID           score
----------       --------
News123256       0.516398
News119356       0.516398
News108578       0.468521
News120265       0.468521
News103117       0.428746
News115594       0.426401
News112667       0.400892
News122919       0.40032
News111959       0.395285
News115859       0.395285


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:25<00:00, 271.94it/s]

Term Frequency (TF) Weighting + Euclidean Distance

NewsID           score
----------       --------
News108482       0.67082
News110871       0.67082
News110141       0.67082
News111696       0.67082
News108964       0.67082
News108940       0.67082
News107883       0.661438
News107832       0.645497
News108270       0.645497
News110401       0.645497





In [3]:
rank_cos[:10]

array([6902, 5093, 2128, 5369,  750, 4078, 3247, 6565, 3037, 4152],
      dtype=int64)

TF Weighting + Cosine Similarity


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:55<00:00, 127.08it/s]

Term Frequency (TF) Weighting + Cosine Similarity

NewsID           score
----------       --------
News123256       0.564763
News119356       0.564763
News120265       0.511991
News108578       0.511991
News103117       0.454892
News115594       0.454539
News115859       0.446891
News111959       0.446891
News101763       0.446891
News119746       0.446891





array([6902, 5093, 5369, 2128,  750, 4078, 4152, 3037,  425, 5205],
      dtype=int64)

In [8]:
cos

100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:07<00:00, 902.13it/s]


In [9]:

#print('Term Frequency (TF) Weighting + Euclidean Distance')

100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:53<00:00, 130.64it/s]


Term Frequency (TF) Weighting + Cosine Similarity
document: News119356 score: 0.5163977794943222
document: News123256 score: 0.5163977794943222
document: News120265 score: 0.4685212856658182
document: News108578 score: 0.4685212856658182
document: News103117 score: 0.42874646285627205
document: News115594 score: 0.42640143271122083
document: News112667 score: 0.4008918628686366
document: News122919 score: 0.4003203845127178
document: News111959 score: 0.39528470752104733
document: News115859 score: 0.39528470752104733
document: News119746 score: 0.39528470752104733
document: News101763 score: 0.39528470752104733
document: News120085 score: 0.39043440472151514
document: News104498 score: 0.39043440472151514
document: News110775 score: 0.39043440472151514
document: News112298 score: 0.39043440472151514
document: News122462 score: 0.39043440472151514
document: News114963 score: 0.3857583749052298
document: News122750 score: 0.3857583749052298
document: News107163 score: 0.3857583749052298

100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:21<00:00, 324.14it/s]


Term Frequency (TF) Weighting + Euclidean Distance
document: News108482 score: 0.6708203932499369
document: News110871 score: 0.6708203932499369
document: News108964 score: 0.6708203932499369
document: News108940 score: 0.6708203932499369
document: News110141 score: 0.6708203932499369
document: News111696 score: 0.6708203932499369
document: News107883 score: 0.6614378277661477
document: News107832 score: 0.6454972243679028
document: News110401 score: 0.6454972243679028
document: News108270 score: 0.6454972243679028
document: News110747 score: 0.6267831705280087
document: News110223 score: 0.6267831705280087
document: News109912 score: 0.6267831705280087
document: News110329 score: 0.6267831705280087
document: News110497 score: 0.6267831705280087
document: News111420 score: 0.6236095644623235
document: News110833 score: 0.6236095644623235
document: News108024 score: 0.6236095644623235
document: News109276 score: 0.6123724356957945
document: News107804 score: 0.6123724356957945
document:

100%|██████████████████████████████████████████████████████████████████████████████| 7034/7034 [18:41<00:00,  6.27it/s]


100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:56<00:00, 124.90it/s]


TF-IDF Weighting + Cosine Similarity
document: News103134 score: 0.47354915840176326
document: News103767 score: 0.4560496763154008
document: News116613 score: 0.4110554885255448
document: News104913 score: 0.4110554885255448
document: News108813 score: 0.4110554885255448
document: News104914 score: 0.38949623702525554
document: News112714 score: 0.38949623702525554
document: News101014 score: 0.38949623702525554
document: News116634 score: 0.38593749188632803
document: News103728 score: 0.3649292024732102
document: News103602 score: 0.32887988817071073
document: News120085 score: 0.3149122389640781
document: News104498 score: 0.3149122389640781
document: News112298 score: 0.3149122389640781
document: News110040 score: 0.3133693689239125
document: News110441 score: 0.3111255372629767
document: News106640 score: 0.3080687882294075
document: News100757 score: 0.29709045391611844
document: News110804 score: 0.29511422948403626
document: News121995 score: 0.29225581198231654
document: News

100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:23<00:00, 303.99it/s]

TF-IDF Weighting + Euclidean Distance
document: News111696 score: 3.1738412094930877
document: News108964 score: 3.090312939538556
document: News107883 score: 3.0655439111104896
document: News110141 score: 3.0640754366202367
document: News122771 score: 3.0143726040179266
document: News110747 score: 3.009559318414031
document: News108482 score: 2.9345996967448014
document: News108940 score: 2.9248801263833597
document: News108270 score: 2.874122470395115
document: News110871 score: 2.8523192991552713
document: News107804 score: 2.829682742565021
document: News107832 score: 2.8115392443751284
document: News110833 score: 2.746581890623016
document: News110401 score: 2.744118697256618
document: News123304 score: 2.73874948545406
document: News109473 score: 2.730692961761022
document: News110223 score: 2.7254989764094484
document: News111034 score: 2.6819244734119745
document: News109622 score: 2.66270152162418
document: News123024 score: 2.6601117436200727
document: News109912 score: 2.636




TF-IDF Weighting + Cosine Similarity


100%|██████████████████████████████████████████████████████████████████████████████| 7034/7034 [01:20<00:00, 87.79it/s]

document: News103134 score: 0.5197886511286802
document: News104913 score: 0.5124974838557128
document: News108813 score: 0.5124974838557128
document: News116613 score: 0.5124974838557128
document: News101014 score: 0.4989410742564866
document: News112714 score: 0.4989410742564866
document: News104914 score: 0.4989410742564866
document: News103767 score: 0.49212543366673545
document: News116634 score: 0.4379371826487922
document: News103728 score: 0.3943061599242882





array([ 754, 1168, 2183, 4360,  240, 3259, 1169,  900, 4368,  889],
      dtype=int64)

In [53]:
#test data

documents = ["The cat cat in the hat disabled",
                 "A cat is a fine pet ponies.",
                 "Dogs and cats make good pets.",
                 "I haven't got a hat."]

vectorSpace = VectorSpace(documents, 'tf-idf')  # vectorSpace(documents, vectorMode = 'tf' or 'tf-idf') (default is 'tf')


#print(vectorSpace.vectorKeywordIndex)

#print(vectorSpace.documentVectors)

#print(vectorSpace.related(1))

#print(vectorSpace.search(["cat"]))   

vectorSpace.printresult(["cat"])
vectorSpace.printresult(["cat"], mode = 'eucli')  # mode = 'cos' or 'eucli' (default is 'cos')

###################################################

score: nan document: I haven't got a hat.
score: nan document: Dogs and cats make good pets.
score: nan document: A cat is a fine pet ponies.
score: nan document: The cat cat in the hat disabled
score: 1.2345525572306575 document: Dogs and cats make good pets.
score: 1.0216002166437488 document: A cat is a fine pet ponies.
score: 0.7504758415354574 document: The cat cat in the hat disabled
score: 0.28768207245178085 document: I haven't got a hat.
