In [55]:
from pprint import pprint
from Parser import Parser
import util
from tfidf import *
import glob, os
import math
import numpy as np
from tqdm import tqdm
from __future__ import division, unicode_literals
from textblob import TextBlob as tb

class VectorSpace:
    """ A algebraic model for representing text documents as vectors of identifiers. 
    A document is represented as a vector. Each dimension of the vector corresponds to a 
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """

    #Collection of document term vectors
    documentVectors = []

    #Mapping of vector index to keyword
    vectorKeywordIndex=[]

    #Tidies terms
    parser=None
    
    def __init__(self, documents=[], vectorMode = 'tf'):
        self.documentVectors=[]
        self.parser = Parser()
        self.BlobList = self.getBlobList(documents)
        self.vectorMode = vectorMode
        if(len(documents)>0):
            self.build(documents)
    
    def getBlobList(self, documents): 
        bloblist = []
        for doc in documents:
            wordList = self.parser.tokenise(doc)
            wordList = self.parser.removeStopWords(wordList)
            bloblist.append(tb(" ".join(wordList)))
        return bloblist
            
    def build(self,documents):
        """ Create the vector space for the passed document strings """
        self.vectorKeywordIndex = self.getVectorKeywordIndex(documents)
        self.documentVectors = [self.makeVector(document, self.vectorMode) for document in tqdm(documents)]

        #print(self.vectorKeywordIndex)
        #print(self.documentVectors)


    def getVectorKeywordIndex(self, documentList):
        """ create the keyword associated to the position of the elements within the document vectors """

        #Mapped documents into a single word string	
        vocabularyString = " ".join(documentList)

        vocabularyList = self.parser.tokenise(vocabularyString)
        #print(vocabularyList)
        #print(vocabularyString)
        #Remove common words which have no search value
        vocabularyList = self.parser.removeStopWords(vocabularyList)
        #print(vocabularyList)
        uniqueVocabularyList = util.removeDuplicates(vocabularyList)
        #print(uniqueVocabularyList)
        vectorIndex={}
        offset=0
        #Associate a position with the keywords which maps to the dimension on the vector used to represent this word
        for word in uniqueVocabularyList:
            vectorIndex[word]=offset
            offset+=1
        return vectorIndex  #(keyword:position)


    def makeVector(self, wordString, mode):
        """ @pre: unique(vectorIndex) """
        #Initialise vector with 0's
        vector = [0] * len(self.vectorKeywordIndex)
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)
        tbString = tb(" ".join(wordList))
        #print(wordList)
        if mode == 'tf':
            for word in list(set(wordList)):
                vector[self.vectorKeywordIndex[word]] = tf(word, tbString) #Use simple Term Count Model
            return vector 
        
        if mode == 'tf-idf':
            #print('bloblist:', self.BlobList)
            for word in list(set(wordList)):
                #print('word',word)
                vector[self.vectorKeywordIndex[word]] =  tfidf(word, tbString , self.BlobList) 
                #print('word:',word, 'idf:', idf(word, self.BlobList),  )
                #print('word:', word, 'tf:', tf(word, tbString))
            return vector

    def buildQueryVector(self, termList):
        """ convert query string into a term vector """
        #print(termList)
        #print(self.vectorMode)
        query = self.makeVector(" ".join(termList), self.vectorMode)
        return query

    def related(self,documentId):
        """ find documents that are related to the document indexed by passed Id within the document Vectors"""
        ratings = [util.cosine(self.documentVectors[documentId], documentVector) for documentVector in self.documentVectors]
        #ratings.sort(reverse=True)
        return ratings
    
    def search(self,searchList, mode = 'cos'):
        """ search for documents that match based on a list of terms """
        #print(searchList)
        if type(searchList[0]) == str:
            queryVector = self.buildQueryVector(searchList)
            #print(queryVector)
        else:
            queryVector = searchList
        if mode == 'cos':
            ratings = [util.cosine(queryVector, documentVector) for documentVector in tqdm(self.documentVectors)]
        #ratings.sort(reverse=True)
            return ratings
        if mode == 'eucli':
            ratings = [util.Euclidean(queryVector, documentVector) for documentVector in tqdm(self.documentVectors)]
            return ratings

    def printresult(self,searchlist, files, mode='cos'):
        scoreList = self.search(searchlist, mode = mode)
        for i in np.flip(np.argsort(scoreList))[:10]:
            print( 'document:' , files[i],'score:', scoreList[i])
        return np.flip(np.argsort(scoreList))[:10]
            
    def getFeedbackVector(self, searchList, top10results ,mode = 'cos'):
        queryVector = self.buildQueryVector(searchList)
        feedbackVector = self.buildQueryVector(top10results)
        queryArray = np.array(queryVector)
        feedbackArray = np.array(feedbackVector)
        newQueryVector = list(queryArray + 0.5 * feedbackArray)
        return newQueryVector

In [44]:
documents = []
files = []
for file in os.listdir("./EnglishNews/EnglishNews"):
    if file.endswith(".txt"):
        filename = os.path.join("./EnglishNews/EnglishNews", file)
        files.append(file[:-4])
        with open(filename, encoding="utf-8") as f:
            lines = f.readlines()
            doc = ' '.join(lines)
            doc1 = doc.replace("\n", "")
            documents.append(doc1)

In [62]:
vectorSpace_tf = VectorSpace(documents, 'tf')

100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:12<00:00, 576.27it/s]


In [63]:
query = ["Trump Biden Taiwan China"]

In [64]:
print('Term Frequency (TF) Weighting + Cosine Similarity')
rank_cos = vectorSpace_tf.printresult(query,files,'cos')
print('Term Frequency (TF) Weighting + Euclidean Distance')
rank_euc = vectorSpace_tf.printresult(query,files, mode = 'eucli')

Term Frequency (TF) Weighting + Cosine Similarity


100%|██████████████████████████████████████████████████████████████████████████████| 7034/7034 [01:32<00:00, 76.39it/s]


document: News119356 score: 0.5163977794943223
document: News123256 score: 0.5163977794943223
document: News120265 score: 0.46852128566581813
document: News108578 score: 0.46852128566581813
document: News103117 score: 0.42874646285627205
document: News115594 score: 0.42640143271122094
document: News112667 score: 0.4008918628686366
document: News122919 score: 0.4003203845127178
document: News111959 score: 0.39528470752104733
document: News101763 score: 0.39528470752104733
Term Frequency (TF) Weighting + Euclidean Distance


100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:57<00:00, 123.02it/s]

document: News110871 score: 0.6708203932499369
document: News108482 score: 0.6708203932499369
document: News108964 score: 0.6708203932499369
document: News110141 score: 0.6708203932499369
document: News108940 score: 0.6708203932499369
document: News111696 score: 0.6708203932499369
document: News107883 score: 0.6614378277661477
document: News108270 score: 0.6454972243679028
document: News107832 score: 0.6454972243679028
document: News110401 score: 0.6454972243679028





In [65]:
# Q2
feedBack = []
for i in rank_cos:
    feedBack.append(documents[i])
    feedback = [" ".join(feedBack)]

print('Term Frequency (TF) Weighting + Cosine Similarity')    
newquery = vectorSpace_tf.getFeedbackVector(query, feedback, 'cos')
vectorSpace_tf.printresult(newquery,files,'cos')

print('Term Frequency (TF) Weighting + Euclidean Distance')
vectorSpace_tf.printresult(newquery,files, mode = 'eucli')

Term Frequency (TF) Weighting + Cosine Similarity


100%|██████████████████████████████████████████████████████████████████████████████| 7034/7034 [02:11<00:00, 53.32it/s]


document: News123256 score: 0.5647630786445903
document: News119356 score: 0.5647630786445903
document: News120265 score: 0.5119909520356302
document: News108578 score: 0.5119909520356302
document: News103117 score: 0.4548918488685839
document: News115594 score: 0.45453885681717704
document: News115859 score: 0.4468905173012529
document: News111959 score: 0.4468905173012529
document: News101763 score: 0.4468905173012529
document: News119746 score: 0.4468905173012529
Term Frequency (TF) Weighting + Euclidean Distance


100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:51<00:00, 135.64it/s]

document: News108964 score: 0.7039485425409744
document: News108482 score: 0.7039485425409744
document: News111696 score: 0.7039485425409744
document: News110141 score: 0.7039485425409744
document: News110871 score: 0.7031339977133836
document: News108940 score: 0.7031339977133836
document: News107883 score: 0.6950133455880988
document: News110401 score: 0.6798604395111019
document: News107832 score: 0.6798604395111019
document: News108270 score: 0.6798604395111019





array([2221, 2102, 2957, 2515, 2727, 2214, 1958, 2577, 1949, 2053],
      dtype=int64)

In [None]:
for i in np.flip(np.argsort(scoreList))[:10]:
    print( 'document:' , files[i],'score:', scoreList[i])

In [3]:
vectorSpace_tfidf = VectorSpace(documents ,'tf-idf')
print('TF-IDF Weighting + Cosine Similarity')
vectorSpace_tfidf.printresult(["Trump Biden Taiwan China"],files)
print('TF-IDF Weighting + Euclidean Distance')
vectorSpace_tfidf.printresult(["Trump Biden Taiwan China"],files, mode = 'eucli')

100%|██████████████████████████████████████████████████████████████████████████████| 7034/7034 [25:01<00:00,  4.69it/s]


TF-IDF Weighting + Cosine Similarity


100%|██████████████████████████████████████████████████████████████████████████████| 7034/7034 [01:40<00:00, 69.98it/s]


document: News103134 score: 0.46434426041191357
document: News103767 score: 0.44893938649720727
document: News104913 score: 0.41023815627691995
document: News108813 score: 0.41023815627691995
document: News116613 score: 0.41023815627691995
document: News104914 score: 0.3887053364880285
document: News112714 score: 0.3887053364880285
document: News101014 score: 0.3887053364880285
document: News116634 score: 0.3824226011372703
document: News103728 score: 0.36096936067594315
TF-IDF Weighting + Euclidean Distance


100%|█████████████████████████████████████████████████████████████████████████████| 7034/7034 [00:34<00:00, 203.70it/s]

document: News111696 score: 3.2621792687215017
document: News107883 score: 3.188692831038035
document: News108964 score: 3.1789584035989207
document: News122771 score: 3.166603585904829
document: News110747 score: 3.12086863322629
document: News110141 score: 3.0968146116076833
document: News108940 score: 3.0117984545040595
document: News108482 score: 2.955387623457143
document: News107804 score: 2.9483446300958107
document: News108270 score: 2.9218838035101777





In [53]:
#test data

documents = ["The cat cat in the hat disabled",
                 "A cat is a fine pet ponies.",
                 "Dogs and cats make good pets.",
                 "I haven't got a hat."]

vectorSpace = VectorSpace(documents, 'tf-idf')  # vectorSpace(documents, vectorMode = 'tf' or 'tf-idf') (default is 'tf')


#print(vectorSpace.vectorKeywordIndex)

#print(vectorSpace.documentVectors)

#print(vectorSpace.related(1))

#print(vectorSpace.search(["cat"]))   

vectorSpace.printresult(["cat"])
vectorSpace.printresult(["cat"], mode = 'eucli')  # mode = 'cos' or 'eucli' (default is 'cos')

###################################################

score: nan document: I haven't got a hat.
score: nan document: Dogs and cats make good pets.
score: nan document: A cat is a fine pet ponies.
score: nan document: The cat cat in the hat disabled
score: 1.2345525572306575 document: Dogs and cats make good pets.
score: 1.0216002166437488 document: A cat is a fine pet ponies.
score: 0.7504758415354574 document: The cat cat in the hat disabled
score: 0.28768207245178085 document: I haven't got a hat.


In [6]:
x = np.array([1, 2, 3, 4, 5])
y = np.array([3, 2, 1, 5, 2])
print(x-y)
print((x-y) ** 2)
print(((x-y) ** 2) ** 0.5)

[-2  0  2 -1  3]
[4 0 4 1 9]
[2. 0. 2. 1. 3.]
