In [33]:
from pprint import pprint
from Parser import Parser
import util
import tfidf
import glob, os
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb

class VectorSpace:
    """ A algebraic model for representing text documents as vectors of identifiers. 
    A document is represented as a vector. Each dimension of the vector corresponds to a 
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """

    #Collection of document term vectors
    documentVectors = []

    #Mapping of vector index to keyword
    vectorKeywordIndex=[]

    #Tidies terms
    parser=None
    
    def __init__(self, documents=[]):
        self.documentVectors=[]
        self.parser = Parser()
        self.BlobList = self.getBlobList(documents)
        
        if(len(documents)>0):
            self.build(documents)
    
    def getBlobList(self, documents): 
        bloblist = []
        for doc in documents:
            wordList = self.parser.tokenise(doc)
            wordList = self.parser.removeStopWords(wordList)
            bloblist.append(tb(" ".join(wordList)))
        return bloblist
            
    def build(self,documents):
        """ Create the vector space for the passed document strings """
        self.vectorKeywordIndex = self.getVectorKeywordIndex(documents)
        self.documentVectors = [self.makeVector(document) for document in documents]

        #print(self.vectorKeywordIndex)
        #print(self.documentVectors)


    def getVectorKeywordIndex(self, documentList):
        """ create the keyword associated to the position of the elements within the document vectors """

        #Mapped documents into a single word string	
        vocabularyString = " ".join(documentList)

        vocabularyList = self.parser.tokenise(vocabularyString)
        #Remove common words which have no search value
        vocabularyList = self.parser.removeStopWords(vocabularyList)
        uniqueVocabularyList = util.removeDuplicates(vocabularyList)

        vectorIndex={}
        offset=0
        #Associate a position with the keywords which maps to the dimension on the vector used to represent this word
        for word in uniqueVocabularyList:
            vectorIndex[word]=offset
            offset+=1
        return vectorIndex  #(keyword:position)


    def makeVector(self, wordString, mode = 'tf-idf'):
        """ @pre: unique(vectorIndex) """
        #Initialise vector with 0's
        vector = [0] * len(self.vectorKeywordIndex)
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)
        tbString = tb(" ".join(wordList))
        if mode == 'tf':
            for word in list(set(wordList)):
                vector[self.vectorKeywordIndex[word]] = self.tf(word, tbString) #Use simple Term Count Model
            return vector 
        
        if mode == 'tf-idf':
            print('bloblist:', self.BlobList)
            for word in list(set(wordList)):
                vector[self.vectorKeywordIndex[word]] =  self.idf(word, self.BlobList) 
                print('word:',word, 'idf:',self.idf(word, self.BlobList),  )
            return vector
            
    
    def tf(self, word, blob):
        return blob.words.count(word) / len(blob.words)
    
    def n_containing(self, word, bloblist):
        print()
        return sum(1 for blob in bloblist if word in blob.words)

    def idf(self, word, bloblist):
        print('總文件數量：', len(bloblist), '含有{}字的文章數量+1:'.format(word), (1 + self.n_containing(word, bloblist)))
        print('{}的idf為:'.format(word), math.log(len(bloblist) / (1 + self.n_containing(word, bloblist))))
        return math.log(len(bloblist) / (1 + self.n_containing(word, bloblist)))

    def tfidf(self, word, blob, bloblist):
        return tf(word, blob) * idf(word, bloblist)

    def old_makeVector(self, wordString):
        """ @pre: unique(vectorIndex) """

        #Initialise vector with 0's
        vector = [0] * len(self.vectorKeywordIndex)
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)
        for word in wordList:
            vector[self.vectorKeywordIndex[word]] += 1; #Use simple Term Count Model
        return vector


    def buildQueryVector(self, termList):
        """ convert query string into a term vector """
        query = self.makeVector(" ".join(termList))
        return query


    def related(self,documentId):
        """ find documents that are related to the document indexed by passed Id within the document Vectors"""
        ratings = [util.cosine(self.documentVectors[documentId], documentVector) for documentVector in self.documentVectors]
        #ratings.sort(reverse=True)
        return ratings


    def search(self,searchList):
        """ search for documents that match based on a list of terms """
        queryVector = self.buildQueryVector(searchList)
        #print(queryVector)
        ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
        #ratings.sort(reverse=True)
        return ratings

In [34]:
#test data

documents = ["The cat cat in the hat disabled",
                 "A cat is a fine pet ponies.",
                 "Dogs and cats make good pets.",
                 "I haven't got a hat."]

vectorSpace = VectorSpace(documents)

#print(vectorSpace.vectorKeywordIndex)

print(vectorSpace.documentVectors)

#print(vectorSpace.related(1))

#print(vectorSpace.search(["cat"]))

###################################################

bloblist: [TextBlob("cat cat hat disabl"), TextBlob("cat fine pet poni"), TextBlob("dog cat make good pet"), TextBlob("hat")]

總文件數量： 4 含有cat字的文章數量+1: 4

cat的idf為: 0.0


總文件數量： 4 含有cat字的文章數量+1: 4

cat的idf為: 0.0

word: cat idf: 0.0

總文件數量： 4 含有hat字的文章數量+1: 3

hat的idf為: 0.28768207245178085


總文件數量： 4 含有hat字的文章數量+1: 3

hat的idf為: 0.28768207245178085

word: hat idf: 0.28768207245178085

總文件數量： 4 含有disabl字的文章數量+1: 2

disabl的idf為: 0.6931471805599453


總文件數量： 4 含有disabl字的文章數量+1: 2

disabl的idf為: 0.6931471805599453

word: disabl idf: 0.6931471805599453
bloblist: [TextBlob("cat cat hat disabl"), TextBlob("cat fine pet poni"), TextBlob("dog cat make good pet"), TextBlob("hat")]

總文件數量： 4 含有cat字的文章數量+1: 4

cat的idf為: 0.0


總文件數量： 4 含有cat字的文章數量+1: 4

cat的idf為: 0.0

word: cat idf: 0.0

總文件數量： 4 含有fine字的文章數量+1: 2

fine的idf為: 0.6931471805599453


總文件數量： 4 含有fine字的文章數量+1: 2

fine的idf為: 0.6931471805599453

word: fine idf: 0.6931471805599453

總文件數量： 4 含有pet字的文章數量+1: 3

pet的idf為: 0.28768207245178085


總文件數量

In [36]:
math.log(4/3) /math.log(10)

0.1249387366082999

In [56]:
with open("./EnglishNews/EnglishNews/News100012.txt", encoding = 'utf-8') as f:
    lines = f.readlines()
    doc = ' '.join(lines)
    doc1 = doc.replace("\n", "")

print(doc1)


'''
documents = []
bloblist = []
for file in os.listdir("./EnglishNews/EnglishNews"):
    if file.endswith(".txt"):
        filename = os.path.join("./EnglishNews/EnglishNews", file)
        bloblist.append(filename)
        with open(filename, encoding="utf-8") as f:
            lines = f.readlines()
            doc = ' '.join(lines)
            doc1 = doc.replace("\n", "")
            documents.append(doc1)
'''             
    

Trump aide who stormed the Capitol broke an 'oath to protect America,' judge says WASHINGTON (Reuters) - A man appointed to the U.S. State Department during the Trump administration will remain in jail while he awaits trial on charges that he took part in the deadly storming of the U.S. Capitol and assaulted police officers, a judge said on Tuesday.


'\ndocuments = []\nbloblist = []\nfor file in os.listdir("./EnglishNews/EnglishNews"):\n    if file.endswith(".txt"):\n        filename = os.path.join("./EnglishNews/EnglishNews", file)\n        bloblist.append(filename)\n        with open(filename, encoding="utf-8") as f:\n            lines = f.readlines()\n            doc = \' \'.join(lines)\n            doc1 = doc.replace("\n", "")\n            documents.append(doc1)\n'

In [51]:
query = tb("Trump Biden Taiwan China")
document0 = tb(documents[0])
for word in query.words:
    if word not in document0.words:
        vector = document0 + ' '+ word
print(vector)
#doc_que = " ".join(document0 + query)
#print(doc_que)

Trump aide who stormed the Capitol broke an 'oath to protect America,' judge says WASHINGTON (Reuters) - A man appointed to the U.S. State Department during the Trump administration will remain in jail while he awaits trial on charges that he took part in the deadly storming of the U.S. Capitol and assaulted police officers, a judge said on Tuesday. China


In [126]:
'''
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    
    return tf(word, blob) * idf(word, bloblist)

document1 = tb("""Python is a 2000 made-for-TV horror movie directed by Richard Clabaugh. The film features several cult favorite actors, including William Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy, Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean Whalen. The film concerns a genetically engineered snake, a python, that escapes and unleashes itself on a small town. It includes the classic final girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles, California and Malibu, California. Python was followed by two sequels: Python II (2002) and Boa vs. Python (2004), both also made-for-TV films.""")
document2 = tb("""Python, from the Greek word, is a genus of nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are recognised.[2] A member of this genus, P. reticulatus, is among the longest snakes known.""")
document3 = tb("""The Colt Python is a .357 Magnum caliber revolver formerly manufactured by Colt's Manufacturing Company of Hartford, Connecticut.  It is sometimes referred to as a "Combat Magnum".[1] It was first introduced in 1955, the same year as Smith &amp; Wesson's M29 .44 Magnum. The now discontinued Colt Python targeted the premium revolver market segment. Some firearm collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy Thompson, Renee Smeets and Martin Dougherty have described the Python as the finest production revolver ever made.""")


bloblist = [document1, document2, document3]

for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words} # 計算該文字的tf-idf值
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

print(scores)
'''

Top words in document 1
	Word: python, TF-IDF: 0.01662
	Word: films, TF-IDF: 0.00997
	Word: made-for-TV, TF-IDF: 0.00665
Top words in document 2
	Word: genus, TF-IDF: 0.02253
	Word: 2, TF-IDF: 0.02253
	Word: from, TF-IDF: 0.01126
Top words in document 3
	Word: Colt, TF-IDF: 0.01367
	Word: Magnum, TF-IDF: 0.01367
	Word: revolver, TF-IDF: 0.01367
{'The': 0.0, 'Colt': 0.013667363194657226, 'Python': -0.009697148509610592, 'is': -0.006464765673073728, 'a': -0.006464765673073728, '357': 0.0045557877315524084, 'Magnum': 0.013667363194657226, 'caliber': 0.0045557877315524084, 'revolver': 0.013667363194657226, 'formerly': 0.0045557877315524084, 'manufactured': 0.0045557877315524084, 'by': 0.0, "'s": 0.009111575463104817, 'Manufacturing': 0.0045557877315524084, 'Company': 0.0045557877315524084, 'of': -0.003232382836536864, 'Hartford': 0.0045557877315524084, 'Connecticut': 0.0045557877315524084, 'It': 0.0, 'sometimes': 0.0045557877315524084, 'referred': 0.0045557877315524084, 'to': 0.00455578773