In [66]:
import os
import string
from stemming.porter2 import stem
import math

In [2]:
# Read the inverted index back from the txt file and rebuild the dictionary: words_dic
def read_words_indexing():
    """Load the inverted index stored in ``review_indexing.txt`` in the working directory.

    Lines that do not start with a tab are term headers; the term is the text
    before the first ':'. Lines that start with a tab are postings of the form
    ``<doc_ID> : <pos>,<pos>,...`` belonging to the most recent term header.

    Returns:
        dict: ``{word: {doc_ID: [word_pos1, word_pos2, ...]}}`` with integer
        positions.

    Raises:
        ValueError: if a posting line has no " : " separator or a position is
            not an integer.
        OSError: if the index file cannot be opened.
    """
    data_path = os.path.join(os.getcwd(), 'review_indexing.txt')

    words_dic = {}  # key is word, value is {doc_ID: [word_pos1, word_pos2, ...]}
    word = ''
    # The context manager closes the file even if parsing fails; iterating the
    # handle streams the file instead of loading every line at once.
    with open(data_path, 'r', encoding='utf-8') as file_read:
        for line in file_read:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)

            if line[0] != '\t':
                # Term header: only the term is needed, the count after ':' is ignored.
                word = line.split(':', 1)[0].rstrip('\n')
                continue

            # Posting line. Split on the LAST " : " so that a doc_ID which itself
            # contains " : " is kept intact; the position list never contains it.
            doc_ID, separator, positions_text = line.strip().rpartition(" : ")
            if not separator:
                raise ValueError('Malformed posting line: %r' % line)
            word_pos = [int(pos) for pos in positions_text.split(",")]

            # Create the inner dict on first sight of the word, otherwise extend it.
            words_dic.setdefault(word, {})[doc_ID] = word_pos

    return words_dic

# Build the inverted index once at module level: {word: {doc_ID: [positions]}}.
words_dic = read_words_indexing()

In [3]:
# Number of distinct words (top-level keys) in the index.
len(words_dic)

676304

In [95]:
# words_dic['flex']

In [6]:
def load_English_stop_words():
    """Read the English stop-word list from ``data/englishST.txt``.

    The path is resolved relative to the current working directory. Each line
    of the file contributes one entry, with surrounding whitespace stripped.

    Returns:
        list[str]: the stop words in file order.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    data_path = os.path.join(os.getcwd(), 'data', 'englishST.txt')
    # The context manager guarantees the handle is closed once the list is built.
    with open(data_path, "r") as ref_file:
        return [ref_line.strip() for ref_line in ref_file]

In [64]:
def find_df(word):
    """Return the document frequency of ``word``.

    The document frequency is the number of doc_IDs recorded for ``word`` in
    the module-level ``words_dic`` index. Returns None when the word is not
    indexed.
    """
    if word not in words_dic:
        return None
    return len(words_dic[word])

def find_tf(word, doc_ID):
    """Return the term frequency of ``word`` inside document ``doc_ID``.

    The term frequency is the number of positions stored for that
    (word, doc_ID) pair in the module-level ``words_dic`` index. Returns None
    when the word is not indexed or does not occur in that document.
    """
    if word not in words_dic:
        return None
    postings = words_dic[word]
    if doc_ID not in postings:
        return None
    return len(postings[doc_ID])

def simple_search(searched_word):
    """Return the doc_IDs of the reviews in which ``searched_word`` appears.

    The word is lower-cased and stemmed before it is looked up in the
    module-level ``words_dic`` index. Returns a list of doc_IDs, or None when
    the stemmed word is not in the index.
    """
    term = stem(searched_word.lower())
    postings = words_dic.get(term)
    if postings is None:
        return None
    return list(postings.keys())


In [93]:
# TODO: the total number of reviews is hard-coded as a default argument below;
# find a way to expose it as a shared module-level value instead.
def TFIDF_search(line, total_DOC_num=430283, max_result_num=150):
    """Rank reviews against a free-text query using TF-IDF term weighting.

    The query is stripped of punctuation, split on whitespace, lower-cased,
    filtered against the English stop-word list and stemmed. Every document
    containing at least one remaining term gets the score

        sum over query terms t in the document of
            (1 + log10(tf(t, doc))) * log10(total_DOC_num / df(t))

    Query terms that are not in the index are ignored.

    Args:
        line: the search request typed by the user.
        total_DOC_num: total number of reviews in the collection.
        max_result_num: maximum number of results returned; lower-ranked
            results beyond this count are dropped.

    Returns:
        list: ``[[doc_ID, score], ...]`` sorted by descending score, with ties
        broken by doc_ID so the order is reproducible. Empty when no query
        term matches.
    """
    # Delete punctuation (plus the em dash and newlines) in one pass.
    removal_table = str.maketrans('', '', string.punctuation + '—\n')
    line = line.translate(removal_table)

    # split() with no argument discards the empty tokens that repeated spaces
    # (or an empty query) would otherwise produce.
    words = [token.lower() for token in line.split()]

    # Remove English stop words; a set gives constant-time membership tests.
    English_stop_words = set(load_English_stop_words())
    words = [stem(token) for token in words if token not in English_stop_words]

    print('Preprocessed words:', words)

    # Accumulate scores term by term. The index is read directly because the
    # tokens are already stemmed: routing them through simple_search would stem
    # them a second time and could look up a different key than the one scored.
    score_by_doc = {}
    for word in words:
        postings = words_dic.get(word)
        if not postings:
            continue  # term absent from the index contributes nothing
        word_idf = math.log10(total_DOC_num / len(postings))
        for doc_ID, word_pos in postings.items():
            word_TFIDF = (1 + math.log10(len(word_pos))) * word_idf
            score_by_doc[doc_ID] = score_by_doc.get(doc_ID, 0) + word_TFIDF

    DOC_scores = [[doc_ID, score] for doc_ID, score in score_by_doc.items()]
    # Highest score first; doc_ID as secondary key keeps tied results stable.
    DOC_scores.sort(key=lambda pair: (-pair[1], pair[0]))

    return DOC_scores[:max_result_num]

In [94]:
# Example query: prints the preprocessed terms and displays the ranked [doc_ID, score] pairs.
TFIDF_search("feel emotionally connected")

Preprocessed words: ['feel', 'emot', 'connect']


[['days of heaven_1978_rw6192050', 6.206617846286154],
 ['the midnight sky_2020_rw6452954', 6.174935669700188],
 ['tenet_2020_rw6075510', 6.133424433495703],
 ['10 cloverfield lane_2016_rw3429299', 6.1330809350303035],
 ['her_2013_rw6366463', 6.113099558340525],
 ['the social network_2010_rw2489614', 5.9956780067344795],
 ['the willoughbys_2020_rw5774972', 5.953022865129567],
 ['paris, texas_1984_rw5947800', 5.920777585416646],
 ['extremely loud & incredibly close_2011_rw6370677', 5.878465942277133],
 ['her_2013_rw2943654', 5.84988493123444],
 ['noah_2014_rw2988918', 5.824995665133974],
 ['black box_2020_rw6186536', 5.682075534699219],
 ['les misérables_2012_rw2967546', 5.639050254948186],
 ['don jon_2013_rw2882502', 5.599939828472598],
 ['13 hours_2016_rw3457659', 5.593897911268513],
 ['her_2013_rw2944616', 5.579958451782819],
 ['her_2013_rw2994978', 5.576806591197295],
 ['her_2013_rw2937339', 5.457019698037844],
 ['her_2013_rw2944561', 5.422272279982646],
 ['the hunger games: catchin