
# TF-IDF Testing

In [154]:
import time
# import math for log
import math

import re
import spacy
from nltk.corpus import stopwords

import pandas as pd
import ast

class TextCleaner:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Remove special characters and lowercase text"""
        return re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", raw_text.lower())

    def remove_stopwords(self, raw_text):
        """Remove stopwords"""
        words = [word for word in raw_text.split() if word not in self.stop_words]
        return " ".join(words)

    def lemmatize(self, raw_text):
        """Perform lemmatization"""
        doc = self.nlp(raw_text)
        return [token.lemma_ for token in doc]

    def clean(self, raw_text):
        """Clean text by normalizing, removing stopwords, and lemmatizing"""
        raw_text = self.normalize(raw_text)
        raw_text = self.remove_stopwords(raw_text)
        return self.lemmatize(raw_text)


class InvertedIndexSearch:
    def __init__(self):
        self.invertedIndex = pd.read_csv('csv/invertedIndex_v1.csv', index_col=0)
        self.webData = pd.read_csv('csv/webData_new.csv', index_col=0)
    
    def get_common_id(self, lists):
        common_data = set(lists[0])
        for lst in lists[1:]:
            common_data.intersection_update(set(lst))
        return list(common_data)

    def return_url_by_id(self, id):    
        try:
            return self.webData.loc[self.webData['ID'] == id, 'URL'][0]
        except KeyError:
            return 'No ID found in webData'

    def query_clean(self, input_str):
        if (input_str != "") and (type(input_str) == str):
            text_cleaner = TextCleaner()
            normalized_text = text_cleaner.normalize(input_str)
            no_stopwords_text = text_cleaner.remove_stopwords(normalized_text)
            lemmatized_text = text_cleaner.lemmatize(no_stopwords_text)
            return lemmatized_text
        else:
            return 'Input Error'

    def token_to_match_list_list(self, token_list):
        res_temp = []
        for token in token_list:
            res_temp.append( ast.literal_eval(self.invertedIndex.loc[self.invertedIndex['Gram'] == token, 'DocsID_Dict'][0]) )
        return res_temp

    def search_inverted_index(self, inputQuery):
        inp_query_list = self.query_clean(inputQuery)
        output_temp = self.get_common_id(self.token_to_match_list_list(inp_query_list))
        # print('Total ', len(output_temp), " Results")
        # for id in output_temp:
        #     # return as URLs
        #     # print(self.return_url_by_id(id))
        #     # return as ID
        #     print(id)
        return output_temp


In [155]:
class TfIdf:
    def __init__(self):
        pass

    def termFreq(self, term_inDocs, term_total):
        """Get term Frequency (No. of term in docs / total term in docs) """
        return term_inDocs / term_total

    def inverseDocsFreq(self, numDocsContain, totalDocs):
        """Get Inverse Document Frequency log(total Document / num of docs contain term) """
        return math.log(totalDocs / numDocsContain)

    def get_tf_idf(self, term_inDocs, term_total, numDocsContain, totalDocs):
        """Get TF-IDF"""
        return self.inverseDocsFreq(numDocsContain, totalDocs) * self.termFreq(term_inDocs, term_total)


In [158]:
start_time = time.time()

inputQuery = input(print("Input Search Query : "))
inverted_index = InvertedIndexSearch()
term_list = inverted_index.query_clean(inputQuery)
print( "Got Keyword : " )
print(term_list)

# got list of filtered document containing keywords
docs_with_words = inverted_index.search_inverted_index(inputQuery)

tfidf = TfIdf()

invertedIndex = pd.read_csv('csv/invertedIndex_v1.csv', index_col=0)
invertedIndex['DocsID_Dict'] = invertedIndex['DocsID_Dict'].apply(lambda x: ast.literal_eval(x))
webData = pd.read_csv('csv/webData_new.csv', index_col=0)

# Total Web
totalDocs = webData.index.size

# Result Ranking Dict
ranked_list = {}

# Loop through each id
for link in docs_with_words:
    # Loop through each terms
    score = 0
    term_inDocs = webData.loc[webData['ID'] == link, 'totalTerm'][0]
    for term in term_list:
        term_total = invertedIndex.loc[invertedIndex['Gram'] == term, 'DocsID_Dict'][0][link]
        numDocsContain = invertedIndex.loc[invertedIndex['Gram'] == term, 'DocsFreq'][0]
        score += tfidf.get_tf_idf(term_inDocs, term_total, numDocsContain, totalDocs)
        
    # print('id : ', link, ' score : ', score)
    ranked_list[link] = score

    
# Get ranked result
ranked_list = dict(sorted(ranked_list.items(), key=lambda item: item[1], reverse=True))

end_time = time.time()

print("Total ", len(ranked_list), " Result")
print("Search Time : ", end_time - start_time, " sec")
# Print ranked result
for key in ranked_list.keys():
    print(inverted_index.return_url_by_id(key))

Input Search Query : 


None pentax film point and shoot camera


Got Keyword : 
['pentax', 'film', 'point', 'shoot', 'camera']
Total  123  Result
Search Time :  17.443414449691772
https://www.35mmc.com/01/03/2015/leica-iiic-competition-entries-121-140/
https://www.35mmc.com/06/06/2022/hiking-the-uncanny-valley-by-don-goodman-wilson/
https://www.35mmc.com/06/06/2022/hiking-the-uncanny-valley-by-don-goodman-wilson/
https://www.35mmc.com/13/07/2018/petri-1-9-color-corrected-super-type-iii-rangefinder-review/
https://www.35mmc.com/01/06/2020/shooting-5x4-with-connie-reminding-myself-how-easy-large-format-photography-is/
https://www.35mmc.com/01/06/2020/shooting-5x4-with-connie-reminding-myself-how-easy-large-format-photography-is/
https://www.35mmc.com/31/05/2019/minolta-hi-matic-7-review/
https://www.35mmc.com/19/11/2021/fuji-discovery-875-zoom-plus-attention-kmart-shoppers-or-a-discount-department-store-photographic-history-by-shawn-granton/
https://www.35mmc.com/19/11/2021/fuji-discovery-875-zoom-plus-attention-kmart-shoppers-or-a-discount-department

# Playground

In [137]:
webData.loc[webData['ID'] == 1, 'totalTerm'][0]

404

In [138]:
webData

Unnamed: 0,ID,totalTerm,URL
0,1,404,https://www.35mmc.com/26/02/2021/panorama-wide...
0,2,333,https://www.35mmc.com/20/05/2016/halina-af700-...
0,3,962,https://www.35mmc.com/15/08/2022/ricoh-mirai-o...
0,4,433,https://www.35mmc.com/28/10/2020/pf-micro-110-...
0,5,1279,https://www.35mmc.com/05/06/2020/fujifilm-zoom...
...,...,...,...
0,1089,472,https://www.35mmc.com/27/06/2018/olympus-mju-i...
0,1090,299,https://www.35mmc.com/16/12/2018/konica-z-up-1...
0,1091,579,https://www.35mmc.com/10/07/2015/konica-mermai...
0,1092,919,https://www.35mmc.com/14/12/2020/pentax-espio-...


In [142]:
invertedIndex.loc[invertedIndex['Gram'] == 'point', 'DocsID_Dict'][0][]

4

In [100]:
def termFreq(term_inDocs, term_total):
    """Get term Frequency"""
    return term_inDocs / term_total    

In [103]:
def inverseDocsFreq(numDocsContain, totalDocs):
    """Get Inverse Document Frequency"""
    return math.log(totalDocs / numDocsContain)    

In [None]:
def get_tf_idf(term_inDocs, term_total, numDocsContain, totalDocs):
    """Get TF-IDF"""
    return inverseDocsFreq(numDocsContain, totalDocs) * termFreq(term_inDocs, term_total)

In [77]:
import math
import pandas as pd
import ast

#list of input keywords
keywords = ['leica', 'point', 'shoot', 'film', 'rangefinder', 'camera', 'street', 'photography']

#list of document IDs that contain all keywords
document_ids = inverted_index.search_inverted_index(inputQuery)

#pandas dataframe with columns 'Gram', 'DocsFreq', and 'DocsID_Dict'
df = pd.read_csv('csv/invertedIndex_v1.csv')
df['DocsID_Dict'] = df['DocsID_Dict'].apply(lambda x: ast.literal_eval(x))

#initialize dictionary for final scores
final_scores = {}

#loop through keywords
for keyword in keywords:
    #get number of documents containing keyword
    docs_freq = df[df['Gram'] == keyword]['DocsFreq'].values[0]
    #get dictionary of document IDs and term frequency of keyword in each document
    docs_id_dict = df[df['Gram'] == keyword]['DocsID_Dict'].values[0]
    #initialize dictionary for TF-IDF scores for this keyword
    tfidf_scores = {}
    #loop through document IDs
    for doc_id in document_ids:
        #get term frequency of keyword in this document
        tf = docs_id_dict[doc_id]
        #calculate TF-IDF
        tfidf = tf * math.log(len(document_ids) / docs_freq)
        #add to TF-IDF scores dictionary
        tfidf_scores[doc_id] = tfidf
    #loop through TF-IDF scores dictionary and add to final scores dictionary
    for doc_id in tfidf_scores:
        if doc_id in final_scores:
            final_scores[doc_id] += tfidf_scores[doc_id]
        else:
            final_scores[doc_id] = tfidf_scores[doc_id]

#sort final scores dictionary in descending order based on values
final_scores = {k: v for k, v in sorted(final_scores.items(), key=lambda item: item[1], reverse=True)}

#get ranked list of document IDs
ranked_docs = list(final_scores.keys())

print(ranked_docs)


[538, 450, 569, 78, 120, 296, 565, 250, 458, 525, 1049, 624, 628, 930, 232, 365, 329, 716, 339, 301, 383, 147, 916, 124, 135, 142, 961, 206, 218, 492, 271, 1040, 1058, 186, 630, 903, 604, 111, 293, 427, 334, 638, 48, 83, 992, 203]


In [61]:
import ast
df = pd.read_csv('csv/invertedIndex_v1.csv', index_col=0)
# inp['DocsID_Dict'].tolist()[0]
df['DocsID_Dict'] = df['DocsID_Dict'].apply(lambda x: ast.literal_eval(x))

In [78]:
import math
import pandas as pd
import ast

class TfIdfRanking:
    def __init__(self, keywords, document_ids, dataframe):
        self.keywords = keywords
        self.document_ids = document_ids
        self.df = dataframe
        self.df['DocsID_Dict'] = self.df['DocsID_Dict'].apply(lambda x: ast.literal_eval(x))
        self.final_scores = {}

    def rank_docs(self):
        for keyword in self.keywords:
            docs_freq = self.df[self.df['Gram'] == keyword]['DocsFreq'].values[0]
            docs_id_dict = self.df[self.df['Gram'] == keyword]['DocsID_Dict'].values[0]
            tfidf_scores = {}
            for doc_id in self.document_ids:
                tf = docs_id_dict[doc_id]
                tfidf = tf * math.log(len(self.document_ids) / docs_freq)
                tfidf_scores[doc_id] = tfidf
            for doc_id in tfidf_scores:
                if doc_id in self.final_scores:
                    self.final_scores[doc_id] += tfidf_scores[doc_id]
                else:
                    self.final_scores[doc_id] = tfidf_scores[doc_id]
        self.final_scores = {k: v for k, v in sorted(self.final_scores.items(), key=lambda item: item[1], reverse=True)}
        return list(self.final_scores.keys())


#list of input keywords
inputQuery = "leica point and shoot film rangefinder camera street photography"
inverted_index = InvertedIndexSearch()
term_list = inverted_index.query_clean(inputQuery)

#list of document IDs that contain all keywords
document_ids = inverted_index.search_inverted_index(inputQuery)

#pandas dataframe with columns 'Gram', 'DocsFreq', and 'DocsID_Dict'
df = pd.read_csv('csv/invertedIndex_v1.csv')

ranking = TfIdfRanking(term_list, document_ids, df)
ranked_docs = ranking.rank_docs()
print(ranked_docs)

[538, 450, 569, 78, 120, 296, 565, 250, 458, 525, 1049, 624, 628, 930, 232, 365, 329, 716, 339, 301, 383, 147, 916, 124, 135, 142, 961, 206, 218, 492, 271, 1040, 1058, 186, 630, 903, 604, 111, 293, 427, 334, 638, 48, 83, 992, 203]


import pandas as pd
df = pd.read_csv('csv/invertedIndex_v1.csv')
df.assign('totalTerms')