# Inverted Index Search from Indexed data

In [2]:
import re
import spacy
from nltk.corpus import stopwords

import pandas as pd
import ast

# Load the two CSV tables that the search functions below read as globals.
# Both are read with their first CSV column as the DataFrame index.
INVERTED_INDEX_CSV = 'csv/invertedIndex_v1.csv'
WEB_DATA_CSV = 'csv/webData_v0.csv'

# Inverted index: the 'Gram' and 'DocsID_Dict' columns are used for lookups.
invertedIndex = pd.read_csv(INVERTED_INDEX_CSV, index_col=0)
# Crawled page data: the 'ID' and 'URL' columns are used for lookups.
webData = pd.read_csv(WEB_DATA_CSV, index_col=0)

class TextCleaner:
    """Three-step text pipeline: normalize, drop stopwords, lemmatize."""

    def __init__(self):
        # spaCy's small English pipeline supplies the lemmas; NLTK supplies
        # the English stopword list, held as a set for membership tests.
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Lowercase ``raw_text`` and blank out unwanted spans.

        Each of the following is replaced by a single space: ``@`` followed
        by ASCII letters/digits, any character that is not an ASCII letter,
        digit, space or tab, ``scheme://...`` sequences up to the next
        whitespace, ``rt`` at the very start of the text, and ``http``
        together with the character after it. Runs of spaces are not
        collapsed by this step.
        """
        lowered = raw_text.lower()
        unwanted = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        return re.sub(unwanted, " ", lowered)

    def remove_stopwords(self, raw_text):
        """Return the whitespace-split words of ``raw_text`` that are not
        stopwords, joined back together with single spaces."""
        return " ".join(
            candidate
            for candidate in raw_text.split()
            if candidate not in self.stop_words
        )

    def lemmatize(self, raw_text):
        """Run ``raw_text`` through spaCy and return one lemma per token."""
        return [tok.lemma_ for tok in self.nlp(raw_text)]

    def clean(self, raw_text):
        """Apply normalize, remove_stopwords and lemmatize in that order and
        return the resulting list of lemmas."""
        normalized = self.normalize(raw_text)
        without_stopwords = self.remove_stopwords(normalized)
        return self.lemmatize(without_stopwords)


def get_common_id(lists):
    """Return the elements that appear in every collection in ``lists``.

    Args:
        lists: A list of iterables. When an element is a dict, its keys are
            the values that take part in the intersection.

    Returns:
        A list of the common elements, without duplicates and in no
        particular order. An empty ``lists`` yields an empty list.
    """
    # With nothing to intersect there can be no common element; the original
    # indexed lists[0] unconditionally and raised IndexError here.
    if not lists:
        return []
    # set.intersection accepts arbitrary iterables, so the remaining
    # collections do not need to be converted to sets first.
    return list(set(lists[0]).intersection(*lists[1:]))

def return_url_by_id(id):
    """Look up the URL stored in ``webData`` for a document ID.

    Args:
        id: Value compared against the 'ID' column of ``webData``.

    Returns:
        The 'URL' value of the first row whose 'ID' equals ``id``, or the
        string 'No ID found in webData' when no row matches.
    """
    matching_urls = webData.loc[webData['ID'] == id, 'URL']
    if matching_urls.empty:
        return 'No ID found in webData'
    # Take the first match by position. Plain ``[0]`` on a Series is a label
    # lookup, which only succeeds when the matching row's index label is 0.
    return matching_urls.iloc[0]

# Convert an input query into a list of tokens
def query_clean(input_str):
    """Turn a raw query string into a list of lemmatized tokens.

    Args:
        input_str: The query text typed by the user.

    Returns:
        The list of lemmas produced by ``TextCleaner.clean`` (normalize,
        remove stopwords, lemmatize), or the string 'Input Error' when
        ``input_str`` is empty or not a string.
    """
    if not isinstance(input_str, str) or input_str == "":
        return 'Input Error'
    # Building a TextCleaner loads the spaCy model, so keep one instance on
    # the function object and reuse it instead of reloading per query.
    cleaner = getattr(query_clean, "_cleaner", None)
    if cleaner is None:
        cleaner = query_clean._cleaner = TextCleaner()
    return cleaner.clean(input_str)

# Look up the index entry of every token
def token_to_match_list_list(token_list):
    """Fetch the parsed 'DocsID_Dict' entry of each token from ``invertedIndex``.

    Args:
        token_list: Iterable of tokens compared against the 'Gram' column.

    Returns:
        A list with one entry per token, in the same order. Each entry is
        the 'DocsID_Dict' cell of the first matching row, parsed with
        ``ast.literal_eval`` (presumably a dict keyed by document ID --
        confirm against the index builder). A token that is not in the index
        contributes an empty dict, so intersecting the entries yields no
        documents rather than raising.
    """
    entries = []
    for token in token_list:
        cells = invertedIndex.loc[invertedIndex['Gram'] == token, 'DocsID_Dict']
        if cells.empty:
            # Unknown token: no document contains it.
            entries.append({})
            continue
        # Positional access: plain ``[0]`` on a Series is a label lookup and
        # fails unless the matching row happens to carry index label 0.
        entries.append(ast.literal_eval(cells.iloc[0]))
    return entries

def search_inverted_index(inputQuery):
    """Search the inverted index and print the URL of every matching document.

    The query is tokenized with ``query_clean``; a document matches when it
    appears in the index entry of every token. Prints the number of results
    followed by one URL per line.

    Args:
        inputQuery: The raw query text.
    """
    tokens = query_clean(inputQuery)
    if isinstance(tokens, str):
        # query_clean reports invalid input as a message string; show it
        # instead of iterating over its characters as if they were tokens.
        print(tokens)
        return
    # A query made only of stopwords yields no tokens; treat it as no results
    # rather than intersecting an empty list of entries.
    matched_ids = get_common_id(token_to_match_list_list(tokens)) if tokens else []
    print('Total ', len(matched_ids), " Results")
    for doc_id in matched_ids:
        print(return_url_by_id(doc_id))

In [4]:
inputQuery = ""

# Interactive search loop: keep prompting until the user types "!ex".
while inputQuery != "!ex":
    # Hand the prompt text to input() directly. Wrapping it in print() made
    # input() receive print's return value and show "None" as the prompt.
    inputQuery = input("Input Search : ")
    print("Searching : ", inputQuery)
    if inputQuery != "!ex":
        search_inverted_index(inputQuery)

print("\nEXIT . . .")

Input Search : 


None pentax medium format


Searching :  pentax medium format
Total  13  Results
https://www.35mmc.com/25/11/2017/rollei-35-cameras-review/
https://www.35mmc.com/11/01/2019/minox-35-gt-guest-review/
https://www.35mmc.com/11/01/2019/minox-35-gt-guest-review/
https://www.35mmc.com/08/11/2020/my-journey-to-infrared-photography-by-markus-hofstatter/
https://www.35mmc.com/03/06/2021/my-journey-shooting-85-analog-cameras-and-writing-a-book-about-it-by-christof-bircher/
https://www.35mmc.com/03/06/2021/my-journey-shooting-85-analog-cameras-and-writing-a-book-about-it-by-christof-bircher/
https://www.35mmc.com/01/07/2017/making-sony-a7rii-work/
https://www.35mmc.com/11/01/2019/minox-35-gt-guest-review/
https://www.35mmc.com/08/11/2020/my-journey-to-infrared-photography-by-markus-hofstatter/
https://www.35mmc.com/31/05/2019/minolta-hi-matic-7-review/
https://www.35mmc.com/22/08/2021/montoggio-a-chronicle-of-an-absence-by-salvatore-da-cha/
https://www.35mmc.com/02/05/2022/the-descent-of-a-film-photographer-part-1-from-colo

None !ex


Searching :  !ex

EXIT . . .


# Class form

In [16]:
import re
import spacy
from nltk.corpus import stopwords

import pandas as pd
import ast

class TextCleaner:
    """Prepare raw text for index lookup: normalize, filter stopwords, lemmatize."""

    def __init__(self):
        # Load the small English spaCy pipeline and NLTK's English stopwords.
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Return ``raw_text`` lowercased with unwanted spans replaced by spaces.

        A single space replaces each of: ``@`` plus following ASCII
        letters/digits, any character outside ASCII letters, digits, space
        and tab, ``scheme://...`` up to the next whitespace, a leading
        ``rt``, and ``http`` plus the one character after it. Consecutive
        spaces are left as they are.
        """
        pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        text = raw_text.lower()
        return re.sub(pattern, " ", text)

    def remove_stopwords(self, raw_text):
        """Drop every whitespace-separated word that is in the stopword set
        and rejoin the remaining words with single spaces."""
        kept = []
        for word in raw_text.split():
            if word in self.stop_words:
                continue
            kept.append(word)
        return " ".join(kept)

    def lemmatize(self, raw_text):
        """Return the spaCy lemma of each token in ``raw_text`` as a list."""
        parsed = self.nlp(raw_text)
        return [token.lemma_ for token in parsed]

    def clean(self, raw_text):
        """Run the full pipeline (normalize, then remove_stopwords, then
        lemmatize) and return the list of lemmas."""
        return self.lemmatize(self.remove_stopwords(self.normalize(raw_text)))


class InvertedIndexSearch:
    """AND-search over an inverted index loaded from CSV files.

    ``invertedIndex`` is queried through its 'Gram' and 'DocsID_Dict'
    columns and ``webData`` through its 'ID' and 'URL' columns.
    """

    def __init__(self):
        self.invertedIndex = pd.read_csv('csv/invertedIndex_v1.csv', index_col=0)
        self.webData = pd.read_csv('csv/webData_v0.csv', index_col=0)
        # Created on first use by query_clean(); building a TextCleaner loads
        # the spaCy model, so it is reused across queries.
        self._text_cleaner = None

    def get_common_id(self, lists):
        """Return the elements that appear in every collection in ``lists``.

        Args:
            lists: A list of iterables. For dict elements the keys take part
                in the intersection.

        Returns:
            A list of the common elements, without duplicates and in no
            particular order. An empty ``lists`` yields an empty list.
        """
        # Nothing to intersect (e.g. the query held only stopwords).
        if not lists:
            return []
        return list(set(lists[0]).intersection(*lists[1:]))

    def return_url_by_id(self, id):
        """Return the 'URL' of the first ``webData`` row whose 'ID' equals
        ``id``, or 'No ID found in webData' when no row matches."""
        matching_urls = self.webData.loc[self.webData['ID'] == id, 'URL']
        if matching_urls.empty:
            return 'No ID found in webData'
        # Positional access: plain ``[0]`` on a Series is a label lookup and
        # only succeeds when the matching row's index label is 0.
        return matching_urls.iloc[0]

    def query_clean(self, input_str):
        """Turn a raw query string into a list of lemmatized tokens.

        Returns:
            The list of lemmas from ``TextCleaner.clean`` (normalize, remove
            stopwords, lemmatize), or the string 'Input Error' when
            ``input_str`` is empty or not a string.
        """
        if not isinstance(input_str, str) or input_str == "":
            return 'Input Error'
        if self._text_cleaner is None:
            self._text_cleaner = TextCleaner()
        return self._text_cleaner.clean(input_str)

    def token_to_match_list_list(self, token_list):
        """Fetch the parsed 'DocsID_Dict' entry of each token.

        Args:
            token_list: Iterable of tokens compared against the 'Gram' column.

        Returns:
            One entry per token, in order: the 'DocsID_Dict' cell of the
            first matching row parsed with ``ast.literal_eval`` (presumably a
            dict keyed by document ID -- confirm against the index builder).
            A token missing from the index contributes an empty dict, so an
            intersection over the entries yields no documents.
        """
        entries = []
        for token in token_list:
            cells = self.invertedIndex.loc[
                self.invertedIndex['Gram'] == token, 'DocsID_Dict'
            ]
            if cells.empty:
                # Unknown token: no document contains it.
                entries.append({})
                continue
            entries.append(ast.literal_eval(cells.iloc[0]))
        return entries

    def search_inverted_index(self, inputQuery):
        """Return the IDs of the documents that contain every query token.

        Args:
            inputQuery: The raw query text.

        Returns:
            A list of document IDs in no particular order; empty when the
            query is invalid, yields no tokens, or nothing matches. Use
            ``return_url_by_id`` to map an ID to its URL.
        """
        inp_query_list = self.query_clean(inputQuery)
        # query_clean reports invalid input as a message string; do not
        # iterate over its characters as if they were tokens.
        if isinstance(inp_query_list, str):
            return []
        return self.get_common_id(self.token_to_match_list_list(inp_query_list))
            

In [17]:
inputQuery = "leica point and shoot film rangefinder camera street photography"

inverted_index = InvertedIndexSearch()
# IDs of the documents whose index entries contain every token of the query
# (the query is normalized, stopword-filtered and lemmatized before lookup)
docs_with_words = inverted_index.search_inverted_index(inputQuery)



[903,
 135,
 525,
 142,
 271,
 1040,
 147,
 916,
 1049,
 538,
 1058,
 930,
 293,
 296,
 427,
 301,
 48,
 565,
 569,
 186,
 961,
 450,
 329,
 458,
 203,
 716,
 78,
 206,
 334,
 83,
 339,
 218,
 604,
 992,
 232,
 492,
 365,
 111,
 624,
 628,
 630,
 120,
 250,
 124,
 638,
 383]