In [1]:
import re
import spacy
from nltk.corpus import stopwords

class TextCleaner:
    """Text pre-processing pipeline: normalise, drop stopwords, lemmatize.

    An instance holds the spaCy ``en_core_web_sm`` pipeline (``self.nlp``)
    and the NLTK English stopword list as a set (``self.stop_words``).
    """

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Lowercase ``raw_text`` and blank out noise with single spaces.

        Each of the following is replaced by one space: ``@handle`` mentions,
        any character other than an ASCII letter, digit, space or tab,
        ``scheme://...`` URLs, a leading ``rt``, and ``http`` plus the one
        character after it. Runs of spaces are NOT collapsed here.
        """
        lowered = raw_text.lower()
        noise_pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        return re.sub(noise_pattern, " ", lowered)

    def remove_stopwords(self, raw_text):
        """Return ``raw_text`` with stopwords dropped, re-joined by single spaces."""
        kept = []
        for candidate in raw_text.split():
            if candidate in self.stop_words:
                continue
            kept.append(candidate)
        return " ".join(kept)

    def lemmatize(self, raw_text):
        """Run the spaCy pipeline on ``raw_text`` and return the list of lemmas."""
        lemmas = []
        for tok in self.nlp(raw_text):
            lemmas.append(tok.lemma_)
        return lemmas

    def clean(self, raw_text):
        """Full pipeline: normalize, then remove stopwords, then lemmatize.

        Returns the list of lemmas produced by ``lemmatize``.
        """
        normalized = self.normalize(raw_text)
        without_stopwords = self.remove_stopwords(normalized)
        return self.lemmatize(without_stopwords)

# Inverted Index Search from Indexed data

In [43]:
import pandas as pd
import ast

# Load the two CSV inputs used by the search functions in this cell.
# Inverted index: the code below reads its 'Gram' and 'DocsID_Dict' columns.
# index_col=0 makes the first CSV column the DataFrame index (not a data column).
invertedIndex = pd.read_csv('invertedIndex.csv', index_col=0)

# Web page data: the code below reads its 'ID' and 'URL' columns.
webData = pd.read_csv('webData.csv', index_col=0)

def get_common_id(lists):
    """Return the elements that occur in every collection of ``lists``.

    Args:
        lists: A sequence of iterables (lists, sets, or dicts - a dict
            contributes its keys).

    Returns:
        A list of the common elements in arbitrary order. An empty
        ``lists`` yields ``[]`` instead of raising ``IndexError``.
    """
    if len(lists) == 0:
        # Nothing to intersect (e.g. the query produced no tokens).
        return []
    # Seed the running intersection with the first collection.
    common_data = set(lists[0])
    for lst in lists[1:]:
        common_data.intersection_update(lst)
        if not common_data:
            # The intersection can only shrink, so stop once it is empty.
            break
    return list(common_data)

def return_url_by_id(id, web_data=None):
    """Look up the URL stored for a document ID.

    Args:
        id: Value compared against the 'ID' column.
        web_data: Optional DataFrame with 'ID' and 'URL' columns. Defaults
            to the module-level ``webData`` frame.

    Returns:
        The 'URL' value of the first row whose 'ID' equals ``id``, or the
        string 'No ID found in webData' when no row matches (or a required
        column is missing).
    """
    frame = webData if web_data is None else web_data
    try:
        matches = frame.loc[frame['ID'] == id, 'URL']
    except KeyError:
        # Same sentinel the original returned for any KeyError.
        return 'No ID found in webData'
    if matches.empty:
        return 'No ID found in webData'
    # Select by position: a plain [0] is a *label* lookup on an integer index
    # and a deprecated positional fallback on any other index.
    return matches.iloc[0]
    
# dict_str = invertedIndex.loc[invertedIndex['Gram'] == 'leica', 'DocsID_Dict'][0]

# Input Query convert to Token
# Lazily created TextCleaner shared by all query_clean calls, so the spaCy
# model is loaded once rather than on every query.
_QUERY_CLEANER = None


def query_clean(input_str, cleaner=None):
    """Convert a raw query string into a list of lemmatized tokens.

    Args:
        input_str: The user's query.
        cleaner: Optional object exposing ``clean(text)``. Defaults to a
            shared ``TextCleaner`` created on first use.

    Returns:
        The token list produced by ``cleaner.clean(input_str)`` (normalize,
        remove stopwords, lemmatize), or the string 'Input Error' when
        ``input_str`` is not a string or is empty.
    """
    global _QUERY_CLEANER
    if not isinstance(input_str, str) or input_str == "":
        return 'Input Error'
    if cleaner is None:
        if _QUERY_CLEANER is None:
            _QUERY_CLEANER = TextCleaner()
        cleaner = _QUERY_CLEANER
    return cleaner.clean(input_str)

# Token to ID list
def token_to_match_list_list(token_list, inverted_index=None):
    """Fetch the parsed 'DocsID_Dict' entry for each token.

    Args:
        token_list: Iterable of tokens to look up in the 'Gram' column.
        inverted_index: Optional DataFrame with 'Gram' and 'DocsID_Dict'
            columns. Defaults to the module-level ``invertedIndex`` frame.

    Returns:
        A list with one entry per token, in order: the ``ast.literal_eval``
        result of the first matching row's 'DocsID_Dict' text, or an empty
        dict when the token is not in the index (so intersecting the
        results yields no matches instead of raising).
    """
    index_frame = invertedIndex if inverted_index is None else inverted_index
    res_temp = []
    for token in token_list:
        postings = index_frame.loc[index_frame['Gram'] == token, 'DocsID_Dict']
        if postings.empty:
            # Unknown token: no document can contain it.
            res_temp.append({})
        else:
            # Positional access; a plain [0] would be a label lookup on an
            # integer index.
            res_temp.append(ast.literal_eval(postings.iloc[0]))
    return res_temp

def search_inverted_index(inputQuery):
    """Print the URLs of the documents that match every token of a query.

    The query is tokenized with ``query_clean``, each token's posting is
    fetched with ``token_to_match_list_list``, the postings are intersected
    with ``get_common_id`` and each resulting ID is printed as a URL via
    ``return_url_by_id``.

    Args:
        inputQuery: Raw query text.

    Returns:
        None. Output is printed: the result count followed by one URL per
        line, or the error text from ``query_clean`` for invalid input.
    """
    tokens = query_clean(inputQuery)
    if isinstance(tokens, str):
        # query_clean signals invalid input with a sentinel string; iterating
        # it would treat every character as a search token.
        print(tokens)
        return
    if tokens:
        matched_ids = get_common_id(token_to_match_list_list(tokens))
    else:
        # Query reduced to nothing (e.g. only stopwords): no results.
        matched_ids = []
    print('Total ', len(matched_ids), " Results")
    for doc_id in matched_ids:
        print(return_url_by_id(doc_id))

# Sanity check: inspect the 'DocsID_Dict' entry stored for the token 'grain'.
# .iloc[0] takes the first matching row by position; a plain [0] is a label
# lookup on an integer index and a deprecated positional fallback otherwise.
dict_str = invertedIndex.loc[invertedIndex['Gram'] == 'grain', 'DocsID_Dict'].iloc[0]

# The value is stored as text; literal_eval parses it without executing code.
dict_result = ast.literal_eval(dict_str)

# Last expression of the cell, so the notebook displays the keys.
dict_result.keys()

In [None]:
# Interactive search loop: keep prompting until the user types "!ex".
while True:
    # Give the prompt text directly to input(). Wrapping it in print() passes
    # print's return value (None) as the prompt, which is displayed as "None".
    inputQuery = input("Input Search : ")
    if inputQuery == "!ex":
        # Exit command: leave the loop without running a search.
        break
    print("Searching : ", inputQuery)
    search_inverted_index(inputQuery)

print("\nEXIT . . .")

Input Search : 


None Sony Digital Cameras


Searching :  Sony Digital Cameras
Total  45  Results
https://www.35mmc.com/30/01/2016/nikon-l35af-review/
https://www.35mmc.com/20/11/2020/photographing-little-things-on-film-by-jordi-fradera/
https://www.35mmc.com/30/12/2022/shooting-infrared-with-a-rangefinder-camera-rollei-infrared-film-and-a-retina-iic-by-tony-warren/
https://www.35mmc.com/12/07/2019/agfa-karat-36-review/
https://www.35mmc.com/30/12/2022/shooting-infrared-with-a-rangefinder-camera-rollei-infrared-film-and-a-retina-iic-by-tony-warren/
https://www.35mmc.com/17/10/2021/a-cinematic-photographer-discovering-new-horizons-by-nicola-armento/
https://www.35mmc.com/08/07/2021/seeking-control-through-simplifying-the-digital-experience-by-vic-mortelmans/
https://www.35mmc.com/18/07/2020/the-yashica-124g-and-how-it-helped-me-get-married-by-sacha-cloutier/
https://www.35mmc.com/29/12/2019/panasonic-gm1-a-digital-ilc-alternative-to-the-olympus-xa2-by-charles-higham/
https://www.35mmc.com/30/08/2021/sony-a7iii-mini-review-finding-