# Inverted Index Search from Indexed data

In [36]:
import re
import spacy
from nltk.corpus import stopwords

import pandas as pd
import ast

# Load the two CSV tables that the search functions in this file query.
# In both files the first column is used as the DataFrame index.
# Earlier path note kept for reference:
# OneDrive/Documents/GitHub/swdev02_work/project/temp_src/invertedIndex_v0.csv

# Inverted index table: looked up by its 'Gram' column to read 'DocsID_Dict'.
invertedIndex = pd.read_csv(
    'temp_src/invertedIndex_v1.csv',
    index_col=0,
)

# Web data table: looked up by its 'ID' column to read 'URL'.
webData = pd.read_csv(
    'temp_src/webData_v0.csv',
    index_col=0,
)

class TextCleaner:
    """Three-step text pipeline: normalize, drop stopwords, lemmatize."""

    def __init__(self):
        # spaCy pipeline used for lemmatization.
        self.nlp = spacy.load("en_core_web_sm")
        # NLTK English stopwords, stored as a set for fast membership tests.
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Lowercase the text and replace unwanted spans with a space.

        Replaced spans: ``@`` followed by alphanumerics, any character that
        is not a letter, digit, space or tab, ``scheme://...`` URLs, a
        leading ``rt``, and ``http`` plus the character after it. Each match
        becomes one space, so the result can contain runs of spaces.
        """
        lowered = raw_text.lower()
        noise = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        return re.sub(noise, " ", lowered)

    def remove_stopwords(self, raw_text):
        """Return the whitespace-split words not in the stopword set, space-joined."""
        kept = filter(lambda word: word not in self.stop_words, raw_text.split())
        return " ".join(kept)

    def lemmatize(self, raw_text):
        """Run the spaCy pipeline on the text and return each token's lemma."""
        lemmas = []
        for token in self.nlp(raw_text):
            lemmas.append(token.lemma_)
        return lemmas

    def clean(self, raw_text):
        """Apply normalize, remove_stopwords and lemmatize in that order."""
        return self.lemmatize(self.remove_stopwords(self.normalize(raw_text)))


def get_common_id(lists):
    """Return the items that appear in every collection of ``lists``.

    Args:
        lists: Sequence of iterables of hashable items. Dicts are accepted;
            their keys are the items compared.

    Returns:
        A list of the items common to all inputs, in arbitrary order.
        An empty ``lists`` gives ``[]`` rather than raising IndexError.
    """
    if not lists:
        # Nothing to intersect: no collection means no common item.
        return []
    first, *rest = lists
    # set.intersection accepts any number of iterables in one call.
    return list(set(first).intersection(*rest))

def return_url_by_id(id, web_data=None):
    """Return the 'URL' of the first row whose 'ID' column equals ``id``.

    Args:
        id: Value compared against the 'ID' column.
        web_data: Optional DataFrame with 'ID' and 'URL' columns. Defaults to
            the module-level ``webData`` table.

    Returns:
        The matching URL, or the string 'No ID found in webData' when no row
        matches or a required column is missing.
    """
    table = webData if web_data is None else web_data
    try:
        matches = table.loc[table['ID'] == id, 'URL']
    except KeyError:
        return 'No ID found in webData'
    if matches.empty:
        return 'No ID found in webData'
    # .iloc[0] picks the first match by position. Plain [0] is a label lookup
    # on an integer index and would miss rows whose index label is not 0.
    return matches.iloc[0]

# Shared TextCleaner instance, created on first use by _get_text_cleaner().
_text_cleaner = None


def _get_text_cleaner():
    """Return the shared TextCleaner, building it on the first call."""
    global _text_cleaner
    if _text_cleaner is None:
        # Building a TextCleaner loads the spaCy model, so do it only once
        # instead of once per query.
        _text_cleaner = TextCleaner()
    return _text_cleaner


def query_clean(input_str):
    """Convert a query string into a list of cleaned tokens.

    Args:
        input_str: The raw query text.

    Returns:
        The list produced by ``TextCleaner.clean`` (normalize, remove
        stopwords, lemmatize), or the string 'Input Error' when ``input_str``
        is not a string or is empty.
    """
    if not isinstance(input_str, str) or input_str == "":
        return 'Input Error'
    return _get_text_cleaner().clean(input_str)

def token_to_match_list_list(token_list, inverted_index=None):
    """Look up each token in the inverted index and collect its stored entry.

    Args:
        token_list: Iterable of tokens compared against the 'Gram' column.
        inverted_index: Optional DataFrame with 'Gram' and 'DocsID_Dict'
            columns. Defaults to the module-level ``invertedIndex`` table.

    Returns:
        A list with one element per token, in input order: the Python literal
        parsed from the token's 'DocsID_Dict' cell (presumably a dict keyed
        by document ID; confirm against the index file). A token that is not
        in the index contributes an empty dict, so intersecting the results
        yields no matches instead of raising.
    """
    table = invertedIndex if inverted_index is None else inverted_index
    res_temp = []
    for token in token_list:
        rows = table.loc[table['Gram'] == token, 'DocsID_Dict']
        if rows.empty:
            # Unknown token: no document contains it.
            res_temp.append({})
            continue
        # .iloc[0] is positional; plain [0] is a label lookup on an integer
        # index and fails unless the row label happens to be 0.
        res_temp.append(ast.literal_eval(rows.iloc[0]))
    return res_temp

def search_inverted_index(inputQuery):
    """Print the URLs of the documents that match every token of the query.

    The query is tokenized with ``query_clean``, each token is looked up with
    ``token_to_match_list_list``, the results are intersected with
    ``get_common_id``, and one URL per common ID is printed.

    Args:
        inputQuery: The raw query text.
    """
    tokens = query_clean(inputQuery)
    if isinstance(tokens, str):
        # query_clean reports invalid input as a message string. Print it
        # instead of iterating its characters as if they were tokens.
        print(tokens)
        return
    if not tokens:
        # Nothing left after cleaning (for example, only stopwords).
        print('Total ', 0, " Results")
        return
    matching_ids = get_common_id(token_to_match_list_list(tokens))
    print('Total ', len(matching_ids), " Results")
    for doc_id in matching_ids:
        print(return_url_by_id(doc_id))

In [37]:
# Interactive search loop: read queries until the user types "!ex".
inputQuery = ""

while inputQuery != "!ex":
    # Pass the prompt text straight to input(). Wrapping it in print() made
    # input() receive None, so the prompt was displayed as "None".
    inputQuery = input("Input Search : ")
    print("Searching : ", inputQuery)
    if inputQuery != "!ex":
        search_inverted_index(inputQuery)

print("\nEXIT . . .")

Input Search : 


None medium format


Searching :  medium format
Total  90  Results
https://www.35mmc.com/08/10/2016/ricoh-ff9s-review/
https://www.35mmc.com/12/06/2021/200-miles-by-bike-in-california-12-photos-from-a-lomo-lca-120-by-eric-norris/
https://www.35mmc.com/10/04/2018/5-frames-lomography-minitar-art-lens-barnaby-nutt/
https://www.35mmc.com/21/09/2020/nine-homemade-cameras-my-pinhole-journey-pt-1-by-sroyon-mukherjee/
https://www.35mmc.com/15/11/2019/fuji-natura-1600-shooting-star-trails-and-at-night/
https://www.35mmc.com/10/04/2018/5-frames-lomography-minitar-art-lens-barnaby-nutt/
https://www.35mmc.com/03/06/2021/my-journey-shooting-85-analog-cameras-and-writing-a-book-about-it-by-christof-bircher/
https://www.35mmc.com/24/12/2018/panomicron-oxygen-review/
https://www.35mmc.com/13/11/2020/shooting-darkroom-paper-in-camera-a-beginners-guide/
https://www.35mmc.com/01/07/2017/making-sony-a7rii-work/
https://www.35mmc.com/18/01/2021/3-ways-to-make-contact-sheets-and-what-they-can-tell-us-darkroom-technique-part-6-b

None !ex


Searching :  !ex

EXIT . . .


# Scratch lookup: read the 'DocsID_Dict' cell stored for the gram 'grain'.
# .iloc[0] takes the first matching row by position; plain [0] is a label
# lookup on an integer index and fails unless that row's label is 0.
dict_str = invertedIndex.loc[invertedIndex['Gram'] == 'grain', 'DocsID_Dict'].iloc[0]

# The cell holds a Python literal as text; parse it without executing code.
dict_result = ast.literal_eval(dict_str)

dict_result.keys()

# Write the current run ID into a small Python module, then import it back.
run_id = 0

# The with-block closes the file even if the write raises.
with open("ScrapeDataStatus.py", "w") as f:
    f.write("CurrentID = " + str(run_id))

# The generated module defines only CurrentID, so import it by name.
from ScrapeDataStatus import CurrentID

print(CurrentID)

import os.path

# Check whether 'webData.csv' exists; a relative path is resolved against
# the current working directory. The result is the value of the expression.
web_data_csv = 'webData.csv'
os.path.exists(web_data_csv)