In [70]:
import re
import spacy
from nltk.corpus import stopwords

class TextCleaners:
    """Text-preparation pipeline for inverted indexing.

    The three stages (normalize, stop-word removal, lemmatization) can be
    called individually or chained through ``clean``.
    """

    def __init__(self):
        # spaCy pipeline used by ``lemmatize``.
        self.nlp = spacy.load("en_core_web_sm")
        # NLTK's English stop-word list, held as a set for membership tests.
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, raw_text):
        """Lowercase the text and blank out unwanted spans.

        Every match of the pattern is replaced by a single space: @mentions,
        any character that is not an ASCII letter, digit, space or tab,
        ``scheme://...`` URLs, a leading "rt", and "http" plus the one
        character that follows it.
        """
        lowered = raw_text.lower()
        noise_pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        return re.sub(noise_pattern, " ", lowered)

    def remove_stopwords(self, raw_text):
        """Drop stop words and re-join the remaining tokens with single spaces."""
        kept = filter(lambda token: token not in self.stop_words, raw_text.split())
        return " ".join(kept)

    def lemmatize(self, raw_text):
        """Run the spaCy pipeline and return each token's lemma as a list of strings."""
        lemmas = []
        for token in self.nlp(raw_text):
            lemmas.append(token.lemma_)
        return lemmas

    def clean(self, raw_text):
        """Apply normalize -> remove_stopwords -> lemmatize and return the lemma list."""
        return self.lemmatize(self.remove_stopwords(self.normalize(raw_text)))


In [71]:
# Search the database for URLs matching a query and return the ranked URLs.

# from cleanRawText import *

import sqlite3

# Name of the SQLite database file.
file_name = 'database_elt_test1.db'
# Windows-style relative path to that file. Each backslash is written as '\\'
# because '\d' is not a valid string escape sequence.
database_file = 'project\\database\\' + file_name

class invertedIndexSearch:
    """Search web-page IDs and URLs in a SQLite database, inverted-index style.

    The class reads two tables: ``Inverted_Index`` (looked up by ``Word``; the
    third column of a row is evaluated to a dict whose keys are used as web
    IDs) and ``web_Data`` (``URL`` looked up by ``Web_ID``).
    """

    def __init__(self, database_file):
        """Create the text cleaner and open the database.

        Args:
            database_file: Path passed to ``sqlite3.connect``.
        """
        self.tc = TextCleaners()
        self.conn = sqlite3.connect(database_file)
        self.curr = self.conn.cursor()

    def close(self):
        """Close the cursor and the underlying database connection."""
        self.curr.close()
        self.conn.close()

    def queryCleaner(self, query):
        """Clean the user's query and drop repeated words.

        Args:
            query: Raw query string.

        Returns:
            The cleaned words in first-occurrence order, without duplicates.
        """
        # dict.fromkeys keeps insertion order and de-duplicates in one pass.
        return list(dict.fromkeys(self.tc.clean(query)))

    def getInvertedIndexDict(self, word_list):
        """Fetch the web IDs stored for each word.

        Args:
            word_list: Words to look up in ``Inverted_Index``.

        Returns:
            A list with one entry per word: the list of keys of that word's
            stored dict, or an empty list when the word has no row.
        """
        id_lists = []
        for word in word_list:
            # Bound parameter instead of string formatting: quotes in the word
            # can neither break the statement nor inject SQL.
            self.curr.execute(
                "SELECT * FROM Inverted_Index WHERE Word = ?", (str(word),)
            )
            row = self.curr.fetchone()
            if row is None:
                # Word is not indexed: contribute no IDs instead of crashing.
                id_lists.append([])
                continue
            # NOTE(review): eval() executes whatever text is stored in the third
            # column. If that column only ever holds a dict literal,
            # ast.literal_eval is the safe replacement -- confirm the stored format.
            id_lists.append(list(eval(row[2]).keys()))
        return id_lists

    def get_common_id(self, lists):
        """Return the IDs present in every list.

        Args:
            lists: A list of ID lists, as returned by ``getInvertedIndexDict``.

        Returns:
            The intersection as a list (order not guaranteed); an empty list
            when ``lists`` itself is empty.
        """
        if not lists:
            # No query words left (e.g. empty query): nothing can match.
            return []
        common_data = set(lists[0])
        for lst in lists[1:]:
            common_data.intersection_update(lst)
        return list(common_data)

    def search_full_process(self, user_query):
        """Run the whole search for a query and return the matching web IDs.

        Prints the raw query and the cleaned query along the way.
        """
        print("Searching Query : ", user_query)
        list_query = self.queryCleaner(user_query)
        print("Cleaned Query : ", list_query)
        id_lists = self.getInvertedIndexDict(list_query)
        print("Results : ")
        return self.get_common_id(id_lists)

    def Link_from_ID(self, id_list):
        """Look up the URL row for each web ID.

        Args:
            id_list: Web IDs to resolve.

        Returns:
            One entry per ID, in order: the row returned by ``fetchone()``
            (a 1-tuple holding the URL) or ``None`` when the ID is unknown.
        """
        url_rows = []
        for ids in id_list:
            # The ID is bound as text, matching the quoted literal the query
            # was previously built with.
            self.curr.execute(
                "SELECT URL FROM web_Data WHERE Web_ID = ?", (str(ids),)
            )
            url_rows.append(self.curr.fetchone())
        return url_rows
            
            
            
        

In [72]:
# Build the searcher. Note that it is given file_name (the bare file name),
# not the database_file path defined earlier.
iis = invertedIndexSearch(file_name)

In [73]:
# Step-by-step search: clean the query, fetch the web IDs for each word,
# then keep only the IDs shared by every word.
list_query = iis.queryCleaner("Digital Camera")
print(list_query)
temp_dict = iis.getInvertedIndexDict(list_query)  # a list of ID lists, one per query word
iis.get_common_id(temp_dict)

['digital', 'camera']


[1, 2, 3]

In [76]:
# Same search through the single entry point; the bare `a` displays the returned IDs.
a = iis.search_full_process("Digital Camera")
a

Searching Query :  Digital Camera
Cleaned Query :  ['digital', 'camera']
Results : 


[1, 2, 3]