In [1]:
import os
import pandas as pd
import numpy as np
import re
import nltk.corpus
from unidecode                        import unidecode
from nltk.tokenize                    import word_tokenize
from nltk                             import SnowballStemmer
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
#needed libraries for previous build
from nltk.stem import PorterStemmer
import webcolors
from gensim.parsing.preprocessing import remove_stopwords
import string

In [3]:
Data_Path = os.path.join(os.getcwd(),'Data')

In [4]:
df = pd.read_csv(os.path.join(Data_Path,'Amazon Search Terms_Search Terms_US.csv')).sample(frac=0.01)
df.dropna(inplace=True)

# Optimized Model

## Data PreProcess New Approach

In [5]:
def removeWords(listOfTokens, listOfWords):
    return [token for token in listOfTokens if token not in listOfWords]

def applyStemming(listOfTokens, stemmer):
    return [stemmer.stem(token) for token in listOfTokens]

def twoLetters(listOfTokens):
    twoLetterWord = []
    for token in listOfTokens:
        if len(token) <= 2 or len(token) >= 21:
            twoLetterWord.append(token)
    return twoLetterWord

In [6]:
def processData(Dataset):   
    stopwords = nltk.corpus.stopwords.words('english')
    param_stemmer = SnowballStemmer('english')
    other_words = [line.rstrip('\n') for line in open('lists/stopwords_scrapmaker.txt')] # Load .txt file line by line
    
    for document in Dataset:
        try:
            index = Dataset.index(document)
            Dataset[index] = Dataset[index].replace(u'\ufffd', '8')   # Replaces the ASCII '�' symbol with '8'
            Dataset[index] = Dataset[index].replace(',', '')          # Removes commas
            Dataset[index] = Dataset[index].rstrip('\n')              # Removes line breaks
            Dataset[index] = Dataset[index].casefold()                # Makes all letters lowercase
            
            Dataset[index] = re.sub('\W_',' ', Dataset[index])        # removes specials characters and leaves only words
            Dataset[index] = re.sub("\S*\d\S*"," ", Dataset[index])   # removes numbers and words concatenated with numbers IE h4ck3r. Removes road names such as BR-381.
            Dataset[index] = re.sub("\S*@\S*\s?"," ", Dataset[index]) # removes emails and mentions (words with @)
            Dataset[index] = re.sub(r'http\S+', '', Dataset[index])   # removes URLs with http
            Dataset[index] = re.sub(r'www\S+', '', Dataset[index])    # removes URLs with www
            Dataset[index] = re.sub(r'www\S+', '', Dataset[index])    # removes URLs with www
        except:
            print(index)
        listOfTokens = word_tokenize(Dataset[index])
        twoLetterWord = twoLetters(listOfTokens)

        listOfTokens = removeWords(listOfTokens, stopwords)
        listOfTokens = removeWords(listOfTokens, twoLetterWord)
        listOfTokens = removeWords(listOfTokens, other_words)
        
        listOfTokens = applyStemming(listOfTokens, param_stemmer)
        listOfTokens = removeWords(listOfTokens, other_words)

        Dataset[index]   = " ".join(listOfTokens)
        Dataset[index] = unidecode(Dataset[index])

    return Dataset

In [7]:
products =  []
zipped = zip(df['#1 Product Title'].to_list() , df['#2 Product Title'].to_list() , df['#3 Product Title'].to_list())
for item in zipped:
    x,y,z = item[0],item[1],item[2]
    products.append(x)
    products.append(y)
    products.append(z)
products_raw = products.copy()

In [8]:
products = [ele for ele in products if ele is not None]
products = processData(products)

## Vectorization

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(products)
tf_idf_X = pd.DataFrame(data = X.toarray(), columns=vectorizer.get_feature_names_out())

In [10]:
def OPtimized_Search_system(KNN_model: NearestNeighbors,Search_String : list,products = products):
    Search_q = processData(Search_String)
    Search_vectorized = vectorizer.transform(Search_q)
    NNs = KNN_model.kneighbors(Search_vectorized, return_distance=True)
    top = NNs[1][0][1:]
    recommendation = pd.DataFrame(columns = ['Search Term',  'Results', 'score'])
    count = 0
    index_score = NNs[0][0][1:]
    for i in top:
        recommendation.at[count, 'Search Term'] = Search_String[0]
        recommendation.at[count, 'Results'] = products[i]
        recommendation.at[count, 'score'] =  index_score[count]
        count += 1
    recommendation.index +=1
    return recommendation.loc[:,]

In [11]:
KNN = NearestNeighbors(
    n_neighbors= 6,
    )
KNN.fit(tf_idf_X)

NearestNeighbors(n_neighbors=6)

# Previous Model

## Previous PreProcess Function

In [12]:
def stemSentence(sentence):
    porter = PorterStemmer()
    token_words = word_tokenize(sentence)
    stem_sentence = [porter.stem(word) for word in token_words]
    return ' '.join(stem_sentence)
def Clean_Sequence(X=X):
    product = X
    product = [remove_stopwords(str(x))\
            .translate(str.maketrans('','',string.punctuation))\
            .translate(str.maketrans('','',string.digits))\
            for x in products]
    
    product = pd.Series([stemSentence(str(x)) for x in products])    
    colors = list(webcolors.CSS3_NAMES_TO_HEX)
    colors = [stemSentence(str(x)) for x in colors if x not in ('bisque','blanchedalmond','chocolate','honeydew','lime',
                                             'olive','orange','plum','salmon','tomato','wheat')]

    
    product = [' '.join([x for x in str(string).split() if x not in colors]) for string in products]

    return product

In [13]:
products_2 = Clean_Sequence(products_raw.copy())

## Previous Vectorization

In [14]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_productid = tfidf_vectorizer.fit_transform(products_2)

## Search Engine Functions

In [15]:
def Search_system(Search_String):
    Search_q = Clean_Sequence([Search_String])
    Search_tfidf = tfidf_vectorizer.transform(Search_q)
    return [Search_tfidf,Search_String]

In [16]:
def get_recommendation(top, df_all, scores, Search_String):
    recommendation = pd.DataFrame(columns = ['Search Term',  'Results Previous Model', 'score_2'])
    count = 0
    for i in top:
        recommendation.at[count, 'Search Term'] = Search_String
        recommendation.at[count, 'Results Previous Model'] = df_all[i]
        recommendation.at[count, 'score_2'] =  scores[count]
        count += 1
    recommendation.index += 1
    return recommendation.loc[:5,]

In [17]:
KNN_2 = NearestNeighbors(n_neighbors = 6)
KNN_2.fit(tfidf_productid)

NearestNeighbors(n_neighbors=6)

In [18]:
def previous_model_Search(ST:str):
    Searched = Search_system(ST)
    Search_tfidf,Search_string = Searched[0],Searched[1]
    NNs = KNN_2.kneighbors(Search_tfidf, return_distance=True)
    top = NNs[1][0][1:]
    index_score = NNs[0][0][1:]
    return get_recommendation(top, products_raw, index_score, Search_string)

# Comparing Results

In [19]:
Search_input = input('what do you want to search for?')
final_df_1 = OPtimized_Search_system(
    KNN_model = KNN,
    Search_String=[Search_input],
    products= products_raw
)
final_df_2 = previous_model_Search(Search_input)
pd.concat([final_df_1,final_df_2['Results Previous Model'],final_df_2['score_2']],axis = 1)



Unnamed: 0,Search Term,Results,score,Results Previous Model,score_2
1,book,"After: The After Series, Book 1",0.743758,5 LB - Ultra Clear Glycerin Soap Base by velon...,0.0
2,book,Before (The After Series Book 5),0.743758,Pifito Clear Melt and Pour Soap Base (2 lb) │ ...,0.874603
3,book,The Quiet Book padded board book,0.75322,NF,1.0
4,book,The Ultimate Bar Book: The Comprehensive Guide...,0.76426,Blue on Black,1.0
5,book,The Comfort Book,0.778027,Near You,1.0
