# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import nltk
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer as CV
import re
from nltk.tokenize import word_tokenize
import warnings
import pickle
warnings.filterwarnings("ignore")

In [2]:
# Load the scraped product table from the working directory.
# `main` is the frame later used to look up and display the recommended rows.
main = pd.read_csv('Preprocessed_amazonScrape.csv')

##### Some camera accessories were tagged as `camera`; rows tagged `camera` with a price below 6999 are reclassified as `camera accessory` here

In [3]:
# Rows tagged 'camera' but priced under 6999 are relabelled as accessories.
is_low_priced_camera = (main['item'] == 'camera') & (main['Price'] < 6999)
main.loc[is_low_priced_camera, 'item'] = 'camera accessory'

In [4]:
# Display the first row to inspect the available columns.
main.head(1)

Unnamed: 0.1,Unnamed: 0,Title,Price,Link,Image URL,Description,item,positive_feedback,critical_feedback,positive_top_words,critical_top_words
0,0,Fujifilm Instax Camera Mini 99 Premium Edition...,18999,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/51Rqiit2b9...,Take a snap and you are ready to create.; Colo...,camera,['I have now tested the Instax Mini extensivel...,['the newest of fujis instax range the is a b...,"['great', 'high', 'easy']","['lightdarkness', 'able', 'attached']"


A copy of the main DataFrame (`df`) goes through feature engineering and text preprocessing, while the results are displayed using the raw `main` DataFrame.

In [5]:
# Working copy: all derived columns are added to `df`, leaving `main` unchanged by the steps below.
df = main.copy()

## Feature engineering and text preprocessing

In [6]:
# Keyword bundle per price tier. The matching bundle is stored in each product's
# 'price_category' column and later concatenated into the product text
# (presumably so that price-related words in a search query can match the tier).
price_dict = {
    'expensive': "expensive, Costly, Pricey, High-priced, High-cost, Premium, Fancy, High-end, Valuable, branded, no matter the cost",
    'moderate': "Mid-range, Moderate, Median-priced, Standard-priced, Medium-priced, Reasonably-priced, Fair-priced, Normal-priced, Typical-priced",
    'cheap':"cheap, Inexpensive, Affordable, Low-cost, economic, Economical, Budget-friendly, Wallet-friendly, discount, Discounted, Low-priced, Pocket friendly"
}

# Distinct values of the 'item' column; price tiers are computed separately for each one.
item_types = df['item'].unique()

def cost_classifier(item_type, frame=None, labels=None):
    """Tag every product of one item type with a price-tier keyword bundle.

    The tier is relative to the other products of the same ``item_type``:

    * price <= 25th percentile                    -> ``labels['cheap']``
    * 25th percentile < price <= 75th percentile  -> ``labels['moderate']``
    * anything else (including a missing price)   -> ``labels['expensive']``

    The result is written in place to the ``'price_category'`` column of
    ``frame``; rows of other item types are left untouched.

    Parameters
    ----------
    item_type : object
        Value of the ``'item'`` column selecting the group to classify.
    frame : pandas.DataFrame, optional
        Frame with ``'item'`` and ``'Price'`` columns. Defaults to the
        notebook-level ``df``.
    labels : dict, optional
        Mapping with the keys ``'cheap'``, ``'moderate'`` and ``'expensive'``.
        Defaults to the notebook-level ``price_dict``.

    Returns
    -------
    None
    """
    if frame is None:
        frame = df
    if labels is None:
        labels = price_dict

    in_group = frame['item'] == item_type
    if not in_group.any():
        # Nothing to classify; do not create or touch the output column.
        return

    group_prices = frame.loc[in_group, 'Price']
    item_quantile_25 = group_prices.quantile(0.25)
    item_quantile_75 = group_prices.quantile(0.75)

    # Boolean masks replace the per-row iterrows() loop: the whole group is
    # classified in three assignments instead of one write per row.
    prices = frame['Price']
    is_cheap = in_group & (prices <= item_quantile_25)
    is_moderate = in_group & (prices > item_quantile_25) & (prices <= item_quantile_75)
    # Everything else in the group falls through to 'expensive'; this includes
    # rows whose price is NaN, because both comparisons above are False for NaN.
    is_expensive = in_group & ~(is_cheap | is_moderate)

    frame.loc[is_cheap, 'price_category'] = labels['cheap']
    frame.loc[is_moderate, 'price_category'] = labels['moderate']
    frame.loc[is_expensive, 'price_category'] = labels['expensive']


# Assign a price tier to every product, one item type at a time.
for item_type in item_types:
    cost_classifier(item_type)

In [7]:
def clean_description(x):
    """Replace every ';' in a description with a single space.

    Values that are not strings (for example a missing description) are
    returned unchanged.
    """
    if not isinstance(x, str):
        return x
    return x.replace(';', ' ')
# Overwrite 'Description' on the working copy with the ';'-free version.
df['Description'] = df['Description'].apply(clean_description)

In [8]:
def concat_strings(x):
    """Join a row's item, description and price-category text into one string.

    ``x`` is a row-like mapping with the keys 'item', 'Description' and
    'price_category'. Values that are not strings are skipped. The pieces are
    separated by single spaces and the result is stripped of leading and
    trailing whitespace.
    """
    columns = ('item', 'Description', 'price_category')
    values = [x[name] for name in columns]
    pieces = [value + ' ' for value in values if isinstance(value, str)]
    return ''.join(pieces).strip()

# Row-wise (axis=1): build the combined 'text' column from item, description and price keywords.
df['text'] = df.apply(concat_strings, axis=1)

In [9]:
from bs4 import BeautifulSoup
def preprocess(q):
    """Normalise free text (item, description, price keywords) into plain lowercase words.

    Steps, in the order they are applied:

    1. Missing values become ``''``; other non-strings are stringified.
    2. HTML markup is removed with BeautifulSoup (``lxml`` parser). This has to
       run while ``<`` and ``>`` are still present in the text.
    3. Emoji and pictographs are removed.
    4. ``%``, ``$``, ``₹``, ``€`` and ``@`` are spelled out as words. This has
       to run before punctuation is stripped, otherwise the symbols are already
       gone and the replacements never fire.
    5. Remaining punctuation, digits and underscores are removed.
    6. The text is lowercased, and any leftover non-word character (hyphens,
       whitespace, newlines) becomes a single space.

    Digits are dropped entirely in step 5, so no thousand / million / billion
    abbreviation is applied to this text.

    Parameters
    ----------
    q : str
        Raw text. ``None`` / NaN are accepted and yield ``''``.

    Returns
    -------
    str
        Cleaned, lowercased text.
    """
    if not isinstance(q, str):
        q = '' if pd.isna(q) else str(q)

    # Removing HTML tags (must precede punctuation stripping)
    q = BeautifulSoup(q, 'lxml').get_text()

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    q = emoji_pattern.sub('', q)

    # Replace certain special characters with their string equivalents
    # (must precede punctuation stripping, which would delete them)
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # Drop everything that is not a word character, whitespace or hyphen,
    # plus all digits and underscores. Raw string: \w, \s and \d are regex
    # escapes, not Python string escapes.
    q = re.sub(r"[^\w\s-]|\d|_", "", q)
    q = q.lower().strip()

    # Remove punctuations: whatever non-word character is left (hyphens,
    # whitespace, newlines) is turned into a space.
    q = re.sub(r'\W', ' ', q).strip()
    return q

In [10]:
# Clean the combined text in place (the 'text' column is overwritten with the preprocessed version).
df['text'] = df['text'].apply(preprocess)

##### Handling the Title column separately: unlike the description text, digits in the title are kept

In [11]:
def title_preprocess(q):
    """Normalise a product title into plain lowercase words, keeping digits.

    Unlike ``preprocess``, digits and underscores survive.

    Steps, in the order they are applied:

    1. Missing values become ``''``; other non-strings are stringified.
    2. HTML markup is removed with BeautifulSoup (``lxml`` parser). This has to
       run while ``<`` and ``>`` are still present in the text.
    3. Emoji and pictographs are removed.
    4. Comma-grouped thousands followed by a space (``,000 ``) are abbreviated
       to ``k`` / ``m`` / ``b``. This has to run before commas are stripped.
    5. Remaining punctuation is removed (hyphens are kept until step 7).
    6. The text is lowercased, and digit runs ending in 000 / 000000 /
       000000000 are abbreviated to ``k`` / ``m`` / ``b``.
    7. Any leftover non-word character (hyphens, whitespace, newlines) becomes
       a single space.

    Parameters
    ----------
    q : str
        Raw title. ``None`` / NaN are accepted and yield ``''``.

    Returns
    -------
    str
        Cleaned, lowercased title.
    """
    if not isinstance(q, str):
        q = '' if pd.isna(q) else str(q)

    # Removing HTML tags (must precede punctuation stripping)
    q = BeautifulSoup(q, 'lxml').get_text()

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    q = emoji_pattern.sub('', q)

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    # The comma-based forms run here, while the commas still exist.
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')

    # Drop punctuation but keep word characters (digits included), whitespace
    # and hyphens. Raw string: \w and \s are regex escapes.
    q = re.sub(r"[^\w\s-]", "", q)
    q = q.lower().strip()

    # Plain digit runs (no separators), largest unit first so that
    # '1000000' becomes '1m' rather than '1000k'.
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Remove punctuations: whatever non-word character is left (hyphens,
    # whitespace, newlines) is turned into a space.
    q = re.sub(r'\W', ' ', q).strip()
    return q

In [12]:
# Clean the titles on the working copy; `main` keeps the original titles for display.
df['Title'] = df['Title'].apply(title_preprocess)

In [13]:
def concat_title(x):
    """Prefix a row's combined text with its title.

    ``x`` is a row-like mapping with the keys 'Title' and 'text', both of which
    must be strings. Returns '<Title> <text>' with surrounding whitespace
    stripped.
    """
    pieces = [x[column] + ' ' for column in ('Title', 'text')]
    return ''.join(pieces).strip()

# Row-wise (axis=1): the final 'text' is the cleaned title followed by the cleaned item/description/price text.
df['text'] = df.apply(concat_title, axis=1)

In [14]:
# Display the first row of the working copy, now including 'price_category' and 'text'.
df.head(1)

Unnamed: 0.1,Unnamed: 0,Title,Price,Link,Image URL,Description,item,positive_feedback,critical_feedback,positive_top_words,critical_top_words,price_category,text
0,0,fujifilm instax camera mini 99 premium edition...,18999,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/51Rqiit2b9...,Take a snap and you are ready to create. Colo...,camera,['I have now tested the Instax Mini extensivel...,['the newest of fujis instax range the is a b...,"['great', 'high', 'easy']","['lightdarkness', 'able', 'attached']","Mid-range, Moderate, Median-priced, Standard-p...",fujifilm instax camera mini 99 premium edition...


# **Word2Vec model for semantic search results**

### *Data Preprocessing & word2vec model*

In [15]:
from nltk.corpus import stopwords
# English stop words and an English Snowball stemmer, both used by model_preprocess.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available locally — confirm it is downloaded.
# NOTE(review): stopwords.words() presumably returns a list, so every `word not in stop_words`
# test is a linear scan; consider wrapping it in set() if nothing relies on list behaviour.
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

def model_preprocess(text):
    """Turn a text into the list of stemmed, stop-word-free tokens used by the model.

    The text is lowercased and split with ``word_tokenize``; tokens found in
    ``stop_words`` are dropped and the remaining ones are reduced to their stem
    with the module-level Snowball ``stemmer``.
    """
    lowered = text.lower()
    stems = []
    for token in word_tokenize(lowered):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

# One list of stemmed tokens per product, stored next to the text it came from.
df['tokens'] = df['text'].apply(model_preprocess)

# Train Word2Vec model
# The token lists are passed as a plain Python list of lists (one inner list per product).
all_tokens = df['tokens'].tolist()  # Series -> list of token lists
# 50-dimensional word vectors, context window of 5 tokens; every other setting is the library default.
# NOTE(review): no seed or worker count is set here — confirm whether run-to-run reproducibility matters.
word2vec_items_model = Word2Vec(all_tokens, vector_size=50, window=5)

In [16]:
# Serialise the trained model to 'word2vec_items_model.pkl' in the working directory
# (presumably so it can be reloaded elsewhere without retraining).
# NOTE(review): only unpickle this file from a trusted location — pickle.load can execute arbitrary code.
with open('word2vec_items_model.pkl','wb') as file:
    pickle.dump(word2vec_items_model,file)

### *Generation of document vectors*

The main idea is to represent each product by a single document vector: the average of the Word2Vec vectors of the tokens in that row's tokenized text.

In [17]:
def document_vector(tokens, model):
    """Average the word vectors of the tokens that the model knows.

    Tokens missing from ``model.wv`` are ignored. When none of the tokens is
    known (or ``tokens`` is empty) the result is a zero vector of length
    ``model.vector_size``.
    """
    word_vectors = model.wv
    known_vectors = [word_vectors[token] for token in tokens if token in word_vectors]

    total = np.zeros(model.vector_size)
    for vector in known_vectors:
        total += vector

    if known_vectors:
        total /= len(known_vectors)
    return total

# One averaged vector (a numpy array of length model.vector_size) per product row.
df['vector'] = df['tokens'].apply(lambda x: document_vector(x, word2vec_items_model))

In [18]:
# Write the enriched working copy to 'amazon_product_vectors.csv' in the working directory.
# NOTE(review): the 'tokens' cells are Python lists and the 'vector' cells are numpy arrays; CSV stores
# them as text, so a reader will get strings back — verify the consumer parses them.
# NOTE(review): index=False is not passed, so the index is presumably written as an extra unnamed
# column — confirm that is intended.
df.to_csv('amazon_product_vectors.csv')

### *Function to find top N similar products*

The search query goes through the same two steps as the product text — preprocessing (tokenization, stop-word removal, stemming) and document vectorization — before the similarity comparison.

In [19]:
def recommend_products(search_query, df, model, top_n=10):
    """Return the index labels of the products most similar to a text query.

    The query is tokenised with ``model_preprocess`` and averaged into a
    document vector with ``document_vector``. That vector is compared with
    every row's ``df['vector']`` using cosine similarity.

    Parameters
    ----------
    search_query : str
        Free-text query.
    df : pandas.DataFrame
        Frame with a 'vector' column holding one 1-D vector per row.
        Side effect: a 'similarity' column is added to (or overwritten on)
        this frame with the score of every row for this query.
    model
        Trained embedding model, passed through to ``document_vector``.
    top_n : int, default 10
        Number of best-matching rows to return.

    Returns
    -------
    pandas.Index
        Index labels of the ``top_n`` rows, ordered by descending similarity.
        If none of the query tokens is known to the model, the query vector is
        all zeros and every similarity is 0, so the order carries no meaning.
    """
    search_tokens = model_preprocess(search_query)
    search_vector = document_vector(search_tokens, model)

    if len(df):
        # Stack the per-row vectors into one (n_rows, dim) matrix so that a
        # single cosine_similarity call scores every product, instead of one
        # call (with its input validation overhead) per row.
        doc_matrix = np.vstack(df['vector'].tolist())
        query_matrix = np.asarray(search_vector).reshape(1, -1)
        scores = cosine_similarity(query_matrix, doc_matrix)[0]
    else:
        # np.vstack rejects an empty sequence; an empty frame has no scores.
        scores = np.empty(0, dtype=float)

    df['similarity'] = scores
    top_products = df.sort_values(by='similarity', ascending=False).head(top_n).index
    return top_products

In [20]:
# Example query: rank the products, then show the matching rows from the raw `main` frame.
search_query = "i want a cheap laptop that preferably supports ryzen"
top_product_index = recommend_products(search_query, df, word2vec_items_model)
top = main.loc[top_product_index]
top

Unnamed: 0.1,Unnamed: 0,Title,Price,Link,Image URL,Description,item,positive_feedback,critical_feedback,positive_top_words,critical_top_words
1290,1306,Acer Aspire Lite 12th Gen Intel Core i3-1215U ...,32999,https://www.amazon.in/Acer-i3-1215U-Windows-Gr...,https://m.media-amazon.com/images/I/51KL3aOZ0t...,Processor: Great performance meets long batter...,laptop,['the laptop is very good and easy to use it i...,['hello i have been using this laptop for day...,"['laptop', 'good', 'fine']","['laptop', 'good', 'capable']"
1303,1319,Acer One 14 AMD Ryzen 3 3250U Processor (8GB R...,26490,https://www.amazon.in/Acer-Processor-Graphics-...,https://m.media-amazon.com/images/I/81Xvvl-36a...,AMD Dual Core CPU : AMD Ryzen 3 3250U Processo...,laptop,['best affortable laptop under tight budget so...,['the laptop is good and functions well as exp...,"['good', 'laptop', 'new']","['purchase', 'bought', 'davinci']"
1265,1281,"ZEBRONICS PRO Series Y NBC 2S, Intel Core 11th...",28490,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/71s2KUCAUs...,Never miss the opportunity to grab your afford...,laptop,['laptop is very light processor is good accor...,['i could not agree more that the product is r...,"['good', 'laptop', 'overall']","['laptop', 'poor', 'discharge']"
1305,1321,"ZEBRONICS PRO Series Y NBC 2S, Intel Core 11th...",28490,https://www.amazon.in/ZEBRONICS-2S-Core-11th-1...,https://m.media-amazon.com/images/I/713u+ON+ii...,Never miss the opportunity to grab your afford...,laptop,['laptop is very light processor is good accor...,['i could not agree more that the product is r...,"['good', 'laptop', 'overall']","['laptop', 'poor', 'discharge']"
1239,1255,Acer Aspire Lite AMD Ryzen 5 5500U Premium Met...,34990,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/61wKPELn55...,Processor : AMD Ryzen 5 5500U Hexa-Core Mobile...,laptop,"['its good to use', 'superior look and light w...",['i purchased this mainly because of the suppo...,"['good', 'light', 'overall']","['good', 'worst', 'laptop']"
1230,1246,"ASUS Vivobook 16, Intel®Core™ i3-1215U 12th Ge...",36990,https://www.amazon.in/ASUS-Vivobook-Intel%C2%A...,https://m.media-amazon.com/images/I/71p1GQQCCW...,Processor: IntelCore i3-1215U Processor 1.2 GH...,laptop,['i have recently bought the asus vivobook an...,['this is my rd asus laptop purchase this is g...,"['good', 'laptop', 'normal']","['laptop', 'good', 'low']"
1286,1302,Lenovo IdeaPad Slim 3 Intel Core i3 12th Gen 1...,35700,https://www.amazon.in/Lenovo-IdeaPad-39-62cm-W...,https://m.media-amazon.com/images/I/81+SKGgJ9y...,Processor: 12th Gen Intel Core i3-1215U | Spee...,laptop,['this laptop has really a good performance ju...,[],"['good', 'nice', 'wonderful']",['No Review']
1301,1317,"ASUS Vivobook 14 Thin and Light Laptop, IntelC...",37990,https://www.amazon.in/ASUS-Vivobook-IntelCore-...,https://m.media-amazon.com/images/I/71qTbS09tj...,Processor: IntelCore i3-1215U Processor 1.2 GH...,laptop,['i have recently bought the asus vivobook an...,['this is my rd asus laptop purchase this is g...,"['good', 'laptop', 'normal']","['laptop', 'good', 'low']"
1178,1194,Acer Aspire Lite AMD Ryzen 5 5500U Premium Thi...,36990,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/71czGb00k5...,Processor : AMD Ryzen 5 5500U Hexa-Core Mobile...,laptop,"['its good to use', 'superior look and light w...",['i purchased this mainly because of the suppo...,"['good', 'light', 'overall']","['good', 'worst', 'laptop']"
1224,1240,"ASUS Vivobook 14, Intel Core i3-1115G4 11th Ge...",30990,https://www.amazon.in/ASUS-Vivobook-i3-1115G4-...,https://m.media-amazon.com/images/I/71mksp9a0L...,Processor: Intel Core i3-1115G4 Processor Lapt...,laptop,['laptop is worth the value what i spent perfo...,['worst products to buynew laptop not working ...,"['good', 'cheaper', 'speedbattery']","['worst', 'laptop']"


In [21]:
# Product link of the best-ranked result (first row of `top`).
top['Link'].iloc[0]

'https://www.amazon.in/Acer-i3-1215U-Windows-Graphics-AL15-52/dp/B0CY1Z82QW/ref=sr_1_109?crid=207T28GKBK0UH&dib=eyJ2IjoiMSJ9.qi3xyi-beEoqXdxFfvPZV1H0R-C58iRhaJddSPuFlLp6d2kF65De2N5xEIXffAgG59UUm_JWlufIWT8cAM98PDWxk0a3NLkc1qYjXoHdVvwKtEtDH-LxbxmODyTxvjy_Q10n5rhAk0QSX5qPO5bbIAZoeg82dTGY2ZERWPQ49kdogG2GSXfVw3BV-hBsHOpmISOqMOTXmIxQElIuDmMJd4NPmGhQs7TdQrcRkRts7z4.Gb2ua1auOmOmd1d2sS1Ob3kaUUeWgPLSpcDsIZRhwYk&dib_tag=se&keywords=laptops&qid=1719164998&sprefix=laptops%2Caps%2C266&sr=8-109'