## Dependencies & libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from scipy.sparse import hstack
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load Data

In [2]:
directory_path = "D:\\Programming\\Graduation Project\\Search-Engine-And-Recommendation-system-on-Amazon-Product-main\\"

In [3]:
mobiles_path = directory_path + "MobilesFinal.csv"
laptops_path = directory_path + "LaptopsFinal.csv"
accessories_path = directory_path + "Accessories_Final.csv"

In [4]:
mobiles = pd.read_csv(mobiles_path)
laptops = pd.read_csv(laptops_path)
accessories = pd.read_csv(accessories_path)

## Preprocess Data

#### Laptops

In [5]:
# removes spaces between words in display-size, ram, hdd
selected_columns = ['ram', 'HDD']
laptops[selected_columns] = laptops[selected_columns].applymap(lambda x: x.strip().replace(' ', '') if isinstance(x, str) else x)

In [6]:
laptops['features'] =  laptops['ram'] + ' ' + laptops['HDD'] + ' ' + laptops['SSD'] + ' ' + laptops['processor'] + ' ' + laptops['graphics-card']


In [7]:
laptops['category'] = 'laptop'

In [8]:
laptops = laptops[['name', 'brand', 'category', 'features']]

In [9]:
laptops.head(5)

Unnamed: 0,name,brand,category,features
0,"Lenovo Legion 7 82n600Q3ED Ryzen 9 5900HX, 32G...",Lenovo,laptop,32GP 1TB NONE AMD NVIDIA® GeForce RTX
1,"Hp Pavilion Laptop, Intel® Core™ i5-1135G7 , 8...",Hp,laptop,8GP 512GB NONE intel Core i5 NVIDIA® GeForce MX
2,HP ProBook 450 G9 Intel® Core™ i7-1255U - 8G -...,HP,laptop,NONE 512GB NONE intel Core i7 NVIDIA® GeForce MX
3,Asus Zenbook UX5304VA-OLED517W Intel® Core™i7-...,Asus,laptop,16GP 512GB NONE NONE Internal Intel card
4,"Asus ROG Ally Rayzen Z1, 16GB, 512GB, AMD Rad...",Asus,laptop,16GP 512GB NONE NONE AMD Radeon™


#### Mobiles

In [10]:
mobiles['features'] =  mobiles['ram'] + ' ' + mobiles['internal-memory'] + ' ' + mobiles['processor'] + ' ' + mobiles['prim-cam'] + ' ' + mobiles['second-cam'] 
mobiles['category'] = 'mobile'
mobiles = mobiles[['name', 'brand', 'category', 'features']]

In [11]:
mobiles.head(5)

Unnamed: 0,name,brand,category,features
0,Samsung Galaxy S23 Ultra - 12GB RAM - 256GB,Samsung,mobile,12 GB 256 GB Octa-core (1x3.36 GHz Cortex-X3 &...
1,Vivo Y35 - 8GB RAM - 128GB,Vivo,mobile,8 GB 128 GB Octa-core (4x2.4 GHz Kryo 265 Gold...
2,Nokia C10 - 2GB RAM - 32GB,Nokia,mobile,2 GB 32 GB Quad-core 1.3 GHz Cortex-A7 5 MP 5 MP
3,vivo Y73 - 8GB RAM - 128GB,Vivo,mobile,8 GB 128 GB Octa-core (2x2.05 GHz Cortex-A76 &...
4,Apple iPhone 13 - 128GB - Face ID (12 Month Wa...,Apple,mobile,4 GB 128 GB Hexa-core (2x3.22 GHz Avalanche + ...


#### Accessories

In [12]:
accessories = accessories.rename(columns={'type': 'category'})
accessories = accessories[['name', 'brand', 'category', 'features']]

In [13]:
accessories.head(5)

Unnamed: 0,name,brand,category,features
0,Honor X5 Bluetooth Earbuds - White,Honor,Earphones,"\n- Press control\n- Proximity discovery, dual..."
1,"Oraimo Wireless Earphones, Black - OEB-E03D",Oraimo,Earphones,NONE
2,Nothing Ear 2 Wireless Earphones - Black,Nothing,Earphones,-Built-in 13mm drivers dilvers dynamic sound
3,"JBL Tour Pro Plus Wireless Earphones, Black",JBL,Earphones,\n- Driver size: 6.8mm \n- IPX5 waterproof\n- ...
4,Xiaomi Redmi Buds 4 Active Wireless Earphones ...,Xiaomi,Earphones,NONE


#### Merge laptops, mobiles and accessories to one data frame

In [14]:
combined_data = pd.concat([laptops, mobiles, accessories], ignore_index=True)

## Model Implementation

#### Define a function that cleans, tokenizes and stems given text using the NLTK library.

In [15]:
def clean_text(text):
    if pd.isnull(text):
        return ''
    # Convert numbers to string
    text = re.sub(r'\d+', lambda x: str(x.group()), str(text))
    text = str(text).lower()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = ' '.join([PorterStemmer().stem(word) for word in text.split() if word not in set(stopwords.words('english'))])
    return text

#### Apply function on combined_data dataset.

In [16]:
combined_data['name'] = combined_data['name'].apply(clean_text)
combined_data['brand'] = combined_data['brand'].apply(clean_text)
combined_data['category'] = combined_data['category'].apply(clean_text)
combined_data['features'] = combined_data['features'].apply(clean_text)

#### Define a function that converts textual information about product attributes into numerical representations

In [17]:
def vectorize_dataset(combined_data):
    # TF-IDF vectorization for name
    name_vectorizer = TfidfVectorizer()
    name_matrix = name_vectorizer.fit_transform(combined_data['name'].astype(str))

    # TF-IDF vectorization for brand
    brand_vectorizer = TfidfVectorizer()
    brand_matrix = brand_vectorizer.fit_transform(combined_data['brand'].astype(str))

    # TF-IDF vectorization for category
    category_vectorizer = TfidfVectorizer()
    category_matrix = category_vectorizer.fit_transform(combined_data['category'].astype(str))

    # TF-IDF vectorization for features
    features_vectorizer = TfidfVectorizer()
    features_matrix = features_vectorizer.fit_transform(combined_data['features'].astype(str))

    return name_vectorizer, brand_vectorizer, category_vectorizer, features_vectorizer, name_matrix, brand_matrix, category_matrix, features_matrix

#### Define a function that takes a query string and returns a DataFrame with the top 10 most relevant products based on the cosine similarity between the query string and the comined_data dataset.

In [18]:
def search_product(query, quantity):
    name_vectorizer, brand_vectorizer, category_vectorizer, features_vectorizer, name_matrix, brand_matrix, category_matrix, features_matrix = vectorizers

    name_weight, brand_weight, category_weight, features_weight = weights
    query = clean_text(query)
    # Vectorize the query
    query_vector = hstack([
        name_weight * name_vectorizer.transform([query]),
        brand_weight * brand_vectorizer.transform([query]),
        category_weight * category_vectorizer.transform([query]),
        features_weight * features_vectorizer.transform([query])
    ]).tocsr()  

    # Order matrices based on weights
    weighted_matrix = hstack([
        name_weight * name_matrix,
        brand_weight * brand_matrix,
        category_weight * category_matrix,
        features_weight * features_matrix
    ]).tocsr()  

    # Calculate cosine similarity
    cosine_similarities = cosine_similarity(query_vector, weighted_matrix).flatten()
    combined_data['similarity'] = cosine_similarities
    # Get indices of top 10 most similar products
    top_indices = cosine_similarities.argsort()[-quantity:][::-1]
    # Extract and return the top 10 products
    top_products = combined_data.iloc[top_indices]
    return top_products

#### Vectorize dataset & adjust weight parameters

In [19]:
# Vectorize the dataset
vectorizers = vectorize_dataset(combined_data)

all_queries = [
    "gaming laptop with NVIDIA RTX graphics card and 16GB RAM and AMD Ryzen 7",
    "ultrabook with Intel Core i5 processor and 512GB SSD",
    "dell laptop with 8GB RAM and 256GB SSD",
    "apple with 48MP primary camera and Snapdragon 888 processor",
    "5G mobile with 256GB internal memory and samsung",
    "soundcore earbuds",
    "m711 cobra"
]

best_weights = None
best_similarity = 0

# Define a range of weights to try
name_weights_to_try = np.arange(1, 5, 0.5)
brand_weights_to_try = np.arange(1, 5, 0.5)
category_weights_to_try = np.arange(1, 5, 0.5)
features_weights_to_try = np.arange(1, 5, 0.5)

for name_weight in name_weights_to_try:
    for brand_weight in brand_weights_to_try:
        for category_weight in category_weights_to_try:
            for features_weight in features_weights_to_try:
                # Set the current weights
                weights = (name_weight, brand_weight, category_weight, features_weight)
                
                avg_similarity = 0
                
                for query in all_queries:
                    # Search for products using the current weights
                    results = search_product(query,20)

                    # Calculate the average cosine similarity of the top 10 results for query i
                    avg_similarity += results['similarity'].mean()
                    
                #average over all queries    
                avg_similarity = avg_similarity / len(all_queries) 
                # Update best weights if the current combination has a higher similarity
                print("weights are: ", weights)
                if avg_similarity > best_similarity:
                    print(weights, ", similarity is", avg_similarity)
                    best_similarity = avg_similarity
                    best_weights = weights

# Print the best weights and similarity
print("Best Weights:", best_weights)
print("Best Cosine Similarity:", best_similarity)

weights = best_weights


weights are:  (1.0, 1.0, 1.0, 1.0)
(1.0, 1.0, 1.0, 1.0) , similarity is 0.3800643348658922
weights are:  (1.0, 1.0, 1.0, 1.5)
weights are:  (1.0, 1.0, 1.0, 2.0)
weights are:  (1.0, 1.0, 1.0, 2.5)
weights are:  (1.0, 1.0, 1.0, 3.0)
weights are:  (1.0, 1.0, 1.0, 3.5)
weights are:  (1.0, 1.0, 1.0, 4.0)
weights are:  (1.0, 1.0, 1.0, 4.5)
weights are:  (1.0, 1.0, 1.5, 1.0)
(1.0, 1.0, 1.5, 1.0) , similarity is 0.43287377600456045
weights are:  (1.0, 1.0, 1.5, 1.5)
weights are:  (1.0, 1.0, 1.5, 2.0)
weights are:  (1.0, 1.0, 1.5, 2.5)
weights are:  (1.0, 1.0, 1.5, 3.0)
weights are:  (1.0, 1.0, 1.5, 3.5)
weights are:  (1.0, 1.0, 1.5, 4.0)
weights are:  (1.0, 1.0, 1.5, 4.5)
weights are:  (1.0, 1.0, 2.0, 1.0)
(1.0, 1.0, 2.0, 1.0) , similarity is 0.47547191368963787
weights are:  (1.0, 1.0, 2.0, 1.5)
weights are:  (1.0, 1.0, 2.0, 2.0)
weights are:  (1.0, 1.0, 2.0, 2.5)
weights are:  (1.0, 1.0, 2.0, 3.0)
weights are:  (1.0, 1.0, 2.0, 3.5)
weights are:  (1.0, 1.0, 2.0, 4.0)
weights are:  (1.0, 1.0, 

## Test Model

In [20]:
# Assuming you have a search query
query = "mobile with primary camera 48MP and front camera 12MP"
# Search for products
search_product(query,10)

Unnamed: 0,name,brand,category,features,similarity
3626,oppo a94 5g 8gb ram 128 extend storag sim 48mp...,oppo,mobil,8gb 128gb none 48mp none,0.894703
3603,oppo a74 5g 6gb ram 128gb expand memori sim fr...,oppo,mobil,6gb 128gb none 48mp none,0.893984
3949,redmi 10 2022 dual sim sea blue 4gb ram 64gb g...,xiaomi,mobil,4 gb 64 gb octa core 50 mp primari camera seco...,0.891915
4525,10 dual sim white clash white 8gb ram 256gb ra...,realm,mobil,8 gb 256 gb octa core 48 mp primari camera sec...,0.891424
3727,nokia g60 5g smartphon 6 58 hd 120hz display 5...,nokia,mobil,6gb 128gb none 50mp none,0.889095
3728,nokia c31 4g smartphon 6 74 hd display 13 2 2 ...,nokia,mobil,4gb 64gb none 5mp none,0.88623
3643,ace fe4 2 8 camera black,ace,mobil,2gb 32gb none none none,0.884061
3566,dooge v20 2022 5g rug smartphon 6 43 amol fhd ...,dooge,mobil,8gb 256gb none 64mp none,0.882792
3579,ace fe2 1 77 camera black,ace,mobil,2gb 32gb none none none,0.882385
3539,ace fe2 1 77 camera blue,ace,mobil,32gb none none none none,0.882226


## Run model to get reccomendations on search history

In [21]:
import math
top_products = 20
percentages = [0.5, 0.25, 0.15, 0.05, 0.05]
all_queries = [
    "dell laptop 512GB ssd and wtih 8gb ram",
    "oppo reno 5",
    "mobile with primary camera 48MP and front camera 12MP",
    "apple watch",
    "soundcore earbuds"
]

res = combined_data.copy()
res = res.iloc[0:0]

for i in range(len(all_queries)):
    #print(all_queries[i] ,math.floor(percentages[i]*top_products))
    results = search_product(all_queries[i], math.floor(percentages[i]*top_products))
    res = pd.concat([res, results], ignore_index=True)
res

Unnamed: 0,name,brand,category,features,similarity
0,dell g15 5511 intel core i5 11260h 8gb 512gb s...,dell,laptop,,0.949603
1,dell latitud 3520 intel core i5 1135g7 8gb 256...,dell,laptop,,0.946624
2,dell vostro 3520 intel core i7 1255u 8gb 512 s...,dell,laptop,,0.946064
3,dell latitud 3420 intel core i5 1135g7 8gb 256...,dell,laptop,,0.945822
4,dell vostro 3520 intel core i7 11255u 8gb 512 ...,dell,laptop,,0.945532
5,dell vostro 3510 intel core i3 1115g4 4gb 256g...,dell,laptop,,0.943357
6,dell latitud 3540 intel core i5 1335u 8 gb 256...,dell,laptop,,0.942409
7,dell latitud 5540 intel core i7 1355u 8 gb 512...,dell,laptop,,0.94236
8,dell latitud 3540 intel core i7 1355u 8 gb 512...,dell,laptop,,0.942209
9,dell vostro 3500 laptop intel core i7 11th gen...,dell,laptop,8gb 512gb none intel nvidia dedic,0.942208
