In [3]:
import numpy as np
import pandas as pd
import re
import string

import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
df = pd.read_csv("datasets/newdf.csv")

df.head()

Unnamed: 0,id,product_category_id,name,description,brand,Model
0,1,1,HUAWEI,"Screen Size\t:\t7.12"" (1080 x 2244) Camera\t:\...",HUAWEI,HUAWEI ENJOY MAX
1,2,10,Duranta,This bicycle has a strong steel frame. Its bra...,Duranta,Duranta Super Sports Bicycle
2,19,20,Cadillac,"Basic: 4 Years / 50000 Miles, Corrosion: 4 Yea...",Cadillac,Cadillac Escalade ESV
3,20,20,Aston Martin,"3 Years / Unlimited Miles Basic, 10 Years / Un...",Aston Martin,Aston Martin DB11 V8
4,21,19,Nike,you can buy the Gym Bag Traveling Bag NIKE Onl...,Nike,Gym bag


In [5]:
df.product_category_id.info()

<class 'pandas.core.series.Series'>
RangeIndex: 153 entries, 0 to 152
Series name: product_category_id
Non-Null Count  Dtype
--------------  -----
153 non-null    int64
dtypes: int64(1)
memory usage: 1.3 KB


In [6]:
dic_category = {1: 'phone', 2: 'cosmetics', 3: 'computer accessories', 4: 'educational',
               5: 'jewelry', 6: 'wallet', 8: 'toys', 9: 'light', 10: 'cycle', 11: 'cloths',
               12: 'laptop', 13: 'watch', 14: 'chair', 15: 'television', 16:'fan', 17: 'tools',
               18: 'musical instrument', 19: 'bag', 20: 'car', 21:'house items', 22:'electronics'}

dic_category

{1: 'phone',
 2: 'cosmetics',
 3: 'computer accessories',
 4: 'educational',
 5: 'jewelry',
 6: 'wallet',
 8: 'toys',
 9: 'light',
 10: 'cycle',
 11: 'cloths',
 12: 'laptop',
 13: 'watch',
 14: 'chair',
 15: 'television',
 16: 'fan',
 17: 'tools',
 18: 'musical instrument',
 19: 'bag',
 20: 'car',
 21: 'house items',
 22: 'electronics'}

In [7]:
dic_category.values()

dict_values(['phone', 'cosmetics', 'computer accessories', 'educational', 'jewelry', 'wallet', 'toys', 'light', 'cycle', 'cloths', 'laptop', 'watch', 'chair', 'television', 'fan', 'tools', 'musical instrument', 'bag', 'car', 'house items', 'electronics'])

In [8]:
# Add category column

df['categoryName'] = df['product_category_id'].map(dic_category)

df.head()

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
0,1,1,HUAWEI,"Screen Size\t:\t7.12"" (1080 x 2244) Camera\t:\...",HUAWEI,HUAWEI ENJOY MAX,phone
1,2,10,Duranta,This bicycle has a strong steel frame. Its bra...,Duranta,Duranta Super Sports Bicycle,cycle
2,19,20,Cadillac,"Basic: 4 Years / 50000 Miles, Corrosion: 4 Yea...",Cadillac,Cadillac Escalade ESV,car
3,20,20,Aston Martin,"3 Years / Unlimited Miles Basic, 10 Years / Un...",Aston Martin,Aston Martin DB11 V8,car
4,21,19,Nike,you can buy the Gym Bag Traveling Bag NIKE Onl...,Nike,Gym bag,bag


## EDA

In [9]:
# Check null values

df.isnull().sum()

id                     0
product_category_id    0
name                   0
description            0
brand                  0
Model                  0
categoryName           0
dtype: int64

In [10]:
# Check duplicate value

df.duplicated().sum()

0

In [11]:
# Shape
df.shape

(153, 7)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   153 non-null    int64 
 1   product_category_id  153 non-null    int64 
 2   name                 153 non-null    object
 3   description          153 non-null    object
 4   brand                153 non-null    object
 5   Model                153 non-null    object
 6   categoryName         153 non-null    object
dtypes: int64(2), object(5)
memory usage: 8.5+ KB


In [13]:
type(df.loc[0, 'description'])

str

In [14]:
print(df.product_category_id.value_counts(ascending=True))

print("the lenght: ", len(df.product_category_id.value_counts(ascending=True)))

2      2
14     3
16     3
10     4
21     5
19     5
12     5
4      5
9      5
18     6
15     7
13     9
8     10
6     10
20    10
1     10
5     12
22    13
11    14
3     15
Name: product_category_id, dtype: int64
the lenght:  20


In [15]:
# The values count in categories name column

df.categoryName.value_counts(ascending=True)

cosmetics                2
chair                    3
fan                      3
cycle                    4
house items              5
bag                      5
laptop                   5
educational              5
light                    5
musical instrument       6
television               7
watch                    9
toys                    10
wallet                  10
car                     10
phone                   10
jewelry                 12
electronics             13
cloths                  14
computer accessories    15
Name: categoryName, dtype: int64

In [16]:
# Created a groupby objects

all_group = df.groupby(by='categoryName')

In [17]:
# tolist() function make an array into list

all_category_name = df.categoryName.unique().tolist()
print(all_category_name)
print("the length: ", len(all_category_name))

['phone', 'cycle', 'car', 'bag', 'cosmetics', 'cloths', 'television', 'fan', 'computer accessories', 'musical instrument', 'watch', 'light', 'toys', 'educational', 'jewelry', 'wallet', 'laptop', 'house items', 'electronics', 'chair']
the length:  20


In [18]:
# for productName,data in all_group:
#     print(productName)
#     print(data)

In [19]:
all_group.get_group(name="computer accessories")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
40,73,3,Asus PC,Processor Type : Intel Corei5-7400 Processor S...,Asus,Asus A4321UKH Core i5 7th Gen All-In-One PC,computer accessories
41,74,3,Dell PC,Desktop Type : Gaming PC Processor Type : Inte...,Dell,Dell Vostro 3668 Core i5 1TB HDD,computer accessories
42,75,3,HP PC,Processor Type : Intel Core i3-8100T 8th Gener...,HP,HP ProOne 400 G4,computer accessories
43,76,3,Apple iMac PC,Processor\tQuad Core Intel Core i5 Processor c...,Apple,Apple iMac,computer accessories
44,77,3,walton Desktop PC,WDPC740010 - Intel B250 Express Chipset - Inte...,walton,walton Desktop PC WDPC740010 Full specifications,computer accessories
136,170,3,Standard Backup Online UPS,Capacity: 3000VA Ranged Voltage: 220/230/240Va...,MaxGreen,MGO-W3KS 3KVA,computer accessories
140,174,3,Microsoft Windows 10 Professional DVD,Processor: 1GHz or faster RAM: 2 GB for 64-bit...,Microsoft,Eng INTL 1PK DSP,computer accessories
144,178,3,Intel Core i3-2120 2nd Gen Processor,Clock Speed: 3.30 GHz Cache: 3 MB Intel Smart ...,intel,Core i3,computer accessories
145,179,3,CPU Liquid Cooler,High-Class Radiator Fans for Better Cooling Pe...,ZADAK SPARK,Lite ARGB 240mn,computer accessories
146,180,3,Non Modular Power Supply,Guaranteed 650W Continuous Power 80 PLUS certi...,Antec,VP650 Plus 650W,computer accessories


In [20]:
all_group.get_group(name="cloths")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
9,26,11,Hijab,Product color may slightly vary due to photogr...,FASHION CAPITAL,Hijab 3 Pieces Combo Pack PS06.,cloths
100,133,11,Semi Fitted Panjabi,Comfortable to ware,Le Reve,Semi Fitted Panjabi,cloths
101,135,11,children dress,Magenta & White Floral A-Line Cotton Dress wit...,pspeaches,pspeaches,cloths
102,136,11,SILK READYMADE LEHENGA,Lehenga Choli Fabric Satin With Ambroidery Wor...,White Button,LEHENGA CHOLI FOR KIDS,cloths
103,137,11,TRADITIONAL DESIGNER SILK SAREE,Real Traditional silk saree with unistice blouse,Mirraw,WOVEN TRADITIONAL DESIGNER SILK SAREE,cloths
104,138,11,MAITHILI SILK SAREE,BLUE SOFT MAITHILI SILK SAREE WITH BLOUSE PIECE,Mirraw,BLUE SOFT MAITHILI SILK SAREE WITH BLOUSE PIECE,cloths
105,139,11,HANDWOVEN SILK SAREES,MAHATI GREEN HANDWOVEN SILK SAREES WITH BLOUSE,Mirraw,MAHATI GREEN HANDWOVEN SILK SAREES WITH BLOUSE,cloths
106,140,11,COTTON WOVEN SAREE,BLUE COTTON WOVEN SAREE WITH BLOUSE,Mirraw,BLUE COTTON WOVEN SAREE WITH BLOUSE,cloths
107,141,11,EMBROIDERY LEHENGA,DESIGNER EMBROIDERY LEHENGA CHOLI,Mirraw,DESIGNER EMBROIDERY LEHENGA CHOLI,cloths
108,142,11,FLORAL WEDDING LEHENGA,STUNNING ORGANZA ZARI AND SEQUINS WORK FLORAL...,Mirraw,STUNNING ORGANZA ZARI AND SEQUINS WORK FLORAL ...,cloths


In [21]:
all_group.get_group(name="jewelry")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
80,113,5,2pcs-bangles-100-101,Karat:\t21k,2pcs-bangles-100-101,2pcs-bangles-100-101,jewelry
81,114,5,nosepin,8 white diamond and 1 green diamond,Alamin,nosepin,jewelry
82,115,5,ring,Karat:\t22k,Alamin,ring,jewelry
83,116,5,earring,Karat:\t22k,Alamin,earring,jewelry
84,117,5,Brislet,Karat:\t21k,Alamin,Brislet,jewelry
85,118,5,shitahar,Karat:\t21k,Alamin,shitahar,jewelry
86,119,5,silaring,Karat:\t22k,Alamin,silaring,jewelry
87,120,5,tikli,Karat:\t21k,Alamin,tikli,jewelry
88,121,5,kanthohar,Karat:\t21k,Alamin,kanthohar,jewelry
89,122,5,necklace,Karat:\t21k,Alamin,necklace,jewelry


In [22]:
all_group.get_group(name="phone")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
0,1,1,HUAWEI,"Screen Size\t:\t7.12"" (1080 x 2244) Camera\t:\...",HUAWEI,HUAWEI ENJOY MAX,phone
16,34,1,Motorola Moto,Battery\t Type and Capacity\tLithium-polymer 5...,Motorola Moto,Motorola Moto G30,phone
19,37,1,Redmi,"OS\tAndroid 10, MIUI 12 Chipset\tQualcomm SM71...",Xiaomi,Xiaomi Redmi Note 9 Pro Max,phone
21,44,1,Realme,Display\t6.62 inches\t Primary Camera\t64 MP (...,Realme,Realme GT Neo 2,phone
23,47,1,Vivo,microSDXC (dedicated slot) Internal\t128GB 8GB...,Vivo,vivo V20,phone
24,49,1,Nokia,"4 MB RAM, 4 MB Internal storage, Single SIM",Nokia,Nokia 105,phone
26,54,1,iPhone,"Face ID, accelerometer, gyro, proximity, compa...",Apple,Apple iPhone 12 Pro Max,phone
28,57,1,Oppo,"Li-Po 4015 mAh, non-removable Charging\tFast c...",Oppo,Oppo F17 Pro,phone
29,59,1,OnePlus,"Single SIM (Nano-SIM) or Dual SIM (Nano-SIM, d...",OnePlus,OnePlus 9 Pro,phone
31,64,1,Samsung,"128GB 8GB RAM , 32-bit/384kHz audio",Samsung,Samsung Galaxy S20,phone


In [23]:
all_group.get_group(name="car")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
2,19,20,Cadillac,"Basic: 4 Years / 50000 Miles, Corrosion: 4 Yea...",Cadillac,Cadillac Escalade ESV,car
3,20,20,Aston Martin,"3 Years / Unlimited Miles Basic, 10 Years / Un...",Aston Martin,Aston Martin DB11 V8,car
17,35,20,Tesla,Tesla All-Wheel Drive has two ultra-responsive,Tesla,Model Y,car
18,36,20,Datsun,Cylinders 3 Cylinders Inline Valves 4 Valves/C...,Datsun,Datsun redi-GO,car
20,43,20,Toyota,Engine\t:4Cylinder/2487cc Transmission(Gear)\t...,TOYOTA,TOYOTA RAV4 HYBRID AWD,car
22,45,20,Black Panther,the Lexus LC 500 is not your ordinary car. It ...,Black Panther,Lexus LC 500,car
25,53,20,Hyundai,Engine\t: 1368 cc Transmission(Gear)\t: 6-Spee...,Hyundai,Hyundai,car
27,55,20,BMW,The compact SUV can be had in two trims: Sport...,BMW,BMW X1,car
30,63,20,Ferrari,"Displacement (Cc)3,902 cc / 238.1 cu inEngine ...",Ferrari,Ferrari 488 GT3 Evo,car
32,65,20,X corolla,Drive TypeFront Wheel DriveGear BoxAutomaticSt...,X Corolla,Toyota Corolla x,car


In [24]:
all_group.get_group(name="wallet")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
90,123,6,Genuine Leather Wallet,Made from 100% Genuine Leather Size : Height:...,Genuine Leather Wallet,RA94C ORAS Genuine Leather Wallet for Men,wallet
91,124,6,Crocodile Texture Genuine Leather,Made from Premium Genuine Leather 1 Year Leath...,Crocodile Texture Genuine Leather,RA87C ORAS Crocodile Texture Genuine Leather W...,wallet
92,125,6,ORAS Genuine Leather Wallet,Made from 100% Genuine Leather,ORAS Genuine Leather Wallet,RA86K ORAS Genuine Leather Wallet for Men,wallet
93,126,6,Jeep Genuine Leather Wallet,100% Genuine Leather Wallet. 1Year Leather War...,Jeep,RA79N Jeep Genuine Leather Wallet for Men,wallet
94,127,6,ORAS Genuine Leather Wallet,100% Genuine Leather Wallet. 1 Year Leather Wa...,ORAS,RA68C ORAS Genuine Leather Wallet for Men,wallet
95,128,6,RA38N ORAS Genuine Leather Wallet for Men,Made from 100% Genuine Leather 1 Year Leather ...,RA38N ORAS Genuine Leather Wallet for Men,RA38N ORAS Genuine Leather Wallet for Men,wallet
96,129,6,Key Wallet,100% Genuine Leather Wallet. Original ORAS Bra...,ORAS,RA99C ORAS Genuine Leather Key Wallet,wallet
97,130,6,Multi-Function Business Travel Wallet,Made from Genuine Leather 1 Year Leather Warranty,ORAS,RA43K ORAS Genuine Leather Multi-Function Busi...,wallet
98,131,6,ORAS Genuine Leather Key Wallet,Made From Genuine Leather 1 Year Leather Warranty,ORAS,ORAS Genuine Leather Key Wallet,wallet
99,132,6,Ladies Purse,Made from Premium Genuine Leather 1 Year Leath...,ORAS,RA69M ORAS Genuine Leather Ladies Purse,wallet


In [25]:
all_group.get_group(name="toys")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
65,98,8,Ride On Push Car,3 In 1 Kids Indoor Outdoor Ride On Push Car St...,Ride On Push Car,Ride On Push Car,toys
66,99,8,kick scooter kids,3 wheels Adjustable Multi-Color Scooter For ki...,kick scooter kids,kick scooter kids,toys
67,100,8,Baby gym rack and game pad combo,Baby gym rack and game pad combo baby gym play...,Baby gym rack and game pad combo,Baby gym rack and game pad combo,toys
68,101,8,Baby Kitchen Set,Baby Kitchen Set with music n Light,Baby Kitchen Set,Baby Kitchen Set,toys
69,102,8,Farlin Baby Magic Ball House,Farlin Baby Magic Ball House (With 100 Balls),Farlin Baby Magic Ball House,Farlin Baby Magic Ball House,toys
70,103,8,Playtime King Slider,Safe for children Strong Build quality,Playtime King Slider,Playtime King Slider,toys
71,104,8,Motor Bike Rocker,Process: Blow molding Material: HDPE Toxicity:...,Motor Bike Rocker,Motor Bike Rocker,toys
72,105,8,Scholar Table With Chair,Type: Educational Process: Blow molding Materi...,Scholar Table With Chair,Scholar Table With Chair,toys
73,106,8,Baby Bouncer For Playing Sleeping & Relxation,Soft mesh support conforms to a newborns baby ...,Baby Bouncer For Playing Sleeping & Relxation,Baby Bouncer For Playing Sleeping & Relxation,toys
74,107,8,Children's Study Table,Children's Study Table Baby Snack Table Storag...,Children's Study Table,Children's Study Table,toys


In [26]:
all_group.get_group(name="watch")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
51,84,13,Skmei watch,Product Types: Smart Watch Brand : SKMEI Model...,Skmei,Skmei 1481 Digital Watch,watch
52,85,13,Haylou Smart Watch,Product Tyeps: Smart Watch. Xiaomi Haylou RT2 ...,Haylou Smart Watch,Haylou Smart Watch LS10 RT2,watch
53,86,13,Xiaomi Mi Band 6 CN,Product Types: Smart Watch Band. Xiaomi Mi Ban...,Xiaomi Mi Band,Xiaomi Mi Band 6 CN,watch
54,87,13,CURREN 8356 Watch,Brand: Curren Watch style: Business Watches ca...,CURREN 8356,CURREN 8356 Luxury Business Quartz,watch
55,88,13,Curren,Style: Luxury Clasp Type: Bracelet Clasp Water...,Curren,Curren 8348 Luxury Brand Fashion Quartz Watch,watch
56,89,13,Ladies Bracelet Type Watch,Ladies Bracelet Type Watch Quartz Movement Sty...,Ladies Bracelet Type Watch,Ladies Bracelet Type Watch,watch
57,90,13,Electronic Watch,Electronicorigin China model Mi 3 touch white ...,Electronic Watch,Electronic Watch Girls and Boys Trend Touch Br...,watch
58,91,13,Michael Kors,Product Type: Watch Stainless Steel Mid Sizes ...,Michael Kors,Michael Kors Womens Watch,watch
59,92,13,Magnet Ladies Watch,Magnet Ladies Watch Boxes & Cases Material: Paper,Magnet Ladies Watch,Magnet Ladies Watch,watch


In [27]:
all_group.get_group(name="television")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
13,31,15,Starex 32” GS,Dynamic Mega Contrast Ratio 32-Inch Display wi...,Starex,Starex 32” GS Smart Android Led Tv Monitor,television
14,32,15,Samsung 32N4010,The dimension of this LED TV is 737.4 mm x 438...,Samsung,"Samsung 32N4010 32"" Basic HD LED Television",television
15,33,15,Haier,Miracast Screen Mirroring,Haier,Haier H32D2M 32 Inch Miracast HD Non-Smart LED...,television
33,66,15,"Mango MGN1 32"" Borderless HD Smart Android LED...",Smart Android LED TV is one of the largest TVs...,Mango,"Mango MGN1 32"" Borderless HD Smart Android LED...",television
34,67,15,"LG 32LK510B 32"" HD LED Television",Dynamic Color & Virtual Surround Sound Immersi...,LG,"LG 32LK510B 32"" HD LED Television",television
35,68,15,Xiaomi TV,"Processor: MSD6683, CPU: CA53 x 4, up to 1.2GH...",Xiaomi Mi,Xiaomi Mi P1 L32M6-6ARG/6AEU 32-Inch Smart And...,television
36,69,15,Sony Bravia TV,"WiFi ,HDMI, RF, FM Youtube and more Built-in W...",Sony Bravia,"Sony Bravia KDL-32W600D 32"" Smart HD LED TV",television


In [28]:
all_group.get_group(name="musical instrument")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
45,78,18,Guitar Stand Wall Mount Hanger Hook,Specifications of Guitar Stand Wall Mount Hang...,Guitar Stand Wall Mount Hanger Hook,Guitar Stand Wall Mount Hanger Hook,musical instrument
46,79,18,Casio keyboard SA 46,32 Mini Keys 8-Note Polyphony 50 Play Along Tr...,Casio,Casio keyboard SA 46,musical instrument
47,80,18,Fernedes Atlas 5x-2008 Bass Guitar,Alder Body/Maple Neck Bolt On Neck 21 Fret fin...,Fernedes Atlas 5x-2008 Bass Guitar,Fernedes Atlas 5x-2008 Bass Guitar,musical instrument
48,81,18,Sennheiser EW135P-G4 wireless Microphone,A broadcast quality sound solution. Providing ...,Sennheiser,Sennheiser EW135P-G4 wireless Microphone,musical instrument
49,82,18,Melody 3.5 Octave Stick Harmoniam,"-3.5 octaves, 42 keys in total the superb qu...",Melody,Melody 3.5 Octave Stick Harmoniam,musical instrument
50,83,18,Melody Tablya Baya,original neem wood tabla,Melody Tablya Baya,Melody Tablya Baya,musical instrument


In [29]:
all_group.get_group(name="light")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
60,93,9,Walton Light,WALTON MODULAR FLOOD LIGHT Model Name\t WLED-...,Walton Light,WLED-MOD-2M100W,light
61,94,9,WLED-RB3WB22 light,Energy efficient & energy saves up to 90% in r...,Walton,WLED-RB3WB22,light
62,95,9,Spot light,Energy efficient & energy saves up to 90% in r...,Walton,WLED-CSLS-5W (5 Watt),light
63,96,9,Torch light,"Powerful CREE T6 LED chip, all Weather Waterpr...",Walton,WLED-T6TORCH-5W,light
64,97,9,LED Light,High Brightness. - Aluminum Body. - Appl...,WLED-DC24V-10W,WLED-DC24V-10W,light


In [30]:
all_group.get_group(name="educational")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
75,108,4,science box fifth grade,"It includes solar cell, generator, windmill mo...",science box fifth grade,science box fifth grade,educational
76,109,4,Captain Curious,Mission Electrocannon Part 1. Electric cannon ...,Captain Curious,Captain Curious,educational
77,110,4,Mojar Periscope,There are two mirrors for making periscope wit...,Mojar Periscope,Mojar Periscope,educational
78,111,4,Smart Kit Focus Challenge,"This kit consists of three metal cables, such ...",Smart Kit Focus Challenge,Smart Kit Focus Challenge,educational
79,112,4,Onnorokom Science Box: Mystery Of Chemistry,"Rubber ball, Food color, Dish Cleaner, glyceri...",Onnorokom Science Box: Mystery Of Chemistry,Onnorokom Science Box: Mystery Of Chemistry,educational


In [31]:
all_group.get_group(name="laptop")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
115,149,12,Chuwi HeroBook Laptop,Processor Intel Celeron Processor N4020 (4M Ca...,Chuwi,Chuwi HeroBook Pro,laptop
116,150,12,AVITA Essential 14 laptop,Processor Intel Celeron Processor N4020 (4M Ca...,AVITA,Essential 14,laptop
117,151,12,Lenovo IdeaPad Slim 3i Laptop,"Processor: Intel Celeron N4020 (4M Cache, 1.10...",Lenovo,IdeaPad Slim 3i,laptop
118,152,12,HP 15s laptop,Processor: Intel Pentium Silver N5030 (4M Cach...,HP,15s-du1116TU,laptop
119,153,12,Dell Inspiron 15 3515 Laptop,"Processor: AMD Ryzen 3 3250U (4MB CPU Cache, 2...",Dell,Inspiron 15 3515,laptop


In [32]:
all_group.get_group(name="house items")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
120,154,21,VSN GD Refrigerator,Cooling effects: Freezer Cabinet Less than -18...,Vision,RE-200L Mirror Lotus-TM,house items
121,155,21,Vision Automatic Washing Machine,"Voltage/Frequency: 220V/50Hz Rated power, Wash...",Vision,STL02,house items
123,157,21,Microwave Oven,Oven Capacity: 25 L Rated Voltage: 230~50Hz Ra...,Vision,G25 Smart,house items
124,158,21,Rice Cooker RC-1.8,Inner Pot: Double pot (One is SS pot and one i...,Vision,L 40-06 SS Classic,house items
127,161,21,Blender VIS-SBL-018 Rapid Power,"220~240V, 50Hz, 650W, Stainless Steel Blender....",Vision,VIS-SBL-018,house items


In [33]:
all_group.get_group(name="bag")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
4,21,19,Nike,you can buy the Gym Bag Traveling Bag NIKE Onl...,Nike,Gym bag,bag
5,22,19,Ladies Backpack,Material: PU Leather 3 in 1 for total set. Siz...,Ladies Backpack,Ladies Backpack and Shoulder Bag TR002,bag
6,23,19,Ladies Backpack,Product color may slightly vary due to photogr...,Ladies Backpack,Ladies Backpack and Shoulder Bag SL002,bag
130,164,19,Laptop Pouch bag,Good Quality Material Compatible with 13 inch...,backpack,Laptop Pouch bag for 13 inch Notebook,bag
131,165,19,Targus Intellect Laptop Backpack,"COMPATIBILITY Up to 15.6"" laptops LITRE CAPACI...",backpack,model s,bag


In [34]:
all_group.get_group(name="cycle")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
1,2,10,Duranta,This bicycle has a strong steel frame. Its bra...,Duranta,Duranta Super Sports Bicycle,cycle
10,27,10,Forever Bicycle,"Mechanical dual disc brake, steel frame, 17-in...",Forever,Forever 2020 Bicycle,cycle
11,28,10,Express,This balance bicycle is made with a strong ste...,Express,Express Baby Pedal Balance Bicycle,cycle
12,30,10,Super 16,The Super 16 is a robust baby-balanced bicycle...,Super 16,Super 16 Heavy Baby Balanced Cycle,cycle


In [35]:
all_group.get_group(name="fan")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
37,70,16,New Mini Fan,Specifications of New GFC Mini Rechargeable US...,GFC,New GFC Mini Rechargeable USB Charging Portabl...,fan
38,71,16,GFC Glamour Ceiling Fan,Superior quality aluminum alloy construction. ...,GFC Glamour,GFC Glamour Ceiling Fan Full specifications,fan
39,72,16,GFC Monet Ceiling Fan,Energy efficient Electrical Steel Sheet and 99...,GFC,GFC Monet Ceiling Fan,fan


In [36]:
all_group.get_group(name="chair")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
141,175,14,Swivel Chair,Made from superior quality chrome plated mild ...,HATIL,Lyndon-127,chair
142,176,14,All Swivel Chair,Made from Chrome base mild steel and Nylon Bas...,HATIL,Pokemon-144,chair
143,177,14,HATIL Swivel Chair,Made from superior quality Nylon Base High qua...,HATIL,Ruby-140,chair


In [37]:
all_group.get_group(name="cosmetics")

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
7,24,2,Lakmé,"Water proof, lasts up to 22 hrs. It is dermato...",Lakme,Lakmé Eyeconic Kajal Deep Black,cosmetics
8,25,2,Lakmé 9 to 5,Provides uv protection. Gives Smooth Coverage....,Lakme,Lakmé Complexion Care Cream,cosmetics


In [38]:
# Check the unique values in name column

unique_value ,num_count = np.unique(ar=df.name, return_counts=True)

In [39]:
len(unique_value)

151

In [40]:
num_count

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [41]:
df.name.value_counts()

ORAS Genuine Leather Wallet    2
Ladies Backpack                2
HUAWEI                         1
HANDWOVEN SILK SAREES          1
Ladies Purse                   1
                              ..
Haylou Smart Watch             1
Xiaomi Mi Band 6 CN            1
CURREN 8356 Watch              1
Curren                         1
Apple Magic Mouse              1
Name: name, Length: 151, dtype: int64

In [42]:
for i, j in zip(unique_value, num_count):
    print(f'Product Name: {i} -----> Total counts: {j}')

Product Name: 2pcs-bangles-100-101 -----> Total counts: 1
Product Name: AFGHANI OXIDISED SILVER JEWELLERY -----> Total counts: 1
Product Name: AVITA Essential 14 laptop -----> Total counts: 1
Product Name: All Swivel Chair -----> Total counts: 1
Product Name: Apple Magic Mouse -----> Total counts: 1
Product Name: Apple iMac PC -----> Total counts: 1
Product Name: Aston Martin -----> Total counts: 1
Product Name: Asus PC -----> Total counts: 1
Product Name: BMW -----> Total counts: 1
Product Name: Baby Bouncer For Playing Sleeping & Relxation -----> Total counts: 1
Product Name: Baby Kitchen Set -----> Total counts: 1
Product Name: Baby gym rack and game pad combo -----> Total counts: 1
Product Name: Black Panther -----> Total counts: 1
Product Name: Blender VIS-SBL-018 Rapid Power -----> Total counts: 1
Product Name: Brislet -----> Total counts: 1
Product Name: COTTON WOVEN SAREE -----> Total counts: 1
Product Name: CPU Liquid Cooler -----> Total counts: 1
Product Name: CURREN 8356 Wat

## Text preprocessing

In [43]:
df.head(5)

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
0,1,1,HUAWEI,"Screen Size\t:\t7.12"" (1080 x 2244) Camera\t:\...",HUAWEI,HUAWEI ENJOY MAX,phone
1,2,10,Duranta,This bicycle has a strong steel frame. Its bra...,Duranta,Duranta Super Sports Bicycle,cycle
2,19,20,Cadillac,"Basic: 4 Years / 50000 Miles, Corrosion: 4 Yea...",Cadillac,Cadillac Escalade ESV,car
3,20,20,Aston Martin,"3 Years / Unlimited Miles Basic, 10 Years / Un...",Aston Martin,Aston Martin DB11 V8,car
4,21,19,Nike,you can buy the Gym Bag Traveling Bag NIKE Onl...,Nike,Gym bag,bag


In [44]:
df.loc[0, 'description']

'Screen Size\t:\t7.12" (1080 x 2244) Camera\t:\t16 + 2 | 8 MP RAM\t:\t4GB Battery\t:\t5000 mAh Operating system\t:\tAndroid Soc\t:\tQualcomm SDM636 Snapdragon 636 Processor\t:\tOcta'

In [45]:
# Make lower case the text

def makeLowerCase(text):
    return text.lower()

In [46]:
df['name'] = df['name'].apply(func=makeLowerCase)
df['description'] = df['description'].apply(func=makeLowerCase)
df['brand'] = df['brand'].apply(func=makeLowerCase)
df['Model'] = df['Model'].apply(func=makeLowerCase)

In [47]:
df.head()

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
0,1,1,huawei,"screen size\t:\t7.12"" (1080 x 2244) camera\t:\...",huawei,huawei enjoy max,phone
1,2,10,duranta,this bicycle has a strong steel frame. its bra...,duranta,duranta super sports bicycle,cycle
2,19,20,cadillac,"basic: 4 years / 50000 miles, corrosion: 4 yea...",cadillac,cadillac escalade esv,car
3,20,20,aston martin,"3 years / unlimited miles basic, 10 years / un...",aston martin,aston martin db11 v8,car
4,21,19,nike,you can buy the gym bag traveling bag nike onl...,nike,gym bag,bag


In [48]:
def simplePreprocessing(text):
    patt = re.compile(pattern=r"[\t:\d\.]")
    a = patt.sub(repl="", string=text)
    b = a.translate(str.maketrans('','',string.punctuation))
    c = " ".join(b.split())
    return c

In [49]:
df['description'] = df['description'].apply(func=simplePreprocessing)

df.head()

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
0,1,1,huawei,screen size x camera mp ramgb battery mah oper...,huawei,huawei enjoy max,phone
1,2,10,duranta,this bicycle has a strong steel frame its brak...,duranta,duranta super sports bicycle,cycle
2,19,20,cadillac,basic years miles corrosion years miles drivet...,cadillac,cadillac escalade esv,car
3,20,20,aston martin,years unlimited miles basic years unlimited mi...,aston martin,aston martin db11 v8,car
4,21,19,nike,you can buy the gym bag traveling bag nike onl...,nike,gym bag,bag


In [50]:
patt = re.compile(pattern=r" (mm|mm x|w mm|mp|x|w|mp) ")
patt2 = re.compile(pattern=r" (ww|x x|f|w|mm|n m|a l v|n) ")
patt3 = re.compile(pattern=r" (q|v|l|hz – khz) ")

mypatt = [patt, patt2, patt3]

def removeSingleAlphabet(text):
    for i in mypatt:
        text = i.sub(repl=" ", string=text)
    return text

In [51]:
df['description'] = df['description'].apply(func=removeSingleAlphabet)

df.head()

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName
0,1,1,huawei,screen size camera ramgb battery mah operating...,huawei,huawei enjoy max,phone
1,2,10,duranta,this bicycle has a strong steel frame its brak...,duranta,duranta super sports bicycle,cycle
2,19,20,cadillac,basic years miles corrosion years miles drivet...,cadillac,cadillac escalade esv,car
3,20,20,aston martin,years unlimited miles basic years unlimited mi...,aston martin,aston martin db11 v8,car
4,21,19,nike,you can buy the gym bag traveling bag nike onl...,nike,gym bag,bag


In [52]:
df['tags'] = df['name'] + df['description'] + df['brand'] + df['Model']

df.head()

Unnamed: 0,id,product_category_id,name,description,brand,Model,categoryName,tags
0,1,1,huawei,screen size camera ramgb battery mah operating...,huawei,huawei enjoy max,phone,huaweiscreen size camera ramgb battery mah ope...
1,2,10,duranta,this bicycle has a strong steel frame its brak...,duranta,duranta super sports bicycle,cycle,durantathis bicycle has a strong steel frame i...
2,19,20,cadillac,basic years miles corrosion years miles drivet...,cadillac,cadillac escalade esv,car,cadillacbasic years miles corrosion years mile...
3,20,20,aston martin,years unlimited miles basic years unlimited mi...,aston martin,aston martin db11 v8,car,aston martinyears unlimited miles basic years ...
4,21,19,nike,you can buy the gym bag traveling bag nike onl...,nike,gym bag,bag,nikeyou can buy the gym bag traveling bag nike...


In [53]:
df2 = df.drop(columns=['description', 'brand', 'Model'])

df2.head()

Unnamed: 0,id,product_category_id,name,categoryName,tags
0,1,1,huawei,phone,huaweiscreen size camera ramgb battery mah ope...
1,2,10,duranta,cycle,durantathis bicycle has a strong steel frame i...
2,19,20,cadillac,car,cadillacbasic years miles corrosion years mile...
3,20,20,aston martin,car,aston martinyears unlimited miles basic years ...
4,21,19,nike,bag,nikeyou can buy the gym bag traveling bag nike...


In [54]:
df2.shape

(153, 5)

In [55]:
# Check null value

df2.isnull().sum().sum()

0

In [56]:
# analysis a single row

df2.loc[1, 'tags']

'durantathis bicycle has a strong steel frame its brake system is very strong which allows you to maintain control in all situations its seat is adjustable so you can change it as needed its front and rear wheels are fitted with mudguards so that the rider does not get mud while riding on the roaddurantaduranta super sports bicycle'

In [57]:
# demo test

print(df2.loc[1, 'tags'].split())

['durantathis', 'bicycle', 'has', 'a', 'strong', 'steel', 'frame', 'its', 'brake', 'system', 'is', 'very', 'strong', 'which', 'allows', 'you', 'to', 'maintain', 'control', 'in', 'all', 'situations', 'its', 'seat', 'is', 'adjustable', 'so', 'you', 'can', 'change', 'it', 'as', 'needed', 'its', 'front', 'and', 'rear', 'wheels', 'are', 'fitted', 'with', 'mudguards', 'so', 'that', 'the', 'rider', 'does', 'not', 'get', 'mud', 'while', 'riding', 'on', 'the', 'roaddurantaduranta', 'super', 'sports', 'bicycle']


In [58]:
print(word_tokenize(text=df2.loc[1, 'tags']))

# demo end

['durantathis', 'bicycle', 'has', 'a', 'strong', 'steel', 'frame', 'its', 'brake', 'system', 'is', 'very', 'strong', 'which', 'allows', 'you', 'to', 'maintain', 'control', 'in', 'all', 'situations', 'its', 'seat', 'is', 'adjustable', 'so', 'you', 'can', 'change', 'it', 'as', 'needed', 'its', 'front', 'and', 'rear', 'wheels', 'are', 'fitted', 'with', 'mudguards', 'so', 'that', 'the', 'rider', 'does', 'not', 'get', 'mud', 'while', 'riding', 'on', 'the', 'roaddurantaduranta', 'super', 'sports', 'bicycle']


In [59]:
df3 = df2.copy()

## Create WordNetLemmatizer object

In [60]:
lemmatizer = WordNetLemmatizer()

## Remove stopwords and lemmatize those words

In [61]:
def textLemmatizer(text):
    words = word_tokenize(text=text)
    words = [lemmatizer.lemmatize(word=word) for word in words if word not in stopwords.words('english')]
    text = ' '.join(words)
    return text

In [62]:
# Demo test

text = """this bicycle has a strong steel frame its brake system is very strong which allows you to maintain control in all situations its seat is adjustable so you can change it as needed its front and rear wheels are fitted with mudguards so that the rider does not get mud while riding on the roaddurantaduranta super sports bicycle"""

result = textLemmatizer(text)

result

# demo test end

'bicycle strong steel frame brake system strong allows maintain control situation seat adjustable change needed front rear wheel fitted mudguard rider get mud riding roaddurantaduranta super sport bicycle'

In [63]:
df2['tags'] = df2['tags'].apply(textLemmatizer)

df2.head()

Unnamed: 0,id,product_category_id,name,categoryName,tags
0,1,1,huawei,phone,huaweiscreen size camera ramgb battery mah ope...
1,2,10,duranta,cycle,durantathis bicycle strong steel frame brake s...
2,19,20,cadillac,car,cadillacbasic year mile corrosion year mile dr...
3,20,20,aston martin,car,aston martinyears unlimited mile basic year un...
4,21,19,nike,bag,nikeyou buy gym bag traveling bag nike online ...


In [64]:
df2.loc[1, 'tags']

'durantathis bicycle strong steel frame brake system strong allows maintain control situation seat adjustable change needed front rear wheel fitted mudguard rider get mud riding roaddurantaduranta super sport bicycle'

## Apply TF-IDF 

In [65]:
tf_idf = TfidfVectorizer(max_features=2000)

In [66]:
tf_idf_vectors = tf_idf.fit_transform(raw_documents=df2['tags']).toarray()

In [67]:
tf_idf.idf_

array([5.34380542, 5.34380542, 5.34380542, ..., 5.34380542, 5.34380542,
       5.34380542])

In [68]:
tf_idf.vocabulary_

{'huaweiscreen': 653,
 'size': 1193,
 'camera': 241,
 'ramgb': 1056,
 'battery': 167,
 'mah': 816,
 'operating': 945,
 'systemandroid': 1280,
 'socqualcomm': 1213,
 'sdm': 1153,
 'snapdragon': 1211,
 'processoroctahuaweihuawei': 1028,
 'enjoy': 463,
 'max': 832,
 'durantathis': 437,
 'bicycle': 184,
 'strong': 1256,
 'steel': 1247,
 'frame': 536,
 'brake': 214,
 'system': 1279,
 'allows': 96,
 'maintain': 819,
 'control': 338,
 'situation': 1191,
 'seat': 1155,
 'adjustable': 82,
 'change': 277,
 'needed': 913,
 'front': 542,
 'rear': 1073,
 'wheel': 1439,
 'fitted': 521,
 'mudguard': 899,
 'rider': 1110,
 'get': 566,
 'mud': 898,
 'riding': 1111,
 'roaddurantaduranta': 1115,
 'super': 1264,
 'sport': 1234,
 'cadillacbasic': 239,
 'year': 1473,
 'mile': 860,
 'corrosion': 353,
 'drivetrain': 426,
 'roadside': 1116,
 'assistance': 122,
 'milescadillaccadillac': 862,
 'escalade': 468,
 'esv': 470,
 'aston': 123,
 'martinyears': 828,
 'unlimited': 1355,
 'basic': 164,
 'warrantyaston': 14

In [69]:
tf_idf.get_feature_names_out()

array(['010', '013', '018', ..., 'zari', 'zigzag', 'zoom'], dtype=object)

In [70]:
tf_idf_vectors[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [71]:
tf_idf_vectors.shape

(153, 1482)

In [72]:
tf_idf_vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Check the similarity between the words

In [73]:
similarity = cosine_similarity(X=tf_idf_vectors)

In [74]:
similarity.shape

(153, 153)

In [75]:
similarity[0]

array([1.        , 0.        , 0.        , 0.        , 0.        ,
       0.05080068, 0.        , 0.        , 0.        , 0.        ,
       0.03643842, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.05569261, 0.        , 0.08197062, 0.10671665,
       0.        , 0.16662862, 0.        , 0.        , 0.        ,
       0.        , 0.05367003, 0.        , 0.05862727, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.06005527, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01432803, 0.03641095, 0.        ,
       0.        , 0.        , 0.        , 0.04259562, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [76]:
similarity[1]

array([0.        , 1.        , 0.        , 0.        , 0.03726122,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.20649522, 0.4637698 , 0.15366469, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04770264, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.01919043, 0.02285002,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.01722134, 0.        ,
       0.        , 0.        , 0.        , 0.01606094, 0.        ,
       0.        , 0.        , 0.        , 0.02369431, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.02143076,
       0.        , 0.02947831, 0.        , 0.        , 0.        ,
       0.07261612, 0.        , 0.        , 0.        , 0.     

In [77]:
similarity[1].shape

(153,)

### Demo test start

In [78]:
df2[df2['name'] == 'huawei']

Unnamed: 0,id,product_category_id,name,categoryName,tags
0,1,1,huawei,phone,huaweiscreen size camera ramgb battery mah ope...


In [79]:
df2[df2['name'] == 'huawei'].index

Int64Index([0], dtype='int64')

In [80]:
df2[df2['name'] == 'huawei'].index[0]

0

In [81]:
list(enumerate(similarity[df2[df2['name'] == 'huawei'].index[0]]))[:20]

[(0, 1.0000000000000004),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.050800679422527105),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.03643841692405252),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.05569260861762558),
 (17, 0.0),
 (18, 0.08197061839649739),
 (19, 0.10671664827707755)]

In [82]:
sorted(list(enumerate(similarity[df2[df2['name'] == 'huawei'].index[0]])), reverse=True, key=lambda x:x[1])[:20]

[(0, 1.0000000000000004),
 (21, 0.166628617914054),
 (19, 0.10671664827707755),
 (18, 0.08197061839649739),
 (43, 0.06005527357458365),
 (28, 0.058627268494138955),
 (16, 0.05569260861762558),
 (26, 0.053670026577402885),
 (147, 0.051205026452749526),
 (5, 0.050800679422527105),
 (119, 0.04827749203706904),
 (152, 0.04290671937378333),
 (58, 0.04259561621622305),
 (115, 0.04057037099651447),
 (10, 0.03643841692405252),
 (53, 0.036410954965271844),
 (76, 0.033753985153467204),
 (139, 0.03296763047696955),
 (90, 0.0317417762108766),
 (111, 0.021129976496250393)]

In [83]:
sorted(list(enumerate(similarity[df2[df2['name'] == 'huawei'].index[0]])), reverse=True, key=lambda x : x[1])[:10]

[(0, 1.0000000000000004),
 (21, 0.166628617914054),
 (19, 0.10671664827707755),
 (18, 0.08197061839649739),
 (43, 0.06005527357458365),
 (28, 0.058627268494138955),
 (16, 0.05569260861762558),
 (26, 0.053670026577402885),
 (147, 0.051205026452749526),
 (5, 0.050800679422527105)]

In [84]:
all_distance = sorted(list(enumerate(similarity[df2[df2['name'] == 'huawei'].index[0]])), reverse=True, key=lambda x : x[1])[:10]

In [85]:
for i in all_distance:
    print(df2.loc[i[0], 'name'])

huawei
realme
redmi
datsun
apple imac pc
oppo
motorola moto
iphone
diamond storm  pro rgb gaming casing
ladies backpack


### Demo Test End

### Recommendar Function

In [86]:
def productRecommender(productName):
    productName = productName.lower()
    product_index = df2[df2['name'] == productName].index[0]
    distances = sorted(list(enumerate(similarity[product_index])), reverse=True, key=lambda x : x[1])
    for i in distances[1:16]:
        print(df2.loc[i[0], 'name'], "---->", df2.loc[i[0], 'categoryName'])

In [87]:
productRecommender('hijab') # cosmetics

ladies backpack ----> bag
spot light ----> light
asus pc ----> computer accessories
wled-rb3wb22 light ----> light
baby gym rack and game pad combo ----> toys
maithili silk saree ----> cloths
starex 32” gs ----> television
apple imac pc ----> computer accessories
gigabit smart managed switch ----> computer accessories
super 16 ----> cycle
mojar periscope ----> educational
led light ----> light
lg 32lk510b 32" hd led television ----> television
onnorokom science box: mystery of chemistry ----> educational
xiaomi mi band 6 cn ----> watch


In [88]:
productRecommender('oneplus') # phone

forever bicycle ----> cycle
dual band gigabit 7 antennas router ----> electronics
diamond storm  pro rgb gaming casing ----> computer accessories
oppo ----> phone
iphone ----> phone
redmi ----> phone
chuwi herobook  laptop ----> laptop
huawei ----> phone
duranta ----> cycle
cadillac ----> car
aston martin ----> car
nike ----> bag
ladies backpack ----> bag
ladies backpack ----> bag
lakmé ----> cosmetics


## Apply porterStemmer

In [90]:
ps = PorterStemmer()

### Remove stopwords and stem those words

In [91]:
def textPorterStemmer(text):
    words = word_tokenize(text=text)
    words = [ps.stem(word=word) for word in words if word not in stopwords.words('english')]
    text = ' '.join(words)
    return text

In [92]:
df3.head()

Unnamed: 0,id,product_category_id,name,categoryName,tags
0,1,1,huawei,phone,huaweiscreen size camera ramgb battery mah ope...
1,2,10,duranta,cycle,durantathis bicycle has a strong steel frame i...
2,19,20,cadillac,car,cadillacbasic years miles corrosion years mile...
3,20,20,aston martin,car,aston martinyears unlimited miles basic years ...
4,21,19,nike,bag,nikeyou can buy the gym bag traveling bag nike...


In [93]:
df3['tags'] = df3['tags'].apply(textPorterStemmer)

In [94]:
df3.loc[1, 'tags']

'durantathi bicycl strong steel frame brake system strong allow maintain control situat seat adjust chang need front rear wheel fit mudguard rider get mud ride roaddurantaduranta super sport bicycl'

## Apply CounterVectorizer

In [95]:
cv = CountVectorizer(max_features=1200)

In [96]:
cv_vectors = cv.fit_transform(raw_documents=df3['tags']).toarray()

In [97]:
cv_vectors.shape

(153, 1200)

In [98]:
cv_vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [99]:
cv_vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [100]:
cv_similarity = cosine_similarity(X=cv_vectors)

In [101]:
cv_similarity.shape

(153, 153)

In [102]:
cv.get_feature_names_out()

array(['010', '013', '018', ..., 'zari', 'zigzag', 'zoom'], dtype=object)

In [103]:
cv_similarity[0]

array([1.        , 0.        , 0.        , 0.        , 0.        ,
       0.08006408, 0.        , 0.        , 0.        , 0.        ,
       0.06019293, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.07715167, 0.        , 0.1132277 , 0.12909944,
       0.        , 0.26111648, 0.        , 0.        , 0.        ,
       0.        , 0.08703883, 0.        , 0.08333333, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.06019293, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02227177, 0.048795  , 0.        ,
       0.        , 0.        , 0.        , 0.06622662, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

### Recommendar Function for PorterStemmer

In [104]:
def productRecommender2(productName):
    try:
        productName = productName.lower()
        product_index = df3[df3['name'] == productName].index[0]
        distances = sorted(list(enumerate(cv_similarity[product_index])), reverse=True, key=lambda x : x[1])
        for i in distances[1:16]:
            productID, productName = df3.loc[i[0], ['id','name']]
            # print(f"Product ID:[{productID}] | Product Name: {productName}")
            print(df3.loc[i[0], 'name'], "---->", df3.loc[i[0], 'categoryName'])
            
            # category_id = int(df3.loc[i[0], 'product_category_id'])
            # print(dic_category[category_id])
            
    except Exception as e:
        print("Opps!", e.__class__,"occurred.")

In [119]:
# Demo test

df3[df3['name'] == "COTTON WOVEN SAREE".lower()]['product_category_id']

106    11
Name: product_category_id, dtype: int64

In [120]:
df3[df3['name'] == "COTTON WOVEN SAREE".lower()]['product_category_id'].values

array([11], dtype=int64)

In [121]:
df3[df3['name'] == "COTTON WOVEN SAREE".lower()]['product_category_id'].values[0]

11

In [105]:
dic_category.get(int(df3[df3['name'] == "COTTON WOVEN SAREE".lower()]['product_category_id'].values[0]))

# Demo test end

'cloths'

In [107]:
df3.head(3)

Unnamed: 0,id,product_category_id,name,categoryName,tags
0,1,1,huawei,phone,huaweiscreen size camera ramgb batteri mah ope...
1,2,10,duranta,cycle,durantathi bicycl strong steel frame brake sys...
2,19,20,cadillac,car,cadillacbas year mile corros year mile drivetr...


In [109]:
# dic_category.get(int(testdf[testdf['name'] == "HUAWEI"]['product_category_id'].values[0]))

In [116]:
def specificProductRecommender2(productName):
    try:
        productName = productName.lower()
        
        product_id = int(df3[df3['name'] == productName]['product_category_id'].values[0])
        product_category_name = dic_category.get(product_id)
        
        product_index = df3[df3['name'] == productName].index[0]
        distances = sorted(list(enumerate(cv_similarity[product_index])), reverse=True, key=lambda x : x[1])
        
        for i in distances[1:]:
            productID, productName = df3.loc[i[0], ['id','name']]
            # print(f"Product ID:[{productID}] | Product Name: {productName}")
            # print(df3.loc[i[0], 'name'], "---->", df3.loc[i[0], 'categoryName'])
            
            category_id = int(df3.loc[i[0], 'product_category_id'])
            if product_category_name == dic_category[category_id]:
                print(df3.loc[i[0], 'name'], "---->", df3.loc[i[0], 'categoryName'],",","productID: ",productID, "--->", "categoryID: ",category_id)
            else:
                pass
            # print(dic_category[category_id])
            
    except Exception as e:
        print("Opps!", e.__class__,"occurred.")
        

        
# def specificProductRecommender2(productName):
#     try:
#         # productName = productName.lower()
        
#         product_id = int(testdf[testdf['name'] == productName]['product_category_id'].values[0])
#         product_category_name = dic_category.get(product_id)
        
#         product_index = testdf[testdf['name'] == productName].index[0]
#         distances = sorted(list(enumerate(cv_similarity[product_index])), reverse=True, key=lambda x : x[1])
        
#         for i in distances[1:16]:
#             productID, productName = testdf.loc[i[0], ['id','name']]
#             # print(f"Product ID:[{productID}] | Product Name: {productName}")
#             # print(df3.loc[i[0], 'name'], "---->", df3.loc[i[0], 'categoryName'])
            
#             category_id = int(testdf.loc[i[0], 'product_category_id'])
#             if product_category_name == dic_category[category_id]:
#                 print(testdf.loc[i[0], 'name'], "---->", testdf.loc[i[0], 'categoryName'])
#             else:
#                 pass
#             # print(dic_category[category_id])
            
#     except Exception as e:
#         print("Opps!", e.__class__,"occurred.")

In [117]:
productRecommender2("hijab")

ladies backpack ----> bag
wled-rb3wb22 light ----> light
spot light ----> light
xiaomi mi band 6 cn ----> watch
asus pc ----> computer accessories
walton light ----> light
electronic watch ----> watch
haylou smart watch ----> watch
mojar periscope ----> educational
starex 32” gs ----> television
baby gym rack and game pad combo ----> toys
apple imac pc ----> computer accessories
lg 32lk510b 32" hd led television ----> television
led light ----> light
gigabit smart managed switch ----> computer accessories


In [118]:
specificProductRecommender2("hijab")

maithili silk saree ----> cloths , productID:  138 ---> categoryID:  11
semi fitted panjabi ----> cloths , productID:  133 ---> categoryID:  11
children dress ----> cloths , productID:  135 ---> categoryID:  11
silk readymade lehenga ----> cloths , productID:  136 ---> categoryID:  11
traditional designer silk saree ----> cloths , productID:  137 ---> categoryID:  11
handwoven silk sarees ----> cloths , productID:  139 ---> categoryID:  11
cotton woven saree ----> cloths , productID:  140 ---> categoryID:  11
embroidery lehenga ----> cloths , productID:  141 ---> categoryID:  11
floral wedding lehenga ----> cloths , productID:  142 ---> categoryID:  11
embroidered net semi stitched bridal lehenga ----> cloths , productID:  143 ---> categoryID:  11
semi stitched party wear lehenga ----> cloths , productID:  144 ---> categoryID:  11
printed rayon kurta-sets ----> cloths , productID:  145 ---> categoryID:  11
embroidered-kurtis ----> cloths , productID:  146 ---> categoryID:  11


In [113]:
df3[df3['categoryName'] == "bag"]['name']

4                                  nike
5                       ladies backpack
6                       ladies backpack
130                    laptop pouch bag
131    targus intellect laptop backpack
Name: name, dtype: object

In [114]:
import pickle

In [115]:
# pickle.dump(obj=similarity, file=open('if_idf_similarity.pkl','wb'))