In [17]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("../data/mast_product_level_information.csv")
# data.head()

features = ['Product_ID', 'Name', 'Brand', 'Taxonomy_List', 'Keywords']

df = data[features]

In [3]:
def clean_taxonomy(raw_taxonomy):
    words = raw_taxonomy.split('|')
    words = [x for word in words for x in word.split('>')]
    words = [word.lower() for word in words]
#     print(words)
    return "  ".join(words)


clean_taxonomy(data.Taxonomy_List[0])

In [4]:
data.Taxonomy_List.apply(clean_taxonomy)

In [5]:
df.isna().sum()

In [6]:
df.fillna('Missing',inplace=True)

In [7]:
df.isna().sum()

# Pipelines

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.stem import PorterStemmer


# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Initialize a stemmer object
stemmer = PorterStemmer()
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove HTML tags using a regular expression
    text = re.sub('<.*?>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Tokenize text into individual words
    words = nltk.word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
#     # Lemmatize words
#     lemmatizer = WordNetLemmatizer()
#     words = [lemmatizer.lemmatize(word) for word in words]
    
    # Apply stemming to each word in the list
    words = [stemmer.stem(word) for word in words]
    
    # Join words back into a string
    text = ' '.join(words)
    
    return text


In [10]:
df.head()

In [11]:
preprocess_text(df.Keywords[0])

- For KNN, the time complexity for Training is O(1) which means it is constant and O(n) for testing which means it depends on the number of test examples.

In [13]:
df.columns

In [42]:
def preprocess_data(df):
    df.fillna('Missing', inplace=True)
    df.Name = df.Name.apply(preprocess_text)
    df.Taxonomy_List = df.Taxonomy_List.apply(clean_taxonomy)
    df.Brand = df.Brand.str.lower()
    df.Keywords = df.Keywords.apply(preprocess_text)
    return df
    

def generate_vectors(df):
    documents = (df.Name + " " + df.Brand + " " + df.Taxonomy_List + " " + df.Keywords).tolist()
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    tfidf_vectors = vectorizer.fit_transform(documents)
    nn_model = NearestNeighbors(n_neighbors=10, algorithm='brute')
    nn_model.fit(tfidf_vectors)
    return nn_model

def preprocess_input(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    words = [stemmer.stem(word) for word in words]
    text = ' '.join(words)
    return text
    
def recommend_products(query, n=10):
    new_document = preprocess_input(query)
    new_tfidf_vector = vectorizer.transform([new_document])
    distances, indices = nn_model.kneighbors(new_tfidf_vector,n_neighbors=n)
    res = main_df.iloc[indices[0]].reset_index(drop=True)
    return dict(res)
    

In [37]:
pd.DataFrame(recommend_products('freeload fork'))

Unnamed: 0,Product_ID,Name,Brand,Taxonomy_List,Keywords
0,433,freeload fork trick toy,loftus,staff picks toys kids toys staff picks to...,freeload fork stainless steel freeload fork ga...
1,434,gnarli teeth toy,accoutrements,toys kids toys toys pretend play & dress-u...,gnarli teeth asst box children toy accoutr gna...
2,620,ultralight watertight medic kit,adventure medical,outdoor gear health & safety outdoor gear h...,adventur medic first aid kit ultra light first...
3,626,heatsheet surviv blanket person,adventure medical,outdoor gear outdoor gear health & safety o...,heatsheet surviv blanket person surviv suppli ...
4,627,sol thermal singl bivvi,adventure medical,outdoor gear furniture & sleep systems outdo...,year round bivi sack sol thermal singl bivi so...
...,...,...,...,...,...
7867,229574,freez dri caramel,be freez,candy candy chocolate candy fruity & fun,freez dri caramel freez fdcm freez dri caramel...
7868,229606,sparkl cherri stretchi tangi laffi taffi candi,ferrara pan candy co,"candy candy caramels, nougats, & taffy cand...",gener candi laffi taffi sparkl cherri x oz spa...
7869,229607,atom firebal candi lb,ferrara pan candy co,candy made in usa candy hard candy & lollip...,larg atom firebal l bs atom firebal lb hot can...
7870,229709,pinch pop green worm toy,no show,kids toys kids toys toys this n that kid...,jeneey one ggw green grass worm pinch pop toy ...


In [15]:
cleaned_df = preprocess_data(df)

In [19]:
cleaned_df.head()

Unnamed: 0,Product_ID,Name,Brand,Taxonomy_List,Keywords
0,433,freeload fork trick toy,loftus,staff picks toys kids toys staff picks to...,freeload fork stainless steel freeload fork ga...
1,434,gnarli teeth toy,accoutrements,toys kids toys toys pretend play & dress-u...,gnarli teeth asst box children toy accoutr gna...
2,620,ultralight watertight medic kit,adventure medical,outdoor gear health & safety outdoor gear h...,adventur medic first aid kit ultra light first...
3,626,heatsheet surviv blanket person,adventure medical,outdoor gear outdoor gear health & safety o...,heatsheet surviv blanket person surviv suppli ...
4,627,sol thermal singl bivvi,adventure medical,outdoor gear furniture & sleep systems outdo...,year round bivi sack sol thermal singl bivi so...


# Vectorization

In [21]:
df.shape

(7872, 5)

In [23]:
df.iloc[0]

Product_ID                                                     433
Name                                       freeload fork trick toy
Brand                                                       loftus
Taxonomy_List    staff picks  toys  kids  toys  staff picks  to...
Keywords         freeload fork stainless steel freeload fork ga...
Name: 0, dtype: object

In [20]:
documents = (df.Name + " " + df.Brand + " " + df.Taxonomy_List + " " + df.Keywords).tolist()
documents[:3]

['freeload fork trick toy loftus staff picks  toys  kids  toys  staff picks  toys  toys  tomfoolery  kids  toys  tomfoolery freeload fork stainless steel freeload fork gadget kitchen food novelti gift gag gift utensil fun game telescop fork inch fork foot fork accoutr fun fork freeload fork trick toy fork kitchen suppli kitchenwar tablewar tabl ware',
 'gnarli teeth toy accoutrements toys  kids  toys  toys  pretend play & dress-up  toys  tomfoolery  kids  toys  pretend play & dress-up  kids  toys  tomfoolery gnarli teeth asst box children toy accoutr gnarli teeth halloween costum costum gnarli teeth toy fake teeth toy teeth costum teeth',
 'ultralight watertight medic kit adventure medical outdoor gear  health & safety  outdoor gear  health & safety  first aid & survival adventur medic first aid kit ultra light first aid kit camp first aid kit waterproof first aid kit ultralight watertight medic kit first aid kit hike camp outdoor backcountri backpack surviv kit']

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Initialize a TfidfVectorizer with the desired parameters
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit the vectorizer to the documents and transform the documents into TF-IDF vectors
tfidf_vectors = vectorizer.fit_transform(documents)

# Initialize a NearestNeighbors model with the desired parameters
nn_model = NearestNeighbors(n_neighbors=10, algorithm='brute')

# Fit the model to the TF-IDF vectors
nn_model.fit(tfidf_vectors)


In [39]:

# Query the model for the nearest neighbors to a new document
new_document = "freeload fork trick toy loftus"

new_tfidf_vector = vectorizer.transform([new_document])
distances, indices = nn_model.kneighbors(new_tfidf_vector,n_neighbors=5)

df.iloc[indices[0]]

Unnamed: 0,Product_ID,Name,Brand,Taxonomy_List,Keywords
0,433,freeload fork trick toy,loftus,staff picks toys kids toys staff picks to...,freeload fork stainless steel freeload fork ga...
3607,201249,snake nut trick toy,loftus,kids toys kids toys toys tomfoolery kids...,loftu trick joke snake nut shrink wrap x pc sw...
4880,209068,fridg fork tool,chef`n,home & hearth home & hearth kitchen home & ...,chef n pickl condiment fridg fork x fridg fork...
750,83574,magic ball vase trick toy,s.s. adams co.,kids kids toys kids toys tomfoolery,ss adam trick magic ball vase x magic ball vas...
254,24653,dollar bill snatcher trick toy,loftus,kids toys kids toys toys tomfoolery kids...,loftu trick joke dollar bill snatcher x pc bil...
