In [2]:
import numpy as np 
import pandas as pd 
import fasttext

In [3]:
# Load the raw dataset. No header row is read from the file (header=None),
# so the two column names are supplied explicitly.
df = pd.read_csv(
    "ecommerce_dataset.csv",
    header=None,
    names=["category", "description"],
)
df.head()

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [4]:
# Remove every row that has a missing value in any column (df is modified in place).
df.dropna(axis=0, how="any", inplace=True)

In [5]:
# List the distinct category names present in the data.
pd.unique(df['category'])

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [6]:
# Give the multi-word category a single-token name (no spaces or '&').
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['category']: that form mutates an
# intermediate Series, which newer pandas warns about and which leaves df
# unchanged once Copy-on-Write is enabled.
df['category'] = df['category'].replace('Clothing & Accessories', 'Clothing_Accessories')

#### fastText expects every training line to start with its label in the form `__label__<category>` (e.g. `__label__Household`), followed by the description text

In [7]:
## Prefix every category value with "__label__" so it can serve as the label token.
df['category'] = df['category'].astype(str).map(lambda name: "__label__" + name)

In [8]:
# Check the distinct label values after prefixing.
pd.unique(df['category'])

array(['__label__Household', '__label__Books',
       '__label__Clothing_Accessories', '__label__Electronics'],
      dtype=object)

In [9]:
## Join label and description into one column (label first, single space between);
## this combined text is what later gets exported to a text file.
df['category_description'] = df['category'].str.cat(df['description'], sep=' ')

In [10]:
# Preview the first five rows, now including the combined column.
df.head(n=5)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [12]:
## Preprocessing to be done on category_description:
## strip punctuation, normalise whitespace and convert to lower case using regex.
import re
def preprocess(text):
    """Normalise one piece of text into a single lower-case line.

    Args:
        text: The string to clean.

    Returns:
        The text with every character that is not a word character,
        whitespace or an apostrophe replaced by a space, every run of
        whitespace collapsed to one space, surrounding whitespace removed,
        and all letters lower-cased.
    """
    # Underscores count as word characters, so a "__label__" prefix survives this step.
    text = re.sub(r'[^\w\s\']', ' ', text)
    # Collapse ALL whitespace (spaces, tabs, newlines), not only runs of spaces:
    # an embedded line break would otherwise split or quote the exported row,
    # and the export is meant to hold one example per line.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [13]:
# Clean every combined label + description string with preprocess().
df['category_description'] = df['category_description'].apply(preprocess)

In [18]:
import spacy
nlp = spacy.load('en_core_web_lg')
def preprocessVector(text):
    """Lemmatise text and drop punctuation and stop words, keeping labels intact.

    Args:
        text: A whitespace-separated string, optionally starting with one or
            more "__label__..." tokens.

    Returns:
        The leading "__label__..." tokens unchanged, followed by the lemmas of
        the remaining tokens that are neither punctuation nor stop words, all
        joined by single spaces.
    """
    words = text.split()

    # Keep leading label tokens out of the spaCy pass: its tokenizer treats
    # '_' as punctuation and can split "__label__x" apart, after which the
    # is_punct filter below would drop the underscores and destroy the label.
    label_count = 0
    while label_count < len(words) and words[label_count].startswith("__label__"):
        label_count += 1
    labels = words[:label_count]

    doc = nlp(" ".join(words[label_count:]))
    lemmas = [token.lemma_ for token in doc if not (token.is_punct or token.is_stop)]
    return " ".join(labels + lemmas)

In [19]:
# Run the spaCy-based cleanup on every cleaned row and keep the result in a new column.
df['Preprocess_Description'] = df['category_description'].map(preprocessVector)

In [None]:
from sklearn.model_selection import train_test_split

## Divide the dataset into train (80%) and test (20%) parts.
## random_state is fixed so the same rows land in each part on every run,
## which keeps the exported files and the reported metrics reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [22]:
## fastText trains from a plain text file, so write only the preprocessed
## label + description column of each split, without index or header row.
train.to_csv(
    "ecommerce.train",
    header=False,
    index=False,
    columns=["Preprocess_Description"],
)
test.to_csv(
    "ecommerce.test",
    header=False,
    index=False,
    columns=["Preprocess_Description"],
)

In [None]:
## Train a supervised fastText classifier on the exported training file,
## then evaluate it on the exported test file.
## NOTE(review): the value displayed by model.test is presumably
## (sample count, precision, recall) -- confirm against the fastText docs.
model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

In [None]:
# Spot check: ask the trained model to label a computer-hardware style description.
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

In [None]:
# Spot check: ask the trained model to label a book-title style description.
# NOTE(review): the training text went through preprocess() and preprocessVector(),
# but this sample is passed as-is -- consider applying the same steps first.
model.predict("think and grow rich deluxe edition")

In [None]:
## The trained model also exposes word-level queries:
## look up the words it considers closest to "painting".
model.get_nearest_neighbors("painting")

In [None]:
# Same nearest-neighbour query, this time for the word "sony".
model.get_nearest_neighbors("sony")