# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
# Download the NLTK resources one after another, in a fixed order:
# Punkt tokenizer models, WordNet, Open Multilingual WordNet 1.4, word list.
for nltk_resource in ('punkt', 'wordnet', 'omw-1.4', 'words'):
    nltk.download(nltk_resource)
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# Notebook display settings: let pandas render up to 100 characters per cell
# so long text values are not truncated in DataFrame output.
pd.set_option("display.max_colwidth", 100)

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Mount Google Drive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Load the product table (the file name suggests it is the sampled, cleaned
# output of an earlier EDA step).
# The path is anchored at the mount point rather than being relative, so the
# load does not depend on the kernel's working directory being /content.
df = pd.read_csv("/content/drive/MyDrive/COGS 109 Amazon Project/Data/amazon_products_sampled_eda_cleaned.csv")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Mounted at /content/drive


## Methods

### Feature Extraction: TF-IDF

In [2]:
# Hold out 20% of the rows for testing. Inputs are the values of the "name"
# column, targets are the "main_category" labels; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["name"],
    df["main_category"],
    test_size=0.2,
    random_state=42,
)

In [3]:
# Create the TF-IDF vectorizer with scikit-learn's default settings
# (no constructor arguments are overridden).
vectorizer = TfidfVectorizer()
# Bare expression so the notebook displays the configured estimator.
vectorizer

In [4]:
# Fit the vectorizer on the training data only (vocabulary and IDF weights are
# learned here) and encode the training text as a sparse TF-IDF matrix.
train_features = vectorizer.fit_transform(X_train)

# Wrap the matrix in a DataFrame whose columns are the vocabulary terms, for
# inspection. The frame is built straight from the sparse matrix: calling
# .toarray() would allocate a dense rows x vocabulary float64 array, which for
# tens of thousands of rows and terms is many gigabytes of almost all zeros.
train_words_numerical = pd.DataFrame.sparse.from_spmatrix(
    train_features, columns=vectorizer.get_feature_names_out()
)
train_words_numerical

Unnamed: 0,aabat,aabha,aabhir,aachi,aada,aadar,aadcart,aadgex,aadi,aadia,...,zuvim,zuvino,zvonko,zwart,zwoosh,zxi,zxizxi,zyax,zyozique,zzowin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Transform the testing data using the already-fitted vectorizer (no refit, so
# the test set is encoded with the vocabulary learned from the training data).
test_features = vectorizer.transform(X_test)

# Wrap the matrix in a DataFrame whose columns are the vocabulary terms, for
# inspection. The frame is built straight from the sparse matrix: calling
# .toarray() would allocate a dense rows x vocabulary float64 array that is
# almost entirely zeros and can exhaust the runtime's memory.
test_words_numerical = pd.DataFrame.sparse.from_spmatrix(
    test_features, columns=vectorizer.get_feature_names_out()
)
test_words_numerical

Unnamed: 0,aabat,aabha,aabhir,aachi,aada,aadar,aadcart,aadgex,aadi,aadia,...,zuvim,zuvino,zvonko,zwart,zwoosh,zxi,zxizxi,zyax,zyozique,zzowin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Show the vocabulary learned by the fitted vectorizer (the feature names that
# label the columns of the TF-IDF matrices).
vectorizer.get_feature_names_out()

array(['aabat', 'aabha', 'aabhir', ..., 'zyax', 'zyozique', 'zzowin'],
      dtype=object)

In [7]:
# Validation: make sure the train and test feature frames have exactly the same
# columns, in the same order. Index.equals compares the labels themselves; the
# earlier all(...) == all(...) form only compared two truthiness flags and so
# did not actually check the column labels against each other.
train_words_numerical.columns.equals(test_words_numerical.columns)

True

In [8]:
# Number of terms in the learned vocabulary (the feature-name array is 1-D,
# so its size equals its length).
vectorizer.get_feature_names_out().size

29551

## Models

### Model 1: Multinomial Naive Bayes Classifier

In [9]:
# Train a multinomial Naive Bayes classifier (default hyperparameters) on the
# TF-IDF training features. fit() returns the estimator itself, so it is bound
# directly and then displayed as the cell output.
mnb_classifier = MultinomialNB().fit(train_features, y_train)
mnb_classifier

In [10]:
# Make predictions on the testing data: the fitted classifier assigns a
# main_category label to each held-out row from its TF-IDF features.
mnb_predictions = mnb_classifier.predict(test_features)

In [11]:
# Evaluate the model: compare predictions with the held-out labels and print
# the per-class precision / recall / F1 / support table.
mnb_report = classification_report(y_true=y_test, y_pred=mnb_predictions)
print(mnb_report)

                         precision    recall  f1-score   support

            accessories       0.77      0.79      0.78       896
             appliances       0.84      0.91      0.88       911
         bags & luggage       0.71      0.81      0.76       903
        beauty & health       0.83      0.82      0.82       857
        car & motorbike       0.81      0.86      0.83       864
grocery & gourmet foods       0.90      0.91      0.90       610
         home & kitchen       0.71      0.76      0.73       897
    industrial supplies       0.87      0.74      0.80       824
          kids' fashion       0.77      0.69      0.73       916
         men's clothing       0.77      0.95      0.85       908
            men's shoes       0.77      0.89      0.83       894
                  music       0.97      0.38      0.54       207
           pet supplies       0.99      0.68      0.81       285
       sports & fitness       0.78      0.66      0.71       936
                 stores 