In [1]:
# 1. Import packages
import tensorflow as tf 
from tensorflow import keras
import matplotlib.pyplot as plt 
import numpy as np 
import os, datetime
import IPython
import IPython.display
import matplotlib as mpl 
import seaborn as sns 
import pandas as pd 
import sklearn, pickle
from sklearn.metrics import classification_report


In [2]:
# 2. Data loading
# The dataset location can be overridden with the ECOMMERCE_DATASET_PATH
# environment variable so the notebook is not tied to a single machine; the
# original absolute path is kept as the fallback.
PATH = os.environ.get(
    "ECOMMERCE_DATASET_PATH",
    r"C:\Users\User\Desktop\Capstone2\ecommerceDataset.csv",
)

# The file is read without a header row, so the column names are supplied here.
column_names = ["Category", "Text"]

df = pd.read_csv(PATH, names=column_names, header=None)
# DataFrame.info() prints its own summary and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
df.info()
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\User\\Desktop\\Capstone2\\ecommerceDataset.csv'

In [None]:
# 3. Data inspection
# Collect the individual diagnostics first, then report them in one place.
summary_table = df.describe().T
missing_per_column = df.isna().sum()
duplicate_rows = df.duplicated().sum()
category_counts = df['Category'].value_counts()

print("Shape of the data: ", df.shape)
print("Data description: \n", summary_table)
print("NA values: \n", missing_per_column)
print("Duplicates: ", duplicate_rows)
print("Categories: ", category_counts)

Shape of the data:  (50425, 2)
Data description: 
           count unique                                                top  \
Category  50425      4                                          Household   
Text      50424  27802  Think & Grow Rich About the Author NAPOLEON HI...   

           freq  
Category  19313  
Text         30  
NA values: 
 Category    0
Text        1
dtype: int64
Duplicates:  22622
Categories:  Category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64


In [None]:
# Find row of the missing value
# isna() already returns a boolean mask, so it is used directly for row
# selection (comparing it with True is redundant).
df[df['Text'].isna()]

Unnamed: 0,Category,Text
39330,Clothing & Accessories,


In [None]:
# Remove every row that contains a missing value, then confirm that no
# nulls are left in either column.
df = df.dropna()
df.isna().sum()

Category    0
Text        0
dtype: int64

In [None]:
# List the distinct category labels present in the data
df['Category'].unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [None]:
# 4. Data preprocessing
from sklearn.preprocessing import LabelEncoder

# (A) Separate the feature and label
feature = df['Text'].values
label = df['Category'].values
# (B) Perform label encoding to the category column
# Instantiate through the name imported above rather than through the
# `sklearn.preprocessing` attribute path, so the import is actually used and
# the cell does not rely on the submodule having been loaded as a side effect.
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(label)
# Peek at the first few integer codes
label_encoded[:5]

array([3, 3, 3, 3, 3])

In [None]:
# [FYI] The fitted label encoder can also map integer codes back to names
class_codes = [0, 1, 2, 3]
sample_categories = label_encoder.inverse_transform(class_codes)
print(sample_categories)
# Code -> category:
#   0 -> Books
#   1 -> Clothing & Accessories
#   2 -> Electronics
#   3 -> Household

['Books' 'Clothing & Accessories' 'Electronics' 'Household']


In [None]:
# 5. Perform train-val-test split
from sklearn.model_selection import train_test_split

seed = 42

# First cut: 70% of the samples go to training, the rest is held out.
x_train, x_split, y_train, y_split = train_test_split(
    feature,
    label_encoded,
    train_size=0.7,
    random_state=seed,
)

# Second cut: the held-out part is divided in half into validation and test.
x_val, x_test, y_val, y_test = train_test_split(
    x_split,
    y_split,
    train_size=0.5,
    random_state=seed,
)

In [None]:
# 6. NLP
# (A) Tokenization
# The vocabulary is capped at 5000 tokens and the output sequence length is
# fixed at 200.
tokenizer = keras.layers.TextVectorization(
    max_tokens=5000,
    output_sequence_length=200,
)
# Build the vocabulary from the training texts only.
tokenizer.adapt(x_train)

In [None]:
# [FYI] Show the first two training texts next to their token ids
sample_texts = x_train[:2]
sample_tokens = tokenizer(sample_texts)
print(sample_texts)
print(sample_tokens)

["Acer 18.5 inch (46.99 cm) LED Monitor - EB192Q (Black) Specifications LED 18.5 '' ACER EB192Qb (B). Brand ACER Model EB192Qb Response Time 5 ms Max. Resolution 1366x768 @ 60Hz Contrast Ratio 100 million: 1 (ACM). Brightness 200 nits (cd / m2). Display 18.5 inch Color System 16.7 m POWER Supply (100V-240V): Internal Power Consumption (Off): 0.45W Power Consumption (Sleep): 14W Power Consumption (on): 18W. VGA Port 1 Port."
 'SOUMIK ELECTRICALS 5-inch Subwoofer with Maximum 4 ohm(100 W) Thisb product is from the brand SOUMIK ELECTRICALS it presents a 5 Inch subwoofer with maximum 4 ohm and comes with 100W. Use in your home theatre.']
tf.Tensor(
[[3365 4947  173    1  147  152  622    1   55  595  152 4947 3365    1
  1613  151 3365  535    1 1478   61  105 1551  739  649    1    1 1619
  1972  115 1539   36    1 1495 1270    1 1373 3153  259 4947  173   65
   146    1  805   59  781    1 1290   59 1111  253    1   59 1111  943
     1   59 1111   15    1 1210  377   36  377    0    0   

In [None]:
# (B) Embedding
# Take the embedding's input dimension from the adapted tokenizer instead of
# repeating the literal 5000, so the two stay in sync if the vocabulary cap
# is ever changed. Each token id is mapped to a 64-dimensional vector.
embedding = keras.layers.Embedding(
    input_dim=tokenizer.vocabulary_size(),
    output_dim=64,
)

In [None]:
# 7. Model development
# The output layer has one unit per distinct category.
n_classes = len(df['Category'].unique())

model = keras.Sequential([
    # (A) NLP layers: raw text -> token ids -> dense vectors
    tokenizer,
    embedding,
    # (B) RNN: bidirectional LSTM that returns only its final output
    keras.layers.Bidirectional(keras.layers.LSTM(32, return_sequences=False)),
    # Softmax over the categories
    keras.layers.Dense(n_classes, activation='softmax'),
])


In [None]:
# 8. Model compile
# The labels are integer codes, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# 9. Model Training
# Every run logs to its own timestamped TensorBoard directory.
# The date part uses "-" throughout; the previous format string contained a
# stray "=" ("%Y=%m-%d"), which produced names such as "2024=01-31_...".
run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
logpath = "tensorboard/ecommerce/" + run_stamp
tb = keras.callbacks.TensorBoard(logpath)
# Stop once the monitored value (val_loss by default) has not improved for two
# consecutive epochs. verbose=1 is the documented "print a message" setting;
# the former value 3 is outside the documented 0/1 range.
# NOTE(review): restore_best_weights is left at its default, so the weights
# from the last epoch are kept rather than those of the best epoch.
es = keras.callbacks.EarlyStopping(patience=2, verbose=1)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[tb, es],
)


Epoch 1/50
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 142ms/step - accuracy: 0.8114 - loss: 0.5219 - val_accuracy: 0.9627 - val_loss: 0.1363
Epoch 2/50
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 159ms/step - accuracy: 0.9728 - loss: 0.1021 - val_accuracy: 0.9646 - val_loss: 0.1254
Epoch 3/50
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 146ms/step - accuracy: 0.9820 - loss: 0.0663 - val_accuracy: 0.9688 - val_loss: 0.1139
Epoch 4/50
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 101ms/step - accuracy: 0.9866 - loss: 0.0485 - val_accuracy: 0.9663 - val_loss: 0.1264
Epoch 5/50
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 137ms/step - accuracy: 0.9895 - loss: 0.0376 - val_accuracy: 0.9713 - val_loss: 0.1247
Epoch 5: early stopping


In [None]:
# Show the architecture: first the text summary, then a diagram that also
# shows the tensor shapes.
model.summary()
keras.utils.plot_model(
    model,
    show_shapes=True,
)

NameError: name 'model' is not defined

In [None]:
# Evaluate on the held-out test split; the result is printed as returned by
# evaluate() (the loss followed by the compiled metric).
evaluation = model.evaluate(x=x_test, y=y_test)
print(evaluation)

[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.9717 - loss: 0.1141
[0.14000791311264038, 0.9689317941665649]


In [None]:
# 10. Use the model for inference
# Predict the first three test texts and map the winning class index of each
# back to its category name.
sample_inputs = x_test[:3]
predictions = model.predict(sample_inputs)
predicted_codes = np.argmax(predictions, axis=1)
class_predictions = label_encoder.inverse_transform(predicted_codes)
print(sample_inputs)
print(class_predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 972ms/step
['KVS-Primary Teachers (Music) Exam Guide (Old Edition) '
 'Anchor Fly with Me Stitch Kit A range of over 300 d-i-y kits with easy to make embroidery designs on canvas. Anchor stitch kits contain everything you need to help you make your own special embroidery creation. Happy crafting.'
 "Wall1ders Atulya Arts 3D Hexagon Acrylic Stickers with 20 Butterfly Wall Stickers (Silver) -Pack of 14 First Time in India!!! An INDIAN COMPANY named “Atulya Arts” presenting new trend of wall decoration in 3D acrylic, made using imported cast acrylic with computerized laser cutting method. Design you wall with your own creativity Buy from trusted name only!!! 'Atulya Arts' products are of premium quality and use only high standard material. Our products are quality controlled and pre stocked. Atulya Arts - Presenting high quality 3D Mirror or 3D acrylic sticker, Search Atulya Arts - acrylic, Atulya Arts - acrylic sticker that is

In [None]:
# Per-class precision / recall / F1 on the full test split.
prediction = model.predict(x_test)
# Index of the highest-probability class for each sample
prediction_index = np.argmax(prediction, axis=1)
model_report = classification_report(
    y_test,
    prediction_index,
)
print(model_report)

[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1756
           1       0.99      0.96      0.97      1312
           2       0.95      0.96      0.96      1560
           3       0.96      0.98      0.97      2936

    accuracy                           0.97      7564
   macro avg       0.97      0.97      0.97      7564
weighted avg       0.97      0.97      0.97      7564



In [None]:
# 11. Save the necessary components
# (A) Label encoder
# NOTE(review): the file is written as a binary pickle even though its name
# ends in ".json"; it has to be read back with pickle.load, not a JSON parser.
with open("label_encoder.json", mode="wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [None]:
# (B) Model
# Persist the trained model to an HDF5 file under saved_models/.
keras.models.save_model(
    model,
    filepath="saved_models/Classification.h5",
)



In [None]:
# (C) Tokenizer
# NOTE(review): as with the label encoder, this is a binary pickle stored
# under a ".json" name; load it with pickle.load.
with open("tokenizer.json", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)