In [None]:
import nltk
nltk.download('all')

In [204]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [205]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [206]:
dataset = '/content/drive/MyDrive/Colab Notebooks/NLP Datasets/Womens Clothing E-Commerce Reviews.csv'
missing_cells = ['na', 'NA', 'N/A', 'n/a', '-', np.nan]
ecom = pd.read_csv(dataset, na_values=missing_cells)
ecom

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
...,...,...,...,...,...,...,...,...,...,...,...
23481,23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses
23482,23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits
23483,23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses
23484,23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses


In [207]:
retain_cols = ['Review Text', 'Recommended IND']

for col in ecom.columns:
  if col not in retain_cols:
    ecom.drop(col, axis=1, inplace=True)

In [208]:
ecom.isnull().sum()

Review Text        845
Recommended IND      0
dtype: int64

In [209]:
ecom.dropna(subset=['Review Text'], axis=0, how='any', inplace=True)

In [210]:
ecom['Recommended IND'].value_counts()

1    18540
0     4101
Name: Recommended IND, dtype: int64

In [211]:
ecom.duplicated().sum()

7

In [212]:
ecom.drop_duplicates(inplace=True)

ecom['Recommended']

0: Negative
1: Positive

In [213]:
ecom.rename(columns={'Review Text': 'Review_Text', 'Recommended IND': 'Recommended'}, inplace=True)

In [214]:
# Cleaning and lemmatization

lemmatization = WordNetLemmatizer()

result = []
for index, row in ecom.iterrows():
  sentence = row['Review_Text']
  sentence = re.sub('[^a-zA-Z]', ' ', sentence)
  sentence = sentence.lower()
  sentence = sentence.split()
  sentence = [lemmatization.lemmatize(word) for word in sentence if word not in set(stopwords.words('english'))]
  sentence = ' '.join(sentence)
  result.append(sentence)

In [215]:
ecom['Converted_Text'] = result

In [216]:
converted = ecom.pop('Converted_Text')
ecom.insert(1, 'Converted_Text', converted)
ecom.drop('Review_Text', axis=1, inplace=True)

In [217]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk import sent_tokenize

In [218]:
processed_sentences = []

for feedback in ecom['Converted_Text']:
  tokens = sent_tokenize(feedback)
  for token in tokens:
    processing = simple_preprocess(token)
    processed_sentences.append(processing)

In [219]:
vectorize_tokens = Word2Vec(window=10, min_count=2)

In [220]:
vectorize_tokens.build_vocab(processed_sentences)

In [221]:
vectorize_tokens.train(processed_sentences, total_examples=vectorize_tokens.corpus_count, epochs=vectorize_tokens.epochs)

(2645057, 3163620)

In [222]:
len(vectorize_tokens.wv.index2word)

6995

In [248]:
def vectorize_text(a):
  a = [word for word in a.split() if word in vectorize_tokens.wv.index2word]
  return np.mean(vectorize_tokens.wv[a], axis=0)

In [249]:
vectorize_text(ecom['Converted_Text'].values[0])

array([ 0.4992423 , -0.11830013, -0.33411932,  0.18144324, -0.23821004,
        0.20873299,  0.13463464,  0.52407   , -0.2180376 , -0.21877441,
        0.3107124 ,  0.46788484, -0.31382942,  0.33609647, -0.0039595 ,
       -0.26747376,  0.23372658, -0.74099624,  0.10632153,  0.32542446,
       -0.21184953,  0.04919182, -0.26315537,  0.21349922, -0.43562612,
       -0.22932538, -0.21460561,  0.06461914, -1.1296294 , -0.07644806,
       -0.28910515,  0.5064444 , -0.77588785,  0.05639287, -0.49111542,
        0.01595873, -0.13215294, -0.5227858 , -0.59883446,  0.6627234 ,
        0.26015857, -0.47433394,  0.39866477,  0.6018306 , -0.583696  ,
        0.47927707,  0.06216136,  0.12168428, -0.54734147, -0.5793637 ,
        0.52912503,  0.20356321,  0.39515907,  0.2608735 ,  0.5012256 ,
        0.3085794 , -0.33267528,  0.50877774, -0.4820826 ,  0.41273913,
        0.27266753, -0.26235014, -0.07668722, -0.5336026 ,  0.08083606,
       -0.06349687, -0.10451335,  0.09705299, -0.33897907,  0.08

In [250]:
X = []
for rev in ecom['Converted_Text'].values:
  X.append(vectorize_text(rev))

In [251]:
y = ecom['Recommended']

In [252]:
#The classes are imbalanced. Therefore, it is necessary to balance the classes.

from imblearn.over_sampling import SMOTE

sampler = SMOTE(sampling_strategy='not majority', random_state=1)
X_resampled, y_resampled = sampler.fit_resample(X, y)

In [253]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=1)

In [254]:
print(y_train.value_counts())
print(y_val.value_counts())

0    14860
1    14794
Name: Recommended, dtype: int64
1    3740
0    3674
Name: Recommended, dtype: int64


In [255]:
from sklearn.ensemble import RandomForestClassifier

In [256]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [257]:
pred = model.predict(X_val)
how_accurate = metrics.accuracy_score(y_val, pred)
how_accurate

0.9031561909900189

In [258]:
matrix = metrics.classification_report(y_val, pred, output_dict=True)
matrix

{'0': {'f1-score': 0.9065347565738089,
  'precision': 0.8687624750499002,
  'recall': 0.9477408818726184,
  'support': 3674},
 '1': {'f1-score': 0.899524209347887,
  'precision': 0.9436288901937757,
  'recall': 0.8593582887700535,
  'support': 3740},
 'accuracy': 0.9031561909900189,
 'macro avg': {'f1-score': 0.903029482960848,
  'precision': 0.906195682621838,
  'recall': 0.903549585321336,
  'support': 7414},
 'weighted avg': {'f1-score': 0.9029982787447088,
  'precision': 0.9065289159236654,
  'recall': 0.9031561909900189,
  'support': 7414}}

In [259]:
matrix_frame = pd.DataFrame({'0': {'f1-score': 0.9067951054412913, 'precision': 0.8690119760479041, 'recall': 0.9480130647795318, 'support': 3674}, '1': {'f1-score': 0.8998040862020712, 'precision': 0.9439224897240165, 'recall': 0.8596256684491979, 'support': 3740}})
matrix_frame

Unnamed: 0,0,1
f1-score,0.906795,0.899804
precision,0.869012,0.943922
recall,0.948013,0.859626
support,3674.0,3740.0
