In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Load the raw reviews export; the path is resolved relative to the working directory.
REVIEWS_CSV = 'Womens Clothing E-Commerce Reviews.csv'
df = pd.read_csv(REVIEWS_CSV)

In [3]:
# Summarise column dtypes and non-null counts to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [4]:
# Keep only the review text (feature) and the recommendation flag (target).
# .copy() makes df1 an independent frame instead of a selection derived from
# df, so later clean-up on df1 cannot raise SettingWithCopyWarning or leave
# it ambiguous which frame was modified.
df1 = df[['Review Text','Recommended IND']].copy()

In [5]:
# Preview the first rows of the two-column modelling frame.
df1.head()

Unnamed: 0,Review Text,Recommended IND
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,I had such high hopes for this dress and reall...,0
3,"I love, love, love this jumpsuit. it's fun, fl...",1
4,This shirt is very flattering to all due to th...,1


In [6]:
# Count missing values per column.
df1.isna().sum()  # 845 missing reviews

Review Text        845
Recommended IND      0
dtype: int64

In [38]:
# Class balance of the target, restricted to rows that have no missing values.
# dropna() is applied here without mutating df1, so the counts are the same
# whether or not the clean-up cell has already been executed.
df1.dropna()['Recommended IND'].value_counts()

1    18540
0     4101
Name: Recommended IND, dtype: int64

In [7]:
# Drop the rows with a missing review. Rebinding the name instead of using
# inplace=True avoids mutating a frame that was selected out of df and keeps
# this cell safe to re-run.
df1 = df1.dropna()

In [8]:
# Collect the index labels of reviews that contain no visible text.
# Stripping first catches empty strings as well as whitespace-only ones
# (str.isspace() is False for ''), and the vectorised .str accessor does not
# depend on df1 having exactly two columns the way tuple-unpacking
# itertuples() does.
is_blank = df1['Review Text'].str.strip().eq('')
blanks = df1.index[is_blank].tolist()
blanks
# no blank spaces in our reviews so we can move on!

[]

In [21]:
# Hold out 30% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X, y = df1['Review Text'], df1['Recommended IND']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [23]:
# Learn the vocabulary from the training reviews and encode them as a
# sparse document-term matrix of raw token counts.
count_vec = CountVectorizer()
Xtrain_counts = count_vec.fit_transform(X_train)
Xtrain_counts # shape is (training documents, vocabulary terms); the recorded output reports 15848 x 12200

<15848x12200 sparse matrix of type '<class 'numpy.int64'>'
	with 678966 stored elements in Compressed Sparse Row format>

In [24]:

# Re-weight the raw counts with TF-IDF: learn the IDF statistics from the
# training count matrix, then apply them to that same matrix.
tfidf_trans = TfidfTransformer()
Xtrain_tfidf = tfidf_trans.fit(Xtrain_counts).transform(Xtrain_counts)

In [29]:
# Benchmark model to beat: TF-IDF features fed into a basic linear SVM.
# TfidfVectorizer does both CountVectorizer and TfidfTransformer in one estimator.
# random_state pins the solver's internal data shuffling so the benchmark is
# repeatable across runs (same seed as the train/test split).
txt_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

In [30]:
# Fit the whole pipeline (vectoriser and classifier) on the training split.
txt_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [31]:
# Predict the recommendation flag for the held-out reviews.
ypred = txt_clf.predict(X_test)

In [33]:
# Show the confusion matrix with labelled axes so it can be read on its own:
# rows are the true 'Recommended IND' value, columns the predicted one.
# Passing labels explicitly fixes the row/column order.
class_labels = [0, 1]
pd.DataFrame(
    confusion_matrix(y_test, ypred, labels=class_labels),
    index=[f'true {c}' for c in class_labels],
    columns=[f'pred {c}' for c in class_labels],
)

[[ 778  469]
 [ 265 5281]]


In [39]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_test, ypred)
print(report)

precision    recall  f1-score   support

           0       0.75      0.62      0.68      1247
           1       0.92      0.95      0.94      5546

    accuracy                           0.89      6793
   macro avg       0.83      0.79      0.81      6793
weighted avg       0.89      0.89      0.89      6793

