### import data and check for na

In [1]:
import pandas as pd
df = pd.read_csv("data/amazon_cellphones_multiclass.csv")
df

Unnamed: 0,asin,reviewText,overall
0,B007D6J64K,Probably my favorite cover! Super sassy and ve...,5
1,B007D6J64K,This case protects the phone from damage.,5
2,B007D6J64K,Nice,4
3,B007D6J64K,"this was another of my favorite ones, thanks f...",5
4,B007D6J64K,Decent case but not a lot of protection.,5
...,...,...,...
29995,B0096QI0QK,it is so easy to put on your phone and it prot...,5
29996,B0096QI0QK,Much better quality than I expected for the pr...,5
29997,B0096QI0QK,This is one of the best screen protectors I ha...,4
29998,B0096QI0QK,This kit included a microfiber cloth and soft ...,5


In [2]:
df.isna().sum()

asin           0
reviewText    12
overall        0
dtype: int64

In [3]:
df.overall.value_counts()

5    17695
4     5366
3     3144
1     2121
2     1674
Name: overall, dtype: int64

### create binary target varibale

In [5]:
def binary(row):
    if row['overall'] > 3:
        val = 1
    elif row['overall'] < 3:
        val = 0
    else:
        val = -1
    return val

df['bin_y'] = df.apply(binary, axis=1)
df

Unnamed: 0,asin,reviewText,overall,bin_y
0,B007D6J64K,Probably my favorite cover! Super sassy and ve...,5,1
1,B007D6J64K,This case protects the phone from damage.,5,1
2,B007D6J64K,Nice,4,1
3,B007D6J64K,"this was another of my favorite ones, thanks f...",5,1
4,B007D6J64K,Decent case but not a lot of protection.,5,1
...,...,...,...,...
29995,B0096QI0QK,it is so easy to put on your phone and it prot...,5,1
29996,B0096QI0QK,Much better quality than I expected for the pr...,5,1
29997,B0096QI0QK,This is one of the best screen protectors I ha...,4,1
29998,B0096QI0QK,This kit included a microfiber cloth and soft ...,5,1


### remove NaN and split X and y

In [6]:
df_not_na = df[~(df['reviewText'].isna()) & ~(df['bin_y']==-1)]
text_0 = df_not_na['reviewText']
y = df_not_na['bin_y'].tolist()
text_0[5]

'This case is so cute the only problem I had with it due to the texture of the case it was hard to get in and out of my pockets'

### lowercase, remove punctuation, tokenize, lemmatization

In [7]:
import nltk
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

text = text_0.str.lower().str.replace('[^\w\s]',' ')
text = text.str.split()
# text = text.apply(lambda x: [lemmatizer.lemmatize(word) for sentence in x for word in sentence])
text = text.apply(lambda x: [lemmatizer.lemmatize(sent) for sent in x])
print(text[5])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorenzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['this', 'case', 'is', 'so', 'cute', 'the', 'only', 'problem', 'i', 'had', 'with', 'it', 'due', 'to', 'the', 'texture', 'of', 'the', 'case', 'it', 'wa', 'hard', 'to', 'get', 'in', 'and', 'out', 'of', 'my', 'pocket']


### create ngrams

NLTK stopwords can be found at [this link](https://gist.github.com/sebleier/554280), downloaded, custiomized and imported as a list

In [8]:
from gensim.models.phrases import Phrases
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.extend(['good', 'many', 'love', 'excellent', 'would'])

bigram = Phrases(text, min_count=5, threshold=1, common_terms=stop)
print(bigram[text[5]])

['this', 'case', 'is', 'so', 'cute', 'the', 'only', 'problem', 'i', 'had', 'with', 'it', 'due', 'to', 'the', 'texture', 'of', 'the', 'case', 'it', 'wa', 'hard_to_get', 'in', 'and', 'out', 'of', 'my', 'pocket']


Threshold parameter:
<img src='img/phrases_threshold.PNG' width='400'>

In [9]:
bigrams = [bigram[item] for item in text]
ngrams = [bigram[item] for item in bigrams]
print(ngrams[5])

['this', 'case', 'is', 'so', 'cute', 'the', 'only', 'problem', 'i', 'had', 'with', 'it', 'due', 'to', 'the', 'texture', 'of', 'the', 'case', 'it', 'wa', 'hard_to_get', 'in', 'and', 'out', 'of', 'my', 'pocket']


### Remove Stopwords

In [10]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.extend(['good', 'bad', 'dont', 'many', 'love', 'excellent', 'would', 'perfect', 'even', 'great'])
print(ngrams[0])
train_sentences = []
for row in ngrams:
    train_sentences.append(' '.join([item for item in row if item not in stop]))
# train_sentences = [' '.join(item) for item in ngrams]
train_sentences[0]

['probably', 'my', 'favorite', 'cover', 'super', 'sassy', 'and', 'very', 'protective', 'i', 'am', 'very', 'abusive', 'of', 'my', 'phone', 'and', 'this', 'case', 'held_up_very_well', 'after', 'a', 'year', 'the', 'color', 'started', 'to', 'wear', 'a', 'bit', 'but', 'it', 'continued', 'to', 'protect_my_phone', 'very', 'well', 'i', 'would', 'buy', 'it', 'again']


'probably favorite cover super sassy protective abusive phone case held_up_very_well year color started wear bit continued protect_my_phone well buy'

### save data to file

In [11]:
df_not_na['reviewText'] = train_sentences
df_not_na.to_csv('data/amazon_cellphones_binary.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_not_na['reviewText'] = train_sentences


### prepare BoW

Bag of Words:
<img src='img/bow.PNG' width='600'>

Term frequency - inverse document frequency:
<img src='img/tfidf.jpeg' width='400'>

In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=1000)
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=1000)
print(train_sentences[0])
X = vectorizer.fit_transform(train_sentences)
feature_names = vectorizer.get_feature_names()
X = X.toarray()
X = np.array(X)
y = np.array(y)
print(X[0])

probably favorite cover super sassy protective abusive phone case held_up_very_well year color started wear bit continued protect_my_phone well buy
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 

### create train/test split

In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

### classify data

In [14]:
from tqdm import tqdm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, recall_score

kfold = StratifiedKFold(n_splits=3, shuffle=False, random_state=1)
model = tree.DecisionTreeClassifier(max_leaf_nodes=10, max_depth=5)
# model = LogisticRegression(class_weight=None)
# model = RandomForestClassifier()

cvscores = []
cvrecall = []

for train, validation in tqdm(kfold.split(x_train, y_train)):
    model.fit(x_train[train],y_train[train])
    predicted = model.predict(x_train[validation])
    scores = accuracy_score(predicted, y_train[validation])
    recall = recall_score(predicted, y_train[validation])
    cvrecall.append(recall)
    cvscores.append(scores * 100)

print("accuracy: ",cvscores)
print("recall: ",cvrecall)

3it [00:02,  1.40it/s]

accuracy:  [86.43039591315454, 86.74968071519795, 86.79546543190165]
recall:  [0.8726345236136251, 0.8758882829284416, 0.8769256253105847]





In [15]:
from sklearn.metrics import classification_report

model.fit(x_train, y_train)
predicted = model.predict(x_test)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.63      0.13      0.22      1138
           1       0.87      0.99      0.93      6916

    accuracy                           0.87      8054
   macro avg       0.75      0.56      0.57      8054
weighted avg       0.84      0.87      0.83      8054



### cross validated grid search

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

#cross_validated_grid_search for Random Forest
model = RandomForestClassifier(class_weight='balanced')
param_grid = {'n_estimators': [10, 100],
               'criterion': ['gini', 'entropy'],
               'max_depth': [5, 10],
               'min_samples_split': [2, 10, 20]}

#cross_validated_grid_search for SVC
# model = svm.SVC()
# param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]}]

grid = GridSearchCV(estimator = model, param_grid = param_grid, cv=3, verbose=2, n_jobs=-1, scoring='f1_weighted')
# Fit the random search model
%time grid_result = grid.fit(x_train, y_train)

#print grid search results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   37.3s finished


CPU times: user 4.13 s, sys: 471 ms, total: 4.6 s
Wall time: 41.4 s
Best: 0.833146 using {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
0.785882 (0.016621) with: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 10}
0.818670 (0.006715) with: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100}
0.791339 (0.015944) with: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 10}
0.814290 (0.009004) with: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 100}
0.795607 (0.008950) with: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 20, 'n_estimators': 10}
0.821525 (0.006836) with: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 20, 'n_estimators': 100}
0.804787 (0.005259) with: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 10}
0.829279 (0.006568) with: {'criterion': 'gini', 'max_depth'

In [18]:
best_model = grid.best_estimator_
predicted = best_model.predict(x_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.39      0.53      0.45      1138
           1       0.92      0.86      0.89      6916

    accuracy                           0.82      8054
   macro avg       0.65      0.70      0.67      8054
weighted avg       0.84      0.82      0.83      8054



### Display classification tree

In [19]:
estimator = best_model.estimators_[5]

In [24]:
from sklearn.tree import export_graphviz

export_graphviz(estimator,
                proportion=True,
                out_file="tree.dot",
                feature_names=feature_names,
                class_names=['negative', 'positive'],
                filled=True,
                rounded=True,
                max_depth=10)

In [25]:
!dot -Tpng tree.dot -o tree.png

zsh:1: command not found: dot


In [26]:
!dot -Tpng tree.dot -o tree.png