In [None]:
%matplotlib inline

In [None]:
import json

def load_file(file_path):
    """
    Load a JSON file of review records and flatten it with ``convert_data``.

    :param file_path: path to the json file; the file must contain one JSON
        document that ``convert_data`` can iterate over (a collection of
        review objects)
    :return: a list with one entry per review, each entry being the list
        [reviewerID, asin, helpful, reviewText, overall, summary,
        unixReviewTime, reviewTime]
    """
    # Be explicit about the encoding: without it open() falls back to the
    # platform default, which is not UTF-8 everywhere.
    with open(file_path, encoding="utf-8") as json_file:
        raw_data = json.load(json_file)
    # The file handle is no longer needed once the payload has been parsed.
    return convert_data(raw_data)


def convert_data(raw_data):
    """
    Flatten review records into plain lists with a fixed field order.

    :param raw_data: iterable of mappings; every mapping must provide all of
        the keys named in ``field_order`` (a missing key raises ``KeyError``)
    :return: a list with one inner list per record, holding the values in the
        order given by ``field_order``
    """
    field_order = (
        "reviewerID",
        "asin",
        "helpful",
        "reviewText",
        "overall",
        "summary",
        "unixReviewTime",
        "reviewTime",
    )
    return [[record[field] for field in field_order] for record in raw_data]

In [None]:
import pandas as pd
# Column names, in the same order in which convert_data emits the fields.
review_columns = [
    'reviewerID', 'asin', 'helpful', 'reviewText',
    'overall', 'summary', 'unixReviewTime', 'reviewTime',
]

# Parse the review file into a list of rows, then wrap it in a DataFrame.
data = load_file("ElectronicsReviews1.json")
df = pd.DataFrame(data, columns=review_columns)

df.head()

Unnamed: 0,reviewerID,asin,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2NYK9KWFMJV4Y,B00KIMX4EY,"[0, 0]",I received my transmitted as a review sample f...,5.0,Easy installation and superb performance,1402012800,"06 6, 2014"
1,A1LAGBKLFBEVXL,B00KIMX4EY,"[1, 1]","I have a 3.5mm auxillary jack in my car, but i...",5.0,Excellent device with good audio quality,1402358400,"06 10, 2014"
2,A240FRPD4MEXND,B00KIMX4EY,"[0, 0]",I do not pretend to understand just how this a...,5.0,An Amazing Transmitter,1406073600,"07 23, 2014"
3,A1ZRHBODI7I015,B00KIMX4EY,"[0, 0]",Great product that saved me hundreds of dollar...,5.0,A Wallet-Saver,1405123200,"07 12, 2014"
4,A8NIGRJORA9KT,B00KIMX4EY,"[1, 2]",Went on a Road Trip back east.Drove back used ...,1.0,Broke one week after i recived it,1403136000,"06 19, 2014"


In [None]:
# 'helpful' holds Python lists (e.g. [0, 0]); lists are unhashable, so
# nunique() cannot count them. Leave that column out before counting the
# distinct values of every other column per star rating.
df.drop(columns=['helpful']).groupby('overall').nunique().plot(kind='bar')

TypeError: ignored

In [None]:
# Show the full text of one review (row label 3) as a sample.
df.loc[3, 'reviewText']

"Great product that saved me hundreds of dollars.  My wife is driving cross country in a car and didn't want to carry hundreds of CDs so she wanted me to replace the radio with a bluetooth-enabled radio.  Um... no.  Saw this, bought it, use it and it works extremely well.  Good sound, no static.  Even the phone hook up works.  If there's any problems on the cross-country trek, I'll update this, but right now it's working exceptionally well."

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

# First cleaning pass: normalise whitespace, drop double quotes, lowercase.
df['Text_Parsed_1'] = (
    df['reviewText']
    # carriage returns and line feeds become plain spaces
    .str.replace(r"[\r\n]", " ", regex=True)
    # collapse every run of spaces to one; replacing "  " once would leave
    # runs of three or more spaces only partly collapsed
    .str.replace(r" {2,}", " ", regex=True)
    .str.replace('"', '', regex=False)
    .str.lower()
)

df.loc[3, 'Text_Parsed_1']

"great product that saved me hundreds of dollars. my wife is driving cross country in a car and didn't want to carry hundreds of cds so she wanted me to replace the radio with a bluetooth-enabled radio. um... no. saw this, bought it, use it and it works extremely well. good sound, no static. even the phone hook up works. if there's any problems on the cross-country trek, i'll update this, but right now it's working exceptionally well."

In [None]:
punctuation_signs = list("?:!.,;")

# Strip all punctuation signs in a single pass. The signs are escaped and the
# regex flag is passed explicitly: "?" and "." are regex metacharacters, and
# the default of `regex` in Series.str.replace differs between pandas versions.
punctuation_pattern = "[" + re.escape("".join(punctuation_signs)) + "]"
df['Text_Parsed_2'] = df['Text_Parsed_1'].str.replace(punctuation_pattern, '', regex=True)

# Drop every literal "'s" (possessives, and contractions such as "it's").
df['Text_Parsed_2'] = df['Text_Parsed_2'].str.replace("'s", "", regex=False)

df.loc[3, 'Text_Parsed_2']

"great product that saved me hundreds of dollars my wife is driving cross country in a car and didn't want to carry hundreds of cds so she wanted me to replace the radio with a bluetooth-enabled radio um no saw this bought it use it and it works extremely well good sound no static even the phone hook up works if there any problems on the cross-country trek i'll update this but right now it working exceptionally well"

In [None]:
# Downloading punkt (splits text into sentences) and wordnet from NLTK.
# The print call only draws a separator line between the two download logs.
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
------------------------------------------------------------
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Create one WordNetLemmatizer instance and keep it in a variable so the
# lemmatization loop can reuse it for every word.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Preview the first rows, now including the Text_Parsed_1 / Text_Parsed_2 columns.
df.head()

Unnamed: 0,reviewerID,asin,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Text_Parsed_1,Text_Parsed_2
0,A2NYK9KWFMJV4Y,B00KIMX4EY,"[0, 0]",I received my transmitted as a review sample f...,5.0,Easy installation and superb performance,1402012800,"06 6, 2014",i received my transmitted as a review sample f...,i received my transmitted as a review sample f...
1,A1LAGBKLFBEVXL,B00KIMX4EY,"[1, 1]","I have a 3.5mm auxillary jack in my car, but i...",5.0,Excellent device with good audio quality,1402358400,"06 10, 2014","i have a 3.5mm auxillary jack in my car, but i...",i have a 35mm auxillary jack in my car but it ...
2,A240FRPD4MEXND,B00KIMX4EY,"[0, 0]",I do not pretend to understand just how this a...,5.0,An Amazing Transmitter,1406073600,"07 23, 2014",i do not pretend to understand just how this a...,i do not pretend to understand just how this a...
3,A1ZRHBODI7I015,B00KIMX4EY,"[0, 0]",Great product that saved me hundreds of dollar...,5.0,A Wallet-Saver,1405123200,"07 12, 2014",great product that saved me hundreds of dollar...,great product that saved me hundreds of dollar...
4,A8NIGRJORA9KT,B00KIMX4EY,"[1, 2]",Went on a Road Trip back east.Drove back used ...,1.0,Broke one week after i recived it,1403136000,"06 19, 2014",went on a road trip back east.drove back used ...,went on a road trip back eastdrove back used t...


In [None]:
nrows = len(df)

# Lemmatize every word of every review as a verb (pos="v") and join the
# words back with single spaces.
# Iterating over the column values directly avoids looking rows up by label
# with positional numbers (df.loc[row]), which only works while the index is
# exactly 0..n-1, and avoids building a full row Series for every review.
lemmatized_text_list = [
    " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v")
        for word in text.split(" ")
    )
    for text in df['Text_Parsed_2']
]

In [None]:
# Store the lemmatized texts, then print one review before and after.
df['Text_Parsed_3'] = lemmatized_text_list

for column in ('reviewText', 'Text_Parsed_3'):
    print(df.loc[3, column])

Great product that saved me hundreds of dollars.  My wife is driving cross country in a car and didn't want to carry hundreds of CDs so she wanted me to replace the radio with a bluetooth-enabled radio.  Um... no.  Saw this, bought it, use it and it works extremely well.  Good sound, no static.  Even the phone hook up works.  If there's any problems on the cross-country trek, I'll update this, but right now it's working exceptionally well.
great product that save me hundreds of dollars my wife be drive cross country in a car and didn't want to carry hundreds of cds so she want me to replace the radio with a bluetooth-enabled radio um no saw this buy it use it and it work extremely well good sound no static even the phone hook up work if there any problems on the cross-country trek i'll update this but right now it work exceptionally well


In [None]:
# Downloading the stop words list from NLTK (needed before
# stopwords.words(...) can be read).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words from the NLTK corpus, copied into a fresh list
stop_words = [word for word in stopwords.words('english')]

# Show the first ten entries as a sanity check
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
example = "me eating a meal"
word = "me"

# The regular expression is: word boundary + stop word + word boundary.
# In a raw string, \b is the regex word-boundary anchor (not a backspace), so
# "me" is matched only as a whole word and the "me" inside "meal" is left alone.
# re.escape keeps the pattern correct for words containing regex metacharacters.
regex = r"\b" + re.escape(word) + r"\b"

re.sub(regex, "StopWord", example)

'StopWord eating a meal'

In [None]:
# Remove all stop words in one pass with a single alternation pattern.
# - regex=True is explicit: with a literal (non-regex) replace the "\b"
#   anchors would never match and no stop word would be removed.
# - Longer stop words are tried first, so "didn't" is removed as a whole
#   instead of "didn" and "t" separately, which left a stray apostrophe.
# - Every word is escaped so it is always matched literally.
stopword_pattern = (
    r"\b(?:"
    + "|".join(re.escape(word) for word in sorted(stop_words, key=len, reverse=True))
    + r")\b"
)
df['Text_Parsed_4'] = df['Text_Parsed_3'].str.replace(stopword_pattern, '', regex=True)

print(df.loc[3, 'reviewText'])
print(df.loc[3, 'Text_Parsed_4'])

Great product that saved me hundreds of dollars.  My wife is driving cross country in a car and didn't want to carry hundreds of CDs so she wanted me to replace the radio with a bluetooth-enabled radio.  Um... no.  Saw this, bought it, use it and it works extremely well.  Good sound, no static.  Even the phone hook up works.  If there's any problems on the cross-country trek, I'll update this, but right now it's working exceptionally well.
great product  save  hundreds  dollars  wife  drive cross country   car  ' want  carry hundreds  cds   want   replace  radio   bluetooth-enabled radio um  saw  buy  use    work extremely well good sound  static even  phone hook  work    problems   cross-country trek ' update   right   work exceptionally well


In [None]:
# Show a single row to check all parsing stages side by side.
df.head(1)

Unnamed: 0,reviewerID,asin,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Text_Parsed_1,Text_Parsed_2,Text_Parsed_3,Text_Parsed_4
0,A2NYK9KWFMJV4Y,B00KIMX4EY,"[0, 0]",I received my transmitted as a review sample f...,5.0,Easy installation and superb performance,1402012800,"06 6, 2014",i received my transmitted as a review sample f...,i received my transmitted as a review sample f...,i receive my transmit as a review sample from ...,receive transmit review sample manufactu...


In [None]:
# remove the intermediate columns
#
# NOTE(review): this also drops 'overall' (the star rating), so the only
# columns left for the later train/test split are the review text and the
# free-text 'summary' — confirm that is intended.

list_columns = ["reviewText", "summary", "Text_Parsed_4"]
column_renames = {'Text_Parsed_4': 'Text_Parsed'}

# Rename first, then select by the final names. rename() ignores labels that
# are not present, so running this cell a second time is a no-op instead of
# failing with a KeyError on the already-renamed 'Text_Parsed_4'.
df = df.rename(columns=column_renames)
df = df[[column_renames.get(name, name) for name in list_columns]]

df.head(1)

Unnamed: 0,reviewText,summary,Text_Parsed
0,I received my transmitted as a review sample f...,Easy installation and superb performance,receive transmit review sample manufactu...


In [None]:
# train - test split
#
# Features are the cleaned review texts; the target passed here is the
# 'summary' column. random_state fixes the shuffle so the split is repeatable.
# NOTE(review): test_size=0.75 holds out three quarters of the rows for
# testing and trains on the remaining quarter — confirm this is not a
# swapped 0.25.
# NOTE(review): 'summary' is free text written per review, so it presumably
# has almost as many distinct values as rows; confirm it is really the
# intended classification target.

X_train, X_test, y_train, y_test = train_test_split(df['Text_Parsed'], 
                                                    df['summary'], 
                                                    test_size=0.75, 
                                                    random_state=42)

In [None]:
# Text representation: turn the cleaned reviews into dense TF-IDF matrices

# Vectorizer settings
ngram_range, min_df, max_df, max_features = (1, 2), 10, 1., 300

tfidf = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,
    lowercase=False,
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that same fitted vectorizer.
features_train = tfidf.fit_transform(X_train).toarray()
features_test = tfidf.transform(X_test).toarray()
labels_train, labels_test = y_train, y_test

# .shape is (rows, columns) for the matrices and (rows,) for the labels
print(features_train.shape)
print(features_test.shape)
print(labels_test.shape)

print(features_train)
print(labels_train)

(80, 169)
(242, 169)
(242,)
[[0.         0.         0.         ... 0.         0.1728901  0.11695927]
 [0.         0.37122171 0.         ... 0.09686342 0.         0.12022478]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.1534752  0.         0.        ]
 [0.26210004 0.         0.         ... 0.14285565 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
300                       To use a technical term...Wow!
128    Great addition to my new Asus Laptop and trave...
312                                      Simply stellar!
8                         Would be great if it worked...
70                                      Oodles of power!
                             ...                        
188                                         A good value
71     Great quality four port Smartphone or tablet c...
106                    Seems locked to Belkin NetCam app
270           U

In [None]:
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
#
# The attribution above belongs to the example this cell was copied from.
# Here the cell clusters the TF-IDF test features with K-means and draws
# three of the feature columns in 3D, coloured by cluster.
#
# Removed from the copied code because it cannot work on this data:
# - print(__doc__), which only echoed the interactive module docstring;
# - an unused open() of an absolute local path that was never read or closed
#   and fails when the file is absent;
# - the "ground truth" figure, which indexed by integer class labels 0/1/2
#   and by a lowercase `y` that is not defined in the visible notebook.

import numpy as np
import matplotlib.pyplot as plt
# Though the following import is not directly being used, it is required
# for 3D projection to work
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import style


style.use("ggplot")

from sklearn.cluster import KMeans
from sklearn import datasets


# Seed NumPy's global random state before the estimators are fitted.
np.random.seed(5)

X = features_test
Y = labels_test
print(X)
print(Y)

estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
              ('k_means_iris_3', KMeans(n_clusters=3)),
              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1,
                                               init='random'))]
titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization']

# One figure per estimator; figure numbers start at 1.
for fignum, ((name, est), title) in enumerate(zip(estimators, titles), start=1):
    fig = plt.figure(fignum, figsize=(4, 3))
    # Create the 3D axes through the figure so they are registered with it,
    # then restore the position and camera angles the example used.
    ax = fig.add_subplot(111, projection='3d')
    ax.set_position([0, 0, .95, 1])
    ax.view_init(elev=48, azim=134)

    est.fit(X)
    labels = est.labels_

    # Columns 3, 0 and 2 of X go on the x, y and z axes.
    # The builtin float replaces the removed np.float alias.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2],
               c=labels.astype(float), edgecolor='k')

    # Hide tick labels; xaxis/yaxis/zaxis replace the w_*axis aliases.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('TF-IDF feature 3')
    ax.set_ylabel('TF-IDF feature 0')
    ax.set_zlabel('TF-IDF feature 2')
    ax.set_title(title)

plt.show()

Automatically created module for IPython interactive environment


FileNotFoundError: ignored

In [None]:
# let's train a SVM

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt

# Build an SVC with library defaults (only the random seed is set) and list
# its parameters, as a reference point before tuning.
svc_0 =svm.SVC(random_state=42)

print('Parameters currently in use:\n')
pprint(svc_0.get_params())

Parameters currently in use:

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}


In [None]:
# Candidate values for the randomized hyperparameter search, keyed by the
# SVC parameter they are tried for.
# probability: Linear SVC is not a probabilistic classifier by default but it
# has a built-in calibration option enabled in this example
random_grid = {
    'C': [.0001, .001, .01, .1, 1],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [.0001, .001, .01, .1, 1, 10, 100],
    'degree': [1, 2, 3, 4, 5],
    'probability': [True],
}

# Each candidate list is also available under its own name; the names refer
# to the very same list objects stored in the grid.
C = random_grid['C']
kernel = random_grid['kernel']
gamma = random_grid['gamma']
degree = random_grid['degree']
probability = random_grid['probability']

pprint(random_grid)

{'C': [0.0001, 0.001, 0.01, 0.1, 1],
 'degree': [1, 2, 3, 4, 5],
 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
 'kernel': ['linear', 'rbf', 'poly'],
 'probability': [True]}


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
# First create the base model to tune
svc = svm.SVC(random_state=42)

# Definition of the random search: 50 parameter combinations drawn from
# random_grid, each scored by accuracy with 2-fold cross-validation.
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=2, 
                                   verbose=1, 
                                   random_state=42)

# Fit the random search model
# NOTE(review): the recorded run of this cell ended in a ValueError.
# labels_train is the free-text 'summary' column; presumably most classes have
# a single member, which 2-fold cross-validation of a classifier cannot
# split — confirm against the full traceback.
random_search.fit(features_train, labels_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


ValueError: ignored

In [None]:
# Report the best parameter combination found by the random search and its
# cross-validated score. These attributes exist only after a successful fit.
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)

In [None]:
# Narrowed candidate values for the exhaustive grid search
C = [.001, .01, .1]
degree = [3, 4, 5]
gamma = [.1, 1, 10]
probability = [True]

# One sub-grid per kernel, so each kernel is combined only with the
# parameters listed for it.
param_grid = [
    dict(C=C, kernel=['linear'], probability=probability),
    dict(C=C, kernel=['poly'], degree=degree, probability=probability),
    dict(C=C, kernel=['rbf'], gamma=gamma, probability=probability),
]

# Base model to tune
svc = svm.SVC(random_state=42)

# The seed is fixed on the splitter itself: a single shuffled split that
# holds out 33% of the training rows for validation.
cv_sets = ShuffleSplit(n_splits=1, test_size=.33, random_state=42)

# Grid search scored by accuracy on that split
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

Fitting 1 folds for each of 21 candidates, totalling 21 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:    0.7s finished


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=42, test_size=0.33, train_size=None),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.001, 0.01, 0.1], 'gamma': [0.1, 1, 10],
                          'kernel': ['rbf'], 'probability': [True]}],
             pre_dispatch='2*n

In [None]:
# Report the best parameter combination found by the grid search and its
# score. The grid search uses a single validation split (n_splits=1), so the
# "mean" is the accuracy on that one split.
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Grid Search are:
{'C': 0.001, 'kernel': 'linear', 'probability': True}

The mean accuracy of a model with these hyperparameters is:
0.0


In [None]:
# random search was better
# NOTE(review): the outputs recorded in this notebook do not support the line
# above — the random search fit ended in a ValueError and this cell in an
# AttributeError (best_estimator_ only exists after a successful fit).
# Confirm which search should supply the final model.

best_svc = random_search.best_estimator_
best_svc

AttributeError: ignored

In [None]:
# fit the model
# NOTE(review): the search object is built without a `refit` argument, so the
# library default applies; if that default already refits the best estimator
# on the full training data, this extra fit is redundant — confirm.
best_svc.fit(features_train, labels_train)

NameError: ignored

In [None]:
# Predict labels for the held-out test features; reused by the metric cells.
svc_pred = best_svc.predict(features_test)

In [None]:
from sklearn.metrics import f1_score

# Training accuracy
print("The training accuracy and F1_score are : ")
# Predict once and score the same predictions twice.
train_pred = best_svc.predict(features_train)
print(accuracy_score(labels_train, train_pred))
# The target has more than two classes, so an averaging mode must be given;
# the default average='binary' is rejected for multiclass targets.
# 'weighted' averages the per-class F1 scores weighted by class support.
print(f1_score(labels_train, train_pred, average='weighted'))

In [None]:
# Test accuracy
print("The test accuracy and f1_score are: ")
print(accuracy_score(labels_test, svc_pred))
# Reuse svc_pred instead of predicting again. The target has more than two
# classes, so an averaging mode must be given; the default average='binary'
# is rejected for multiclass targets.
print(f1_score(labels_test, svc_pred, average='weighted'))

In [None]:
# Classification report: per-class metrics for the test-set predictions
# stored in svc_pred.
print("Classification report")
print(classification_report(labels_test,svc_pred))

In [None]:
import seaborn as sns

# Confusion matrix
# Take the class names from the labels that actually occur in the test
# targets or predictions; df has no 'Label' / 'Label_Code' columns after the
# column selection earlier in the notebook. Passing the same list as `labels`
# keeps the matrix rows/columns and the tick labels in the same order.
class_labels = sorted(set(labels_test) | set(svc_pred))
conf_matrix = confusion_matrix(labels_test, svc_pred, labels=class_labels)

plt.figure(figsize=(12.8,6))

sns.heatmap(conf_matrix,
            annot=True,
            xticklabels=class_labels,
            yticklabels=class_labels,
            cmap="Blues")

# confusion_matrix puts the true classes on the rows (heatmap y axis) and the
# predicted classes on the columns (heatmap x axis).
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.show()

In [None]:
# Let's see if the hyperparameter tuning process has returned a better model:
# fit an SVC with default hyperparameters and score it on the same test set.
# NOTE(review): this uses random_state=8 while the other SVCs in the notebook
# use 42 — confirm the difference is intended.

base_model = svm.SVC(random_state = 8)
base_model.fit(features_train, labels_train)

# Predict once and score the same predictions twice.
base_pred = base_model.predict(features_test)
print(accuracy_score(labels_test, base_pred))
# The target has more than two classes, so an averaging mode must be given;
# the default average='binary' is rejected for multiclass targets.
print(f1_score(labels_test, base_pred, average='weighted'))

In [None]:
# Refit the tuned model and score it on the test set for comparison with the
# default-parameter model.
best_svc.fit(features_train, labels_train)
# Predict once and score the same predictions twice.
best_pred = best_svc.predict(features_test)
print(accuracy_score(labels_test, best_pred))
# The target has more than two classes, so an averaging mode must be given;
# the default average='binary' is rejected for multiclass targets.
print(f1_score(labels_test, best_pred, average='weighted'))

In [None]:
# Parameters of the default-parameter model, for side-by-side comparison.
pprint(base_model.get_params())

In [None]:
# Parameters of the tuned model, for side-by-side comparison.
pprint(best_svc.get_params())