In [2]:
def parse(path):
  """Yield one decoded JSON value per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document per line.

  Yields:
    The result of ``json.loads`` for each non-blank line, in file order.
  """
  # The context manager closes the handle when the generator is exhausted or
  # closed; previously the file was opened and never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # json.loads rejects empty input, so skip whitespace-only lines.
      if l.strip():
        yield json.loads(l)

In [3]:
import pandas as pd
import gzip
import json

def parse(path):
  """Yield one decoded JSON value per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document per line.

  Yields:
    The result of ``json.loads`` for each non-blank line, in file order.
  """
  # The context manager closes the handle when the generator is exhausted or
  # closed; previously the file was opened and never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # json.loads rejects empty input, so skip whitespace-only lines.
      if l.strip():
        yield json.loads(l)

def getDF(path):
  """Collect every record produced by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream, and that
  position becomes the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Relative path of the gzip-compressed reviews file read by getDF.
REVIEWS_PATH = 'Cell_Phones_and_Accessories_5.json.gz'
df = getDF(REVIEWS_PATH)

# Restricting the dataset to the first 100,000 reviews

In [4]:
# Number of leading rows of the loaded frame to keep.
N_REVIEWS = 100000
df100k = df.head(N_REVIEWS)

In [5]:
# Use a shorter name for the review text column.
df100k = df100k.rename(columns={'reviewText': 'review'})

## Labeling reviews based on 'overall' (rating) 

In [6]:
def _rating_to_label(rating):
    """Map a rating to 1 (above 3), 0 (below 3) or 'neut' (anything else)."""
    if rating > 3:
        return 1
    if rating < 3:
        return 0
    return 'neut'

df100k['label'] = df100k['overall'].apply(_rating_to_label)

In [7]:
# Columns that play no further part once the label has been derived.
unused_columns = [
    'overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
    'reviewerName', 'summary', 'unixReviewTime', 'vote', 'image',
]
df100k = df100k.drop(columns=unused_columns)

## Removing neutral reviews 

In [8]:
# Drop the rows tagged 'neut'. The label column was built from a mix of ints
# and the string 'neut', so once only 0/1 remain it is cast to an integer
# dtype explicitly instead of relying on a later step to infer it.
df100k = df100k[df100k['label'] != 'neut']
df100k = df100k.astype({'label': int})
df100k

Unnamed: 0,review,label
0,Looks even better in person. Be careful to not...,1
1,When you don't want to spend a whole lot of ca...,1
3,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,0
4,"I liked it because it was cute, but the studs ...",1
5,The product looked exactly like the picture an...,0
...,...,...
99995,"Great little hard cover for my son's ""dumb"" ph...",1
99996,This is a good case that holds my phone togeth...,1
99997,"I initially thought it wasn't charging, since ...",1
99998,Charger is great quality and charges both rath...,1


### Checking for NaN values 

In [9]:
# True when at least one review text is missing.
df100k['review'].isna().any()

True

In [10]:
import numpy as np
# Replace NaN with an empty string across the whole frame (every column, not
# only `review`).
# NOTE(review): regex=True looks unnecessary for a NaN -> '' replacement —
# confirm it has no other effect in the installed pandas version before removing.
df100k = df100k.replace(np.nan, '', regex=True)
# Re-run the missing-value check on `review` after the replacement.
df100k['review'].isnull().values.any()

False

In [11]:
# Number of rows per label value.
label_counts = df100k['label'].value_counts()
label_counts

1    74567
0    14849
Name: label, dtype: int64

## Distributing sentiments equally  

In [12]:
def get_top_data(top_n, data=None):
    """Return up to ``top_n`` leading rows of each sentiment class.

    Args:
        top_n: Maximum number of rows to take per class.
        data: Frame with a ``label`` column holding 1 (positive) and
            0 (negative). Defaults to the notebook-level ``df100k``.

    Returns:
        A new DataFrame with the positive rows first, followed by the
        negative rows. A class with fewer than ``top_n`` rows contributes all
        of its rows, so the two classes are not guaranteed to be equal in size.
    """
    if data is None:
        data = df100k
    labels = data['label']
    # head() keeps the existing row order; nothing is shuffled or sampled.
    top_data_df_positive = data[labels == 1].head(top_n)
    top_data_df_negative = data[labels == 0].head(top_n)
    return pd.concat([top_data_df_positive, top_data_df_negative])

# Function call to get the top 10000 from each sentiment
# Take up to 15000 leading reviews from each sentiment class.
TOP_N_PER_CLASS = 15000
top_data_df_small = get_top_data(top_n=TOP_N_PER_CLASS)

# Report the resulting class sizes and preview the selection.
print("After segregating and taking equal number of rows for each sentiment:")
print(top_data_df_small['label'].value_counts())
top_data_df_small.head(10)

After segregating and taking equal number of rows for each sentiment:
1    15000
0    14849
Name: label, dtype: int64


Unnamed: 0,review,label
0,Looks even better in person. Be careful to not...,1
1,When you don't want to spend a whole lot of ca...,1
4,"I liked it because it was cute, but the studs ...",1
7,It is a very cute case. None of the jewels hav...,1
9,I really love this case... you have to keep yo...,1
10,its super cute and makes my phone pretting and...,1
11,Another great product that my daughter she use...,1
13,I purchased this for my grand-daughters phone....,1
14,Beautiful quality and outstanding product! Eve...,1
15,It is such a good case for a low price. I have...,1


# Pre-processing

In [13]:
# Work on an independent copy so the pre-processing columns added to df30k
# do not also appear on top_data_df_small.
df30k = top_data_df_small.copy()

In [14]:
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy

In [15]:
# Load the `en_core_web_sm` pipeline with the parser and NER components disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

In [16]:
def _lowercase_words(text):
    """Split on whitespace, lower-case each piece, and re-join with single spaces."""
    return " ".join(word.lower() for word in text.split())

df30k['new_reviews'] = df30k['review'].apply(_lowercase_words)

In [17]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: the default of Series.str.replace became
# literal matching in pandas 2.0, which would leave the text untouched.
# The raw string keeps \w and \s from being read as string escapes.
df30k['new_reviews'] = df30k['new_reviews'].str.replace(r'[^\w\s]', '', regex=True)

  df30k['new_reviews'] = df30k['new_reviews'].str.replace('[^\w\s]','')


In [18]:
def space(comment):
    """Return ``comment`` with every token replaced by its lemma, joined by spaces."""
    lemmas = (tok.lemma_ for tok in nlp(comment))
    return " ".join(lemmas)
# Lemmatise each cleaned review, then preview the frame.
lemmatized_reviews = df30k['new_reviews'].apply(space)
df30k['new_reviews'] = lemmatized_reviews
df30k.head(20)

Unnamed: 0,review,label,new_reviews
0,Looks even better in person. Be careful to not...,1,look even well in person be careful to not dro...
1,When you don't want to spend a whole lot of ca...,1,when you do nt want to spend a whole lot of ca...
4,"I liked it because it was cute, but the studs ...",1,I like it because it be cute but the stud fall...
7,It is a very cute case. None of the jewels hav...,1,it be a very cute case none of the jewel have ...
9,I really love this case... you have to keep yo...,1,I really love this case you have to keep your ...
10,its super cute and makes my phone pretting and...,1,its super cute and make my phone prette and I ...
11,Another great product that my daughter she use...,1,another great product that my daughter she use...
13,I purchased this for my grand-daughters phone....,1,I purchase this for my granddaughter phone she...
14,Beautiful quality and outstanding product! Eve...,1,beautiful quality and outstanding product ever...
15,It is such a good case for a low price. I have...,1,it be such a good case for a low price I have ...


In [19]:
# Discard the raw text and let the processed text take over the `review` name.
df30k = (
    df30k
    .drop(columns=['review'])
    .rename(columns={'new_reviews': 'review'})
)
df30k

Unnamed: 0,label,review
0,1,look even well in person be careful to not dro...
1,1,when you do nt want to spend a whole lot of ca...
4,1,I like it because it be cute but the stud fall...
7,1,it be a very cute case none of the jewel have ...
9,1,I really love this case you have to keep your ...
...,...,...
99979,0,as other have mention the rubber piece that ho...
99983,0,refuse to charge but amazom replace it right a...
99988,0,this review be for the blueant wireless t1 blu...
99991,0,I purchase this because it play music and that...


# Splitting into train and test data 

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df30k['review'],
    df30k['label'],
    test_size=0.2,
    random_state=32,
)

# Vectorizing the reviews 

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF settings, kept together so they are easy to tune.
tfidf_params = dict(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)
vectorizer = TfidfVectorizer(**tfidf_params)
# Fit on the training text only, then reuse the fitted vectorizer on the test text.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

# Training the model 

In [22]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Fit a linear-kernel SVM and time training and prediction separately.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
classifier_linear = svm.SVC(kernel='linear')
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, y_train)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))

Training time: 99.085213s; Prediction time: 17.579832s


# Evaluating the model on the test data and printing the classification report

In [23]:
# Text report comparing the test labels with the SVM predictions.
report = classification_report(y_test, prediction_linear)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.89      0.89      2951
           1       0.90      0.88      0.89      3019

    accuracy                           0.89      5970
   macro avg       0.89      0.89      0.89      5970
weighted avg       0.89      0.89      0.89      5970

