In [89]:
#All credits to https://github.com/jorpro/workshops/blob/patch-1/NLP/NLP_Demo.ipynb
import json
import pandas as pd
import numpy as np
import nltk.data
import re
from nltk.corpus import stopwords
import re

In [227]:
#Parse data downloaded from http://jmcauley.ucsd.edu/data/amazon/
#import 'reviews_Cell_Phones_and_Accessories_5.json.gz'

import pandas as pd 
import gzip 

def parse(path):
    """Yield one review record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python/JSON-style dict literal.

    Parameters
    ----------
    path : str
        Path to the ``.json.gz`` review dump.

    Yields
    ------
    object
        The literal parsed from each line (a dict for the review dumps).

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    # Text mode + context manager: the handle is closed once the generator
    # finishes, and ast.literal_eval receives the str it requires.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            if not l.strip():
                continue  # tolerate blank lines instead of failing on them
            # literal_eval accepts only literals, so a crafted line in the
            # downloaded file cannot execute code the way eval() would.
            yield ast.literal_eval(l)
        
def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in file order; that number becomes the
    row label and the record's keys become the columns.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

# Read the whole review dump into a DataFrame (one row per record yielded by parse).
df = getDF('reviews_Cell_Phones_and_Accessories_5.json.gz')

In [228]:
# Keep just the review body and its rating; every other column is dropped.
df = df.loc[:, ['reviewText', 'overall']]
df.head()

Unnamed: 0,reviewText,overall
0,They look good and stick good! I just don't li...,4.0
1,These stickers work like the review says they ...,5.0
2,These are awesome and make my phone look so st...,5.0
3,Item arrived in great time and was in perfect ...,4.0
4,"awesome! stays on, and looks great. can be use...",5.0


In [229]:
#Set reviews of rating 1 as negative (0), 5 as positive (1); all other ratings are dropped.
from sklearn.utils import shuffle

RANDOM_SEED = 42  # fixed so the sampled and shuffled rows are identical on every run

# .copy() yields an independent frame, so adding the 'sentiment' column below
# does not write into a filtered slice (pandas' SettingWithCopyWarning).
df = df[(df['overall'] == 5.0) | (df['overall'] == 1.0)].copy()
df['sentiment'] = 0 + (df['overall'] > 3.0)
negative_reviews = df[df['sentiment'] == 0]
positive_reviews = df[df['sentiment'] == 1]
# Balance the classes: keep a random sample of positives with as many rows
# as there are negatives.
positive_reviews = shuffle(positive_reviews, random_state=RANDOM_SEED)
positive_reviews = positive_reviews[0:negative_reviews.shape[0]]
reviews = [positive_reviews,negative_reviews]
df = pd.concat(reviews)
# Interleave the two classes so later positional train/validation slices contain both.
df = shuffle(df, random_state=RANDOM_SEED)

In [230]:
# (rows, columns) of the balanced frame: reviewText, overall, sentiment.
df.shape

(26558, 3)

In [231]:
#Default English stopwords (Credit: http://www.ranks.nl/stopwords)
#Words with negative implications are deliberately left out of this set so that they
#survive cleaning, e.g. 'against','mustn','needn','shan','shouldn','wasn','weren','won','wouldn'
#('no' and 'not' are likewise absent).
stop_words = {'i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he',
              'him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs',
              'themselves','what','which','who','whom','this','that', 'these','those','am','is','are','was', 'were',
              'be','been','being', 'has','had','having','do','does','did', 'doing','a','an','the','and','but',
              'if','or','because','as','until','while','of','at','by','for','with','about','between','into','through',
              'during','before','after','to','from','in','on', 'again','further','then','once','here','there','when',
              'where','why', 'how','all','any','both','each','other','some','such','own','same','too','s','t','can',
              'will','just','now','d','ll','m','o','re','ve','y','ma'}

In [232]:
#Credit: http://www.nltk.org/howto/stem.html
#import snowball stemmer
# A single English Snowball stemmer instance, used by clean_str to stem every kept word.
# NOTE(review): ignore_stopwords=True presumably leaves NLTK's own stopword list unstemmed
# and therefore needs the NLTK "stopwords" corpus to be installed — confirm.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english",ignore_stopwords=True)

In [233]:
#Clean reviews
#Runs of letters/digits that end in a digit are mapped to the generic token "NUMBER"
#Runs of "?" and "!" are mapped to the generic token "SYMBOL"; all other punctuation is dropped
def clean_str(string):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Every character other than letters, digits, ``( ) , ! ? '`` and the
         backtick is replaced by a space (this already removes periods).
      2. Each run of letters/digits ending in a digit becomes ``NUMBER``.
      3. Each run of ``?`` / ``!`` becomes ``SYMBOL``.
      4. Commas, parentheses and the clitics 's 've 'm 're 'd 'll are removed;
         "n't" is not touched.
      5. The text is lower-cased and split on whitespace, words found in
         ``stop_words`` are dropped and the rest are stemmed with ``stemmer``.

    Because lower-casing happens after steps 2-3, the placeholders appear in
    the result as lower-case tokens run through the stemmer.

    Parameters
    ----------
    string : str, None or float NaN
        Raw review text. ``None`` and NaN (a missing value) are treated as
        empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing is left.

    Raises
    ------
    TypeError
        If ``string`` is any other non-string object (raised by ``re.sub``).
    """
    # A missing review would otherwise crash inside re.sub; map it to empty text.
    if string is None or (isinstance(string, float) and string != string):
        return ""
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\w*[0-9]+"," NUMBER ",string)
    string = re.sub(r"[?!]+", " SYMBOL ", string)
    # Periods were already blanked by the first substitution, so only commas are left.
    string = re.sub(r",", " ", string)
    # One pattern for all clitics: each alternative starts at an apostrophe and
    # is replaced by a space, so the result matches substituting them one by one.
    string = re.sub(r"\'(?:s|ve|m|re|d|ll)", " ", string)
    string = re.sub(r"[()]", " ", string)
    # split() with no argument already collapses runs of whitespace.
    words = string.lower().split()
    meaningful_words = [stemmer.stem(w) for w in words if w not in stop_words]
    return " ".join(meaningful_words)

In [234]:
# Run clean_str over every review text, keeping the cleaned strings in a
# list ordered like the rows of df.
clean_train_reviews = [clean_str(text) for text in df['reviewText']]

In [235]:
# Preview a handful of cleaned reviews; displaying the full list floods the notebook output.
clean_train_reviews[:5]

['no differ than one buy ebay buy number pack number number cowork droid x phone bought one fell bs market product most pay number cover packag cheesi certif authent realli need serial number certif authent dumb screen cover symbol mayb frame put desk lol',
 "tri anker batteri sinc brand work great phone differ model put batteri first time daughter phone complet scrambl phone return wireless carrier have reset factori reset wouldn't even work not know batteri tri time not scrambl phone rather continu lock up",
 "wast money case no fit galaxi number it won't snap way part go wouldn't come off wast money",
 'bought aliph jawbon ii bluetooth headset noiseasssassin black hate not work like suppos',
 'so beyond mad wait extra week see would show up late noth peic crap would not recommend buy case doesnt even show up im contact ask what up have wait case suposs ive birthday present couldnt never show up very angri dont buy product',
 "not work not synch ipod bought back up origin still use o

In [236]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Word-level bag-of-words model capped at 5000 features. The reviews were
# cleaned beforehand, so no tokenizer, preprocessor or stop-word list is set.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary, count term occurrences per review, and convert the
# result to a NumPy array for the later cells.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [None]:
# (number of reviews, number of vocabulary features) of the bag-of-words array.
train_data_features.shape

(26558, 5000)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees.
# random_state is fixed so the fitted model, and every number derived from it
# in later cells, is reproducible across runs.
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)

# Fit the forest to the training set (the first 23900 rows), using the bag of
# words as features and the sentiment labels as the response variable.
# .iloc makes the positional slice explicit: df's index is shuffled, and the
# labels must be taken by position to line up with the feature-matrix rows.
forest = forest.fit(train_data_features[0:23900,:], df["sentiment"].iloc[0:23900])

In [None]:
# Predict labels with the fitted forest for the training slice (first 23900
# rows) and for the remaining rows, which serve as the validation slice.
train_predictions = forest.predict(train_data_features[:23900])
valid_predictions = forest.predict(train_data_features[23900:])

In [None]:
# Number of predictions made for the validation slice.
valid_predictions.size

In [None]:
# Number of predictions made for the training slice.
train_predictions.size

In [None]:
# Accuracy on each slice = share of predictions equal to the true sentiment label.
train_acc = np.mean(df["sentiment"][0:23900] == train_predictions)
valid_acc = np.mean(df["sentiment"][23900:] == valid_predictions)
print("The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)

In [None]:
# Mean predicted label on the validation slice; labels are 0/1, so this is the share predicted positive.
np.mean(valid_predictions)

In [None]:
# Mean predicted label on the training slice; labels are 0/1, so this is the share predicted positive.
np.mean(train_predictions)

In [None]:
#Use "amazon_cells_labelled.txt" as test set.
# Every non-empty line has the form "<sentence>\t<integer label>".
test_texts, test_ratings = [], []
# The context manager closes the file handle as soon as reading is done.
with open("amazon_cells_labelled.txt", 'r') as text_file:
    for line in text_file:
        line = line.rstrip("\n")
        if not line:
            continue  # blank lines (or a missing final newline) must not shift text/label pairing
        # Split on the LAST tab so a tab inside the sentence cannot break the pairing.
        sentence, label = line.rsplit("\t", 1)
        test_texts.append(sentence)
        test_ratings.append(int(label))
test = pd.DataFrame({"text": test_texts, "rating": test_ratings})
test.head()

In [None]:
# Importance score of every feature according to the fitted forest; keep the
# column positions of the ten largest scores (ascending) and show those scores.
importances = forest.feature_importances_
indices = importances.argsort()[-10:]
importances[indices]

In [None]:
#Credit: http://www.agcross.com/2015/02/random-forests-in-python-with-scikit-learn/
import matplotlib.pyplot as plt

# Feature names in column order. vocabulary_ maps each term to its column
# index, so sorting the terms by that index lines them up with `importances`.
# Defining `vocab` here lets the cell run on a fresh kernel.
vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Horizontal bar chart of the ten most important terms selected in `indices`.
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), np.asarray(vocab)[indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Clean the test sentences with the same clean_str used for the training reviews.
clean_test_reviews = [clean_str(text) for text in test['text']]

# Encode them with the vocabulary already fitted on the training reviews
# (transform, not fit_transform) and convert to a NumPy array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Sentiment label predicted by the random forest for each test sentence.
result = forest.predict(test_data_features)

In [None]:
# One row per test sentence: the text, the forest's prediction and the true label.
output = pd.DataFrame({
    "text": test["text"],
    "prediction": result,
    "truth": test['rating'],
})
output.head()

In [None]:
# Share of test sentences whose predicted label equals the true rating.
test_accuracy = np.mean(test['rating'] == result)
print("The test_accuracy is: ", test_accuracy)

In [None]:
# Rows where the prediction disagrees with the true label.
wrong_predictions = output.loc[output['prediction'] != output['truth']]
wrong_predictions.head()

In [None]:
# Per-class recall on the test set. In this notebook "sensitivity" refers to the
# negative class (label 0) and "specifity" to the positive class (label 1).
# Percentage of negative review identified as negative
sensitivity = (output[(output['prediction'] == 0) & (output['truth'] == 0)].shape[0]) / (output[output['truth'] == 0].shape[0])
# Percentage of positive review identified as positive
# The denominator is the number of truly positive reviews (truth == 1), so the
# ratio is a proper fraction of the positive class.
specifity = output[(output['prediction'] == 1) & (output['truth'] == 1)].shape[0] / output[output['truth'] == 1].shape[0]
print("The sensitivity is: ", sensitivity, "\n", "The specifity is: ", specifity)