In [1]:
import numpy as np
import matplotlib.pyplot as plt
%pylab inline

from sklearn.model_selection import KFold

from nltk.corpus import stopwords
from textblob import TextBlob


from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict



from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import RandomizedSearchCV

Populating the interactive namespace from numpy and matplotlib


# Rating Learning and Prediction for Amazon Product Reviews

## Bahman Roostaei

$\textit{December 2016, Galvanize, San Francisco.}$

In this project we use reviews of various products submitted by Amazon.com customers to find the models that best predict (classify) their ratings. Knowing which rating a given text indicates ultimately helps businesses collect the most relevant and most helpful reviews of their products.

The data are obtained from [here](http://www.cs.jhu.edu/~mdredze/datasets/sentiment/). They include more than 1,000 review texts for products such as DVDs, books and kitchen tools. The data have mostly been cleaned and processed into dictionaries of bi-grams. We reprocess the data, also converting them to token sets, and analyze both the tokens and the original format.

We first use classification models, calculate the accuracy score and present the confusion matrix. Since this is a multiclass classification, we group the ratings that are close to each other: (4, 5) and (1, 2). Note that there is no rating-3 option (see below for the histogram of labels). We then use regressor models so that we can analyze the models with ROC curves.

In [2]:
class review_analysis(object):
    """Load and parse one file of pre-processed Amazon product reviews.

    Each non-blank line of the file is parsed as whitespace-separated
    ``feature:count`` fields followed by one final ``name:label`` field whose
    value after the colon is the numeric rating.
    """

    def __init__(self,product_name,filename):
        """Remember where the review file lives.

        Parameters
        ----------
        product_name : str
            Directory that holds the review files (e.g. ``'books'``).
        filename : str
            Name of the file inside that directory (e.g. ``'train'``).
        """
        self.product_name = product_name
        self.filename = filename

    def read(self):
        """Read all lines of ``<product_name>/<filename>`` into ``self._data``."""
        with open(self.product_name+'/'+self.filename) as f:
            self._data = f.readlines()

    def process(self,pure_token = False):
        """Parse the lines loaded by :meth:`read` into features and labels.

        Parameters
        ----------
        pure_token : bool, default False
            If False, every review is returned as a ``defaultdict(int)``
            mapping each feature to its count.
            If True, every review is returned as a single string: each
            feature is split on ``'_'`` into tokens, stop words are dropped,
            and each remaining token is repeated ``count`` times, all joined
            by single spaces.

        Returns
        -------
        (list, list of float)
            The processed reviews and, in the same order, their labels.
            Blank lines in the input are skipped.
        """
        self._pure_token = pure_token

        X = []  # one feature -> count dictionary per review
        y = []  # the label of each review, aligned with X

        for rev in self._data:
            fields = rev.split()
            if not fields:
                # A blank line (e.g. a trailing newline at the end of the
                # file) carries no review; skip it instead of crashing.
                continue

            rev_dict = defaultdict(int)
            for field in fields[:-1]:
                # Split on the LAST colon so that feature names which
                # themselves contain ':' keep their full name and count.
                parts = field.rsplit(':', 1)
                rev_dict[parts[0]] = int(parts[1])

            X.append(rev_dict)
            y.append(float(fields[-1].rsplit(':', 1)[1]))

        if not pure_token:
            return X,y

        # The NLTK stopword file id is lowercase; a capitalised id only
        # resolves on case-insensitive filesystems.  A set gives constant-time
        # membership tests in the inner loop.
        stop_words = set(stopwords.words('english'))
        review_nostop = []
        for review in X:
            tokens = []
            for key,value in review.items():
                for elem in key.split('_'):
                    if elem not in stop_words:
                        # repeat the token as often as the feature occurred
                        tokens.extend([elem] * value)
            review_nostop.append(' '.join(tokens))

        return review_nostop,y

### Part 1. Data

The data are provided as a dictionary of joined tokens (a bag of bi-grams) for each review. Here we reprocess the data and convert each review into a document containing $\textit{only tokens, with stop words removed}$, where each token is repeated as many times as its count in the dictionary.

We start with the book reviews and their star ratings:

In [36]:
# Load the books training file and convert every review into one string of
# stop-word-free tokens, together with its rating label.
rva_books = review_analysis('books','train')
rva_books.read()
X_books,y_books = rva_books.process(pure_token=True)
# The DVD cell below also assigns `y`, so the book labels get their own name;
# `y` stays bound here exactly as before for any code that still reads it.
y = y_books

In [37]:
# Load the DVD training file and convert every review into one string of
# stop-word-free tokens, together with its rating label.
rva_dvd = review_analysis('dvd','train')
rva_dvd.read()
X_dvd,y_dvd = rva_dvd.process(pure_token=True)
# `y` was already assigned by the books cell above and is overwritten here, so
# the DVD labels also get an unambiguous name of their own.
y = y_dvd

Here is a sample of the reviews generated after processing. The tokens are not in a human-meaningful order.

In [38]:
# Show the first processed book review: a single string of space-joined tokens.
X_books[0]

"field school type shadows law clerk law law law law professionals guy becomes shadows waste professional money law guy become law professional everyone professionals law six-figure-paid professionals lawyer type clerk lawyer right even money law school aren't money even everyone becomes understand book aren't right becomes law field book book guy book book book ridiculous type become become law school lawyer new people everyone want understand law wants professional people give field six-figure-paid lawyer give aren't new understand lawyer lawyer wants ridiculous six-figure-paid clerk give break want shadows break people want even waste right wants ridiculous"

In [39]:
# Show the first processed DVD review: a single string of space-joined tokens.
X_dvd[0]

"moving line actors exciting nail biting casting great top blu-ray acting blu-ray partly today's can't life wait story deja vu exciting can't wait excellent reality actors life excellent moving action actors action nail biting fast action can't top story casting story line sci-fi partly biting partly partly acting great line partly selection reality life selection wait fast moving vu great casting fast sci-fi partly sci-fi exciting nail selection excellent acting today's reality deja today's"

## Part 2. Vocabulary

The vocabulary size can be found using the vectorizer:

In [40]:
# One bag-of-words vectorizer per product category.  The fitted vectorizers
# are kept in their own variables because their vocabularies are read later.
vectorizer_books, vectorizer_dvd = CountVectorizer(min_df=1), CountVectorizer(min_df=1)

# Learn each vocabulary and build the document-term count matrices.
vec_books = vectorizer_books.fit_transform(X_books)
vec_dvd = vectorizer_dvd.fit_transform(X_dvd)

In [41]:
# Vocabulary mapping learned by each fitted vectorizer.
vocab_books, vocab_dvd = vectorizer_books.vocabulary_, vectorizer_dvd.vocabulary_

In [42]:
# All distinct vocabulary terms per category (stop words already removed).
words_books, words_dvd = vocab_books.keys(), vocab_dvd.keys()

In [43]:
# Number of distinct terms in the book-review vocabulary (stop words removed).
len(words_books)

15923

In [44]:
# Number of distinct terms in the DVD-review vocabulary (stop words removed).
len(words_dvd)

15640