### Load Data

In [47]:
#Create a data class for all the data being loaded

class Sentiment:
    negative = "negative"
    neutral = "neutral"
    positive = "positive"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
#define sentiment score in review class (used to allow NLP ML later)
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.negative
        elif self.score == 3:
            return Sentiment.neutral
        else:
            return Sentiment.positive
        
        
#Create class to even out Positive and negative training data as not to indroduce bias into the ML model

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
      
    #create method to extract text from each review
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_dist(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.negative, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.positive, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        print(len(negative))
        print(len(positive))

### Define Classes

In [3]:
import json

file_name = 'C:/Users/camer/Documents/ML_py/amazon_review_nlp/Books_small_10000.json'

with open(file_name) as f:
    for line in f:
        print(line)
        break

{"reviewerID": "A1F2H80A1ZNN1N", "asin": "B00GDM3NQC", "reviewerName": "Connie Correll", "helpful": [0, 0], "reviewText": "I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.", "overall": 5.0, "summary": "Can't stop reading!", "unixReviewTime": 1390435200, "reviewTime": "01 23, 2014"}



In [4]:
# Use above print to find dictionary keys to print corresponding values. Current file is not of type dict so need to convert

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        print(review['reviewText'])
        print(review['overall'])
        break

I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.
5.0


In [5]:
# Above code works to find individual reviews and rating, use this to create a list of every review and rating. Use review class
#to append as a review ibject

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))

# Class allows us to use integer to select review tuple and 'text' or 'score' to specify the body of the review of its rating
print(reviews[5].sentiment)

positive


### Prep Data

In [6]:
len(reviews)

10000

In [60]:
from sklearn.model_selection import train_test_split
import random


# split review data to use to train and test ML algorithm
train, test = train_test_split(reviews, test_size = 0.2, random_state = 42)

train_container = ReviewContainer(train)
train_container.evenly_dist()

test_container = ReviewContainer(test)
test_container.evenly_dist()

513
6704
131
1674


In [61]:
# split training data into text and sentiment (X and y in ML algorithm)
X_train = train_container.get_text()
y_train = train_container.get_sentiment()

# likewise split test data into text and sentiment (X and y)
X_test = test_container.get_text()
y_test = test_container.get_sentiment()

print(y_train.count(Sentiment.positive))


513


#### Bag of words vectorisation

In [69]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Use bag of words to create a numerized version of review text e.g. each word is assigned a row and counted in a vector when it appears in a review

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)

X_test_vectors = vectorizer.transform(X_test)

X_train_vectors[0]
X_test_vectors[0]

<1x9625 sparse matrix of type '<class 'numpy.float64'>'
	with 168 stored elements in Compressed Sparse Row format>

### Classification

#### Linear SVM

In [70]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(X_train_vectors, y_train)

clf_svm.predict(X_test_vectors[0])

array(['negative'], dtype='<U8')

In [20]:
X_test[0]

"was sent an Arc of this book for an honest review and here it is = This is the kind of book that you want to read while sitting in front of the fire with a cup of hot apple cider and a blanket over your legs.I have read many of Jaci Burton's books and have never been disappointed. This first book in her new Hope series does not disappoint either.This is the story of Emma, a new vet who has come back home to open her own practice and Luke McCormack, a police officer in the same town.Both have been previously burned by love so both have issues but, that doesn't stop them from acting on that attraction.This book pulls you in from the first page, wraps you up and doesn't let you go until the end.I loved it!"

#### Decision tree

In [71]:
from sklearn.tree import DecisionTreeClassifier

clf_decision = DecisionTreeClassifier()
clf_decision.fit(X_train_vectors, y_train)

clf_decision.predict(X_test_vectors[0])

array(['positive'], dtype='<U8')

#### Naive Bayes

In [72]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(X_train_vectors.toarray(), y_train)

clf_gnb.predict(X_test_vectors[0].toarray())

array(['negative'], dtype='<U8')

#### Logistic regression

In [73]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(X_train_vectors, y_train)

clf_log.predict(X_test_vectors[0])

array(['negative'], dtype='<U8')

### Evaluation

In [74]:
#Pass test data through ML models to see how well they score on mean accuracy

print(clf_svm.score(X_train_vectors, y_train))
print(clf_decision.score(X_train_vectors, y_train))
print(clf_gnb.score(X_train_vectors.toarray(), y_train))
print(clf_log.score(X_train_vectors, y_train))

0.9892787524366472
1.0
0.9844054580896686
0.9668615984405458


In [75]:
# Pass test data through ml models and evaluate their f1_scores

from sklearn.metrics import f1_score

print(f1_score(y_test, clf_svm.predict(X_test_vectors), average = None, labels = [Sentiment.positive, Sentiment.negative]))
print(f1_score(y_test, clf_decision.predict(X_test_vectors), average = None, labels = [Sentiment.positive, Sentiment.negative]))
print(f1_score(y_test, clf_log.predict(X_test_vectors), average = None, labels = [Sentiment.positive, Sentiment.negative]))

[0.82442748 0.82442748]
[0.69888476 0.68235294]
[0.83333333 0.83076923]


#### Use a grid model to tune the ML algorithm