# Classes

In [118]:
import random

# Used to classify the reviews as either Negative or Positive 
class Sentiment:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"

# Innit class to classify an object with the review and score either 1 or 2 from the data
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

# Changing the score to Positive or Negative from the data csv file
    def get_sentiment(self):
        if self.score == 2:
            return Sentiment.POSITIVE
        else:
            return Sentiment.NEGATIVE

# Receive the reviews and randomize which review and score will be part of the testing and training containers
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

# Importing Data

In [119]:
# Importing the data from the csv file and appending into a usable python array
import pandas as pd
df = pd.read_csv('test.csv')

reviews = []

for i in range(0, 100):
    for j in range(1):
        reviews.append(Review(df.values[i][j + 1], df.values[i][j]))

print(reviews[7].sentiment)

POSITIVE


# Clean Data

In [120]:
# Using 80% training and 20% testing data from the csv file
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.20, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [121]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

35
45


# Bag of Words

In [122]:
# Bag of words is a way to take reviews and turn them into usable data such as integers by searching key words used in
# many parts of the data reviews
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

I really enjoyed using this product and it exceeded what I thought it did. Will def be buying again.
[[0.         0.28011812 0.1546819  0.1546819  0.         0.
  0.28011812 0.         0.28011812 0.1546819  0.28011812 0.28011812
  0.         0.30936379 0.         0.         0.         0.
  0.         0.         0.         0.1546819  0.28011812 0.
  0.1546819  0.28011812 0.         0.         0.28011812 0.28011812
  0.         0.28011812]]


### Linear SVM

In [123]:
# Using SVM or Support Vector Machines with the libaray sklearn
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Evaluation

In [124]:
# f1 metrics is a harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([1., 1.])

In [125]:
# testing key words to make sure it is working
test_set = ['amazing job', "just terrible i hated it", 'it was okay']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

# Saving Model

In [126]:
# Saving the model using the library pickle
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Load Model

In [127]:
# Loading the model
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [128]:
# Testing
print(train_x[0])

loaded_clf.predict(train_x_vectors[0])

I really enjoyed using this product and it exceeded what I thought it did. Will def be buying again.


array(['POSITIVE'], dtype='<U8')