#### Importing all required libraries to create models

In [1]:
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

#### Classes used for getting sentiments and texts from json files

In [2]:
class sentiment:
    positive = 'positive'
    negative = 'negative'
    neutral = 'neutral'
class Review:
    def __init__(self,text,rating):
        self.text = text
        self.rating = rating
        self.sentiment = self.get_sentiment()
    def get_sentiment(self):
        if self.rating<=2:
            return sentiment.negative
        elif self.rating==3:
            return sentiment.neutral
        else:
            return sentiment.positive
    

In [3]:
all_reviews = []
path = './data/category'
for i in os.listdir(path):
    f_path = os.path.join(path,i)
    with open(f_path,'r') as file:
        for line in file:
            review = json.loads(line)
            all_reviews.append(Review(review['reviewText'],review['overall']))

In [4]:
len(all_reviews)

5000

In [5]:
print(all_reviews[1000].sentiment)

positive


In [6]:
training,testing = train_test_split(all_reviews,test_size=0.35)

In [7]:
x_train = [x.text for x in training]
y_train = [x.sentiment for x in training]
x_test = [x.text for x in testing]
y_test = [x.sentiment for x in testing]

In [8]:
x_train[0]

"My kids and grandkids (and now my great grandkids) love these, and they're always excited to find them in their Christmas stockings. Amazon is the only place I've found these, so I'm really happy that they carry them!!!"

In [9]:
count = CountVectorizer()
count.fit(x_train)
training_x = count.transform(x_train)
testing_x = count.transform(x_test)

In [10]:
print(training_x.shape)
print(testing_x.shape)

(3250, 16217)
(1750, 16217)


# SVM Model

In [11]:
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(training_x,y_train)
print(clf.predict(testing_x[20]))
print(y_test[20])

['positive']
positive


# DecisionTreeClassifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(training_x,y_train)
clf_dec.predict(testing_x[20])

array(['positive'], dtype='<U8')

# GaussianNB

In [13]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(training_x.todense(),y_train)
clf_gnb.predict(testing_x.todense()[20])

array(['positive'], dtype='<U8')

# LogisticRegression

In [14]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(training_x,y_train)
clf_log.predict(testing_x[20])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['positive'], dtype='<U8')

In [15]:
print(clf.score(testing_x,y_test),'svm model')
print(clf_dec.score(testing_x,y_test),'decision tree model')
print(clf_gnb.score(testing_x.todense(),y_test),'gaussian model')
print(clf_log.score(testing_x,y_test),'logistic regression model')

0.7468571428571429 svm model
0.7177142857142857 decision tree model
0.676 gaussian model
0.7868571428571428 logistic regression model
