In [48]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score < 3:
            return Sentiment.NEGATIVE
        else: 
            return Sentiment.POSITIVE

#data nao estava bem distribuida havia bias para reviews positivas
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [i.text for i in self.reviews]
    
    def get_sentiment(self):
        return [i.sentiment for i in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [49]:
#Lidar com a data em json
import json
import random
file_name = './book_info.json'
#criar lista de class Review
reviews = []
with open(file_name) as file:
    for i in file:
        review = json.loads(i)
        reviews.append(Review(review['reviewText'], review['overall']))

print("Exemplo:")
random_int = random.randint(0,10000)
print(reviews[random_int].text)
print(reviews[random_int].score)
print(reviews[random_int].sentiment)


Exemplo:
hard to find yarns that match those used for these patterns and the patterns don't give finished dimensions.. otherwise, if one can extrapolate and adjust, the dolls are adorable
3.0
POSITIVE


In [50]:
#Dividir entre train e test data de uma forma tendo em conta a proporcao entre negative e positive reviews
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

#x-o que sei y-o quero saber
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

In [51]:
#Vetorizar as palavras
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x) #nao queremos dar fit


In [52]:
"""CLASSIFICACAO
Experimentar varios modelos
1 - fazer import
2 - criar classifier
3 - dar fit a informacao
"""

'CLASSIFICACAO\nExperimentar varios modelos\n1 - fazer import\n2 - criar classifier\n3 - dar fit a informacao\n'

In [53]:
#LINEAR SVM
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)


In [54]:
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

In [55]:
#NAIVE BAYES
from sklearn.naive_bayes import GaussianNB
clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)


In [56]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
#Avaliar os modelos criados
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

from sklearn.metrics import f1_score
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])


0.7403846153846154
0.6442307692307693
0.6274038461538461
0.7668269230769231


array([0.73786408, 0.74285714])

In [67]:
#Testar exemplos
test_set = ['hated it because I loved it so much', 'very fun', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')