## Objetivo: Analise de sentimentos

### Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from tensorflow.keras import layers
from tensorflow.keras import Sequential

### Leitura de dados

In [2]:
path = 'dados_nlp/imdb_labelled.txt'

df = pd.read_csv(path, names=['sentence', 'label'], sep='\t')
sentences = df['sentence'].values
y = df['label'].values

print (df.shape)
df.sample(10)

(748, 2)


Unnamed: 0,sentence,label
713,The incredible soundtrack truly captures the e...,1
10,And those baby owls were adorable.,1
584,"I saw it as a child on TV back in 1973, when i...",1
467,"Unless you're just out to visually ""collect"" a...",0
533,It handles some tough issues with dignity and ...,1
591,This is an extraordinary film.,1
624,It's a long time since I was so entertained by...,1
365,The film looks cheap and bland.,0
44,Very disappointing.,0
176,"If you see it, you should probably just leave ...",0


### Separa dados em treino e teste

In [3]:
sentences_train, sentences_test, y_train, y_test = train_test_split(
                                                   sentences,
                                                   y,
                                                   test_size=0.25,
                                                   random_state=1000)

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

print (X_train.shape, X_test.shape)

(561, 2505) (187, 2505)


### Regressão logística

In [4]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.7486631016042781


### Árvore de decisão

In [5]:
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.6363636363636364


### Redes neurais

In [7]:
input_dim = X_train.shape[1]

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train,
                    epochs=100,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Accuracy:  {:.4f}".format(accuracy))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                25060     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 25,071
Trainable params: 25,071
Non-trainable params: 0
_________________________________________________________________
Training Accuracy: 1.0000
Accuracy:  0.7861
