# Classificação de tweets no contexto político

### Deep Learning

#### Brunna de Sousa Pereira Amorim


In [1]:
import keras

Using Theano backend.


In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

In [7]:
#carregando os dados
import numpy as np
import pandas as pd
from numpy import genfromtxt

my_data = genfromtxt("bow_tweets_carac_dp.csv", delimiter=',',skip_header=1)
#my_data = pd.read_csv("bow_tweets_carac_dp.csv")


In [39]:
max_features = 500000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

In [8]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

np.random.seed(1337)  # for reproducibility

In [9]:
#copiar o arquivo gerado no home para a pasta do home\.keras\\datasets
x_train = x_test = my_data[:,1:]
y_sentiment = my_data[:,0:1]
y_train = y_test = [round(y[0]) for y in y_sentiment]
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

(12004, 'train sequences')
(12004, 'test sequences')


In [10]:
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
('x_train shape:', (12004L, 400L))
('x_test shape:', (12004L, 400L))


In [40]:
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

In [41]:
# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

In [42]:
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

In [43]:
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [44]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### Validação Cruzada

In [45]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 12004 samples, validate on 12004 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x33ac25c50>

In [46]:
score = model.evaluate(x_test, y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

('Test score:', 0.37306447601226994)
('Test accuracy:', 0.8372209263578807)


In [47]:
predicted = model.predict(x_test)
rounded = [(int)(round(x[0])) for x in predicted]
matrix = confusion_matrix(y_test, rounded)
print('Accuracy:', accuracy_score(y_test, rounded))
print( 'Recall:', recall_score(y_test, rounded, average='micro'))
print('Precision:', precision_score(y_test, rounded, average='micro'))
print('F1 score:',f1_score(y_test, rounded, average='micro'))
print('matrix of confusion:',matrix)

('Accuracy:', 0.8372209263578807)
('Recall:', 0.8372209263578807)
('Precision:', 0.8372209263578807)
('F1 score:', 0.8372209263578807)
('matrix of confusion:', array([[4792, 1107],
       [ 847, 5258]]))


### Treino e Teste

In [49]:
#dividir o treino e teste de novo
from sklearn.cross_validation import train_test_split

X_treino_feature, X_teste_feature, y_treino_feature, y_teste_feature = train_test_split(x_train, y_train, test_size=0.3)



In [50]:
model.fit(X_treino_feature, y_treino_feature,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_teste_feature, y_teste_feature))

Train on 8402 samples, validate on 3602 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x478c03d68>

In [51]:
score = model.evaluate(X_teste_feature, y_teste_feature, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

('Test score:', 0.39087967107125215)
('Test accuracy:', 0.80566352033270916)


In [None]:
predicted = model.predict(X_teste_feature)
rounded = [(int)(round(x[0])) for x in predicted]
matrix = confusion_matrix(y_teste_feature, rounded)
print('Accuracy:', accuracy_score(y_teste_feature, rounded))
print( 'Recall:', recall_score(y_teste_feature, rounded, average='micro'))
print('Precision:', precision_score(y_teste_feature, rounded, average='micro'))
print('F1 score:',f1_score(y_teste_feature, rounded, average='micro'))
print('matrix of confusion:',matrix)