# Reading the data

In [1]:
import pandas as pd

csv_data = pd.read_csv('GOP_REL_ONLY.csv')
mid = int(len(csv_data) / 2)

train = pd.DataFrame(list(zip(csv_data['text'][:mid], csv_data['sentiment'][:mid] )), columns=['text', 'sentiment'])
test =  pd.DataFrame(list(zip(csv_data['text'][mid+1:], csv_data['sentiment'][mid+1:] )), columns=['text', 'sentiment'])

print(len(train['text']))
print("Done reading data")

6935
Done reading data


# Vectorization

In [2]:
import re
from sklearn.preprocessing import LabelEncoder  

le = LabelEncoder()

train['tokens'] = train['text'].map(lambda x: re.sub(r"[^a-zA-Z0-9 ]+", '', x.lower()).split(' '))
test['tokens'] = test['text'].map(lambda x: re.sub(r"[^a-zA-Z0-9 ]+", '', x.lower()).split(' '))

word2i = {}
i2word = {}
filtered_words = ['<UNK>']
freq_word = {}
min_freq = 2

def count_freq(x):
    global freq_word
    if x in freq_word:
        freq_word[x] = freq_word[x] + 1
    else:
        freq_word[x] = 1
    return x

def freq_filter(x):
    global word2i
    global i2word
    global filtered_words
    global freq_word
    global min_freq
    
    if x in freq_word and freq_word[x] >= min_freq:
        filtered_words.append(x)
    
    return x

train['tokens'].map(lambda tweet: [count_freq(word) for word in tweet])
#test['tokens'].map(lambda seq: [count_freq(word) for word in seq])

df_freq = pd.DataFrame(list(zip( freq_word.keys(), freq_word.items() )), columns=['word','count'])
df_freq['word'].map(lambda word: freq_filter(word))


lookup_len = len(filtered_words)
print('# words found originally :: ' + str(len(freq_word)))
print('# filtered words :: ' + str(lookup_len))

i2word = dict(zip(list(range(1,lookup_len + 1)), filtered_words))
word2i = dict(zip(filtered_words, list(range(1,lookup_len + 1))))

train['vectors'] = train['tokens'].map(lambda tweet: [word2i[word] if word in word2i else 1 for word in tweet])
test['vectors'] = test['tokens'].map(lambda tweet: [word2i[word] if word in word2i else 1 for word in tweet])

max_tweet_len = max( len(tweet) for tweet in train['tokens'])
max_tweet_len = max( len(tweet) for tweet in test['tokens'])

print('Max tweet length :: ' + str(max_tweet_len))

print(train[2:3])

print("Done vectorizing input data")

# words found originally :: 16158
# filtered words :: 5984
Max tweet length :: 41
                                                text sentiment  \
2  RT @TJMShow: No mention of Tamir Rice and the ...   Neutral   

                                              tokens  \
2  [rt, tjmshow, no, mention, of, tamir, rice, an...   

                                             vectors  
2  [2, 1, 32, 33, 24, 34, 35, 36, 9, 16, 37, 38, ...  
Done vectorizing input data


## Build the model

In [8]:
from keras.preprocessing.sequence import pad_sequences
import numpy as np

from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from pathlib import Path

padded_tweets = pad_sequences(sequences = train['vectors'], maxlen = max_tweet_len)
x_train = np.matrix(padded_tweets)

print('Padded training data shape ::: ' + str(x_train.shape))

y_train = le.fit_transform(train['sentiment'])

n_classes = len(set(y_train))

print('# Classes :: ' + str(n_classes))

n_embedding = 50
n_gru = 80

def get_model(n_embedding, n_gru, seq_len, vocab_dim, batch_size=None, stateful=False):
    
    input_layer = Input(batch_shape=(batch_size, seq_len), name='input_layer')
    # batch_size * seq_len
    
    embedding_layer = Embedding(input_dim=vocab_dim, output_dim=n_embedding, mask_zero=True,
                                name='embedding_layer')(input_layer)
    # batch_size * seq_len * n_embedding
    
    gru_1 = GRU(n_gru, return_sequences=True, stateful=stateful, activation='relu',
                name='gru_1')(embedding_layer)
    # batch_size * seq_len * n_gru
    
    gru_2 = GRU(n_gru, return_sequences=True, stateful=stateful, activation='relu',
                name='gru_2')(gru_1)
    # batch_size * seq_len * n_gru
    
    gru_3 = GRU(n_gru, return_sequences=False, stateful=stateful, activation='relu',
                name='gru_3')(gru_2)
    # batch_size * n_gru
    
    output_layer = Dense(3, activation='softmax')(gru_3)
    # batch_size * 3
    
    model = Model(inputs=input_layer, outputs=output_layer)
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    
    return model

model = get_model(n_embedding=n_embedding, seq_len=max_tweet_len, n_gru=n_gru,
                  vocab_dim=lookup_len + 1)
print(model.summary())

file_path = 'gop_weights' + '.embed=' + str(n_embedding) + '.gru=' + str(n_gru) + '.h5'
init_weights_file = Path(file_path)

if init_weights_file.exists():
    model.load_weights(file_path)

model.fit(x = x_train, y = y_train, epochs=5, batch_size=100)
model.save_weights(file_path)

print("Done training")


Padded training data shape ::: (6935, 41)
# Classes :: 3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 41)                0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 41, 50)            299250    
_________________________________________________________________
gru_1 (GRU)                  (None, 41, 80)            31440     
_________________________________________________________________
gru_2 (GRU)                  (None, 41, 80)            38640     
_________________________________________________________________
gru_3 (GRU)                  (None, 80)                38640     
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 243       
Total params: 408,213
Trainable params: 408,213
Non-trainable params: 0
_____________

In [9]:
pred_model = get_model(n_embedding=n_embedding, seq_len=max_tweet_len, n_gru=n_gru,
                  vocab_dim=lookup_len + 1)
pred_model.load_weights(file_path)

padded_tweets = pad_sequences(sequences = test['vectors'], maxlen = max_tweet_len)
x_test = np.matrix(padded_tweets)
y_test = le.fit_transform(test['sentiment'])

print('Test data shape :: ' + str(x_test.shape))

res = pred_model.predict(x=x_test)
res = [np.argmax(arr) for arr in res]

print(res[1:10])
print(y_test[1:10])

hits = np.sum(res == y_test)
print('Accuracy :: ' + str( float(hits/len(y_test))))

print('Done testing')

Test data shape :: (6935, 41)
[1, 0, 0, 1, 0, 1, 0, 0, 1]
[0 0 0 2 0 1 0 1 0]
Accuracy :: 0.5495313626532083
Done testing
