# Text 3-class classification/sentiment analysis of twitter data using RnnClassifier

In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
import gensim
import re
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from rnn_utils import RnnClassifier, pre_processX, pre_processY

### In order to encode words into vectors to feed to the classifier we use a pre-trained word2vec model from Google

In [4]:
###Methods in order to parse words and a list of word2vec vectors
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(path_to_w2v_bin, binary=True)  
###Function accepts a list of strings and returns a list of vectors for words that exist in the vocabulary
def w2v_parse(list_of_strings):
    w2v_list = []
    for text in list_of_strings:
        tokens = text.split()
        w2v_item_list = []
        for word in tokens:
            try:
                w2v_item_list.append(w2v_model.word_vec(word))
            except:
                continue
        w2v_list.append(w2v_item_list)
    return w2v_list

### The dataset consists of a set of tweets that have been scored 0 for neutral, 1 for positive sentiment and -1 for negative sentiment

In [7]:
###Get twitter data with positive-neutral-negative sentiment
twitter = pd.read_csv(path_to_file)
twitter.head()

Unnamed: 0,polarity,text
0,0,@AppIeGivevvay if your not affiliated how the ...
1,0,@gay_emo_zac haha... I wouldn't be surprised.....
2,0,"@SteamPowered damn, so many good deals...you g..."
3,-1,i m totally confused and bored.. my life must ...
4,0,@DWStweets @Kazport Good luck with that. You h...


In [8]:
###Basic text curation to remove handles and other useless stuff
twitter_list = [' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()) for x in twitter["text"]]

In [9]:
###one-hot encoding
twitter_labels = np.zeros([len(twitter), 3])
twitter_labels[np.arange(len(twitter)), twitter.polarity] = 1

In [10]:
###Shuffling and getting train and test/validation set
twitter_list, twitter_labels = shuffle(twitter_list, twitter_labels)
splt = int(0.8*len(twitter_list))
train_labels = twitter_labels[:splt]
train_text = twitter_list[:splt]
test_labels = twitter_labels[splt:]
test_text = twitter_list[splt:]

In [11]:
###From list of words to list of vectors
train_input = w2v_parse(train_text)
test_input = w2v_parse(test_text)

In [12]:
###Make input np.array(Sample_size, Max_sequence_length, Embedding_dimension) with zero padding and get length of each sequence
train_input, train_len = pre_processX(train_input)
test_input, test_len = pre_processX(test_input)

### At this point we are readdy to use RnnClassifier. The class works as any other model taken from sklearn library with functions such as .train and .predict. Also the feed forward part is customizable upon creation with the output_architecture variable. The log for the tensorboard visualization are stored at the tensorboard_dir.

In [17]:
model = RnnClassifier(n_classes=3, embedding_dimension=300, output_architecture=[500], 
                      tensorboard_dir="./tensorboard_logs/", batch_size=200)
model.train(train_input, train_labels, train_len, epochs=2, verbose=True)

Step 0 of epoch 0 has accuracy: [0.39500001]
Step 5 of epoch 0 has accuracy: [0.34]
Step 10 of epoch 0 has accuracy: [0.44499999]
Step 15 of epoch 0 has accuracy: [0.47499999]
Step 20 of epoch 0 has accuracy: [0.58999997]
Step 25 of epoch 0 has accuracy: [0.60500002]
Step 30 of epoch 0 has accuracy: [0.63499999]
Step 35 of epoch 0 has accuracy: [0.56999999]
Step 40 of epoch 0 has accuracy: [0.57499999]
Step 45 of epoch 0 has accuracy: [0.61000001]
Step 50 of epoch 0 has accuracy: [0.63999999]
Step 55 of epoch 0 has accuracy: [0.70499998]
Step 60 of epoch 0 has accuracy: [0.61000001]
Step 65 of epoch 0 has accuracy: [0.57999998]
Step 70 of epoch 0 has accuracy: [0.65499997]
Step 75 of epoch 0 has accuracy: [0.63999999]
Step 80 of epoch 0 has accuracy: [0.63999999]
Step 85 of epoch 0 has accuracy: [0.63]
Step 90 of epoch 0 has accuracy: [0.64999998]
Step 95 of epoch 0 has accuracy: [0.69499999]
Step 100 of epoch 0 has accuracy: [0.59500003]
Step 105 of epoch 0 has accuracy: [0.67000002]


In [12]:
###Get test set accuracy
twitter_preds = model.predict(test_input, test_len)
print(accuracy_score(twitter_preds.argmax(axis=1), test_labels.argmax(axis=1)))

0.655


In [None]:
%%bash
python -m tensorflow.tensorboard --logdir=tensorboard_logs

### Now point to the specified url and port to view the visualizations of the tensorflow graph and the statistics of the variables during training.

In [16]:
model.destruct()