Twitter Sentiment Analysis

Import the libraries that will be used in this project.

In [1]:
import pandas as pd
import numpy as np

# Widen the column display limit to 1000 characters so long tweet text is not truncated in output.
pd.set_option("display.max_colwidth", 1000)

Read the Twitter data into a pandas DataFrame. I originally ran into an encoding issue and had to re-save the CSV as UTF-8. The CSV file does not include a column header row, so the column names are supplied manually.

In [2]:
df = pd.read_csv(
    "twitter_data.csv",
    header=None,  # the file has no header row, so the first line is data
    names=["sentiment", "tweet_id", "date", "query", "user", "tweet"],
)

In [3]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,sentiment,tweet_id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


Remove columns that don't seem useful for sentiment analysis.

In [4]:
# Keep only the label and the tweet text. Re-binding `df` to the returned
# frame (rather than mutating with inplace=True) and ignoring columns that
# are already gone makes this cell safe to re-run without a KeyError.
df = df.drop(columns=["tweet_id", "date", "query", "user"], errors="ignore")
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


**TODO: do not keep this in the final project!**
The next cell drops most of the rows (keeping a random sample of 1,000) to speed up preprocessing while testing.

In [5]:
# Development-only down-sampling to keep preprocessing fast while testing;
# remove this cell (or raise the cap) before the final run.
# A fixed seed makes the subset, and every metric computed from it,
# reproducible. Capping at len(df) avoids the ValueError that sample()
# raises when asked for more rows than the frame contains.
df = df.sample(n=min(1000, len(df)), random_state=42)
df.shape

(1000, 2)

Perform data preprocessing

In [6]:
import re

# Compiled once at definition time. Raw strings keep the regex escapes
# (\w, \S, \1) from being parsed, and warned about, as string escapes.
_URL_RE = re.compile(r"https?://\S+")
_MENTION_OR_HASHTAG_RE = re.compile(r"[@#]\w+")
# [^\W\d_] is "a word character that is neither a digit nor an underscore",
# i.e. a letter. Digits and punctuation are deliberately left alone so that
# numbers such as "1000" are not rewritten to "100".
_REPEATED_LETTER_RE = re.compile(r"([^\W\d_])\1{2,}")


def processTweet(tweet):
    """Strip Twitter-specific noise from a single tweet.

    The following are applied in order:

    1. URLs (``http://`` or ``https://`` up to the next whitespace) are removed.
       This runs first so a ``#fragment`` inside a URL is not treated as a hashtag.
    2. ``@mentions`` and ``#hashtags`` of any length are removed. Punctuation
       that follows them is kept.
    3. Any letter repeated three or more times in a row is collapsed to two
       (``"sooooo"`` becomes ``"soo"``).

    Whitespace left behind by removed tokens is not collapsed.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet text.
    """
    tweet = _URL_RE.sub("", tweet)
    tweet = _MENTION_OR_HASHTAG_RE.sub("", tweet)
    tweet = _REPEATED_LETTER_RE.sub(r"\1\1", tweet)
    return tweet

In [7]:
# Replace each raw tweet with its cleaned version, then preview the result.
df['tweet'] = df['tweet'].map(processTweet)

df.head()

Unnamed: 0,sentiment,tweet
200614,0,Almost back from Triple A. Saw my life flash before my eyes. My hearts in my stomach! Get me outta this car! lol
34294,0,My girl is sick.
347896,0,got a bad head ake
1337148,4,"Seriously, no worries about the book! I have way too much to read right now anyhow!"
922011,4,happy birthday


In [8]:
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the sentiment column.
X = df.drop(columns=['sentiment'])
y = df['sentiment']

# Hold out 25% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()

# The vocabulary is learned from the training tweets only; the test tweets
# are encoded with that same vocabulary.
train_tweets = X_train['tweet']
test_tweets = X_test['tweet']
train_vector = count_vector.fit_transform(train_tweets)
test_vector = count_vector.transform(test_tweets)

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training word counts.
# The fit call is left as the cell's last expression so the estimator is displayed.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=train_vector, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Predict a sentiment label for every held-out tweet.
predictions = naive_bayes.predict(X=test_vector)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print, in order: the confusion matrix, the per-class report, and the overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[65 49]
 [46 90]]
              precision    recall  f1-score   support

           0       0.59      0.57      0.58       114
           4       0.65      0.66      0.65       136

    accuracy                           0.62       250
   macro avg       0.62      0.62      0.62       250
weighted avg       0.62      0.62      0.62       250

0.62


The RNN (recurrent neural network) model starts below.

In [56]:
from keras.preprocessing import sequence, text

vocabulary_size = 5000
max_words = 50

# The tokenizer's word index is built from the training tweets only.
tokenizer = text.Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train['tweet'])

# Encode both splits as lists of integer word ids using that index.
train_token, test_token = (
    tokenizer.texts_to_sequences(split['tweet']) for split in (X_train, X_test)
)

# Bring every sequence to exactly max_words entries.
train_padded, test_padded = (
    sequence.pad_sequences(tokens, maxlen=max_words)
    for tokens in (train_token, test_token)
)

In [57]:
# Inspect one padded training sequence (row 83) as a sanity check.
train_padded[83, :]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,  57,   8,  11,  19, 604,   2, 191,   4,  22, 338], dtype=int32)

In [58]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size = 32

# Embedding -> LSTM -> single sigmoid unit.
# NOTE(review): the sentiment labels seen in the earlier classification
# report are 0 and 4, while a single sigmoid unit outputs values in [0, 1].
# Presumably the labels need remapping to 0/1 before fitting; confirm
# against the training cell.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 32)            160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
