# Udacity MLND Capstone — Tweet Classifier

## Prep
* load libraries
* establish baseline algorithm (Multinomial Naive Bayes)

In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

tweets = pd.read_csv("./ExtractedTweets.csv")

---

## Baseline Algorithm (Multinomial NB)

In [3]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(tweets.Tweet)

print(X_train_counts.shape)

(86460, 126330)


In [4]:

tweet_accounts = tweets.iloc[:, :2].drop_duplicates()
accounts_train, accounts_test = train_test_split(tweet_accounts.Handle, stratify=tweet_accounts.Party, \
                                                 test_size=0.2, random_state=41)

tweets_train = tweets[tweets.Handle.isin(accounts_train)].reset_index().drop('index', axis=1)
tweets_test = tweets[tweets.Handle.isin(accounts_test)].reset_index().drop('index', axis=1)

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

mnb_pipeline = Pipeline([('vec', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

mnb_pipeline.fit(tweets_train.Tweet, tweets_train.Party)

p = mnb_pipeline.predict(tweets_test.Tweet)
np.mean(p == tweets_test.Party)

0.75217089267106629

---

## Neural Networks

### Tensor Setup

In [6]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling1D, MaxPooling3D
from keras.layers.embeddings import Embedding

from nltk.tokenize.casual import TweetTokenizer
from nltk import FreqDist

tokenizer = TweetTokenizer(reduce_len=True)

top_words = 25000

tweets_nn = tweets
tweets_nn.Tweet = tweets_nn.Tweet.apply(tokenizer.tokenize)

In [7]:
print("...")

fdist = FreqDist(word for tweet in tweets_nn.Tweet for word in tweet)
print("...fdist")
terms = [term for term, count in fdist.most_common(top_words)]
print("...terms")
tweets_nn.Tweet = tweets_nn.Tweet.apply(\
                    lambda tweet:[terms.index(term)\
                    if term in terms else 0 for term in tweet]
                )
print("...term lambda")

nn_tweeters = tweets_nn.iloc[:,:2].drop_duplicates()

print("^^^")

...
...fdist
...terms
...term lambda
^^^


In [8]:

nn_tweeters_train, nn_tweeters_test = train_test_split(nn_tweeters.Handle,\
                           stratify=nn_tweeters.Party, test_size=0.2, random_state=42)

nn_tweets_train = tweets_nn[tweets_nn.Handle.isin(nn_tweeters_train)].reset_index().drop('index', axis=1)
nn_tweets_test = tweets_nn[tweets_nn.Handle.isin(nn_tweeters_test)].reset_index().drop('index',axis=1)


In [9]:
print(nn_tweets_train.shape)
print(nn_tweets_test.shape)
nn_tweets_test.head()

(69185, 3)
(17275, 3)


Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepJackyRosen,"[383, 3, 24, 479, 15, 19, 0, 4, 0, 0, 20105, 1..."
1,Democrat,RepJackyRosen,"[25, 1621, 2137, 5433, 466, 4, 119, 108, 7, 69..."
2,Democrat,RepJackyRosen,"[168, 275, 3, 0, 6358, 10, 40, 0, 1659, 150, 0..."
3,Democrat,RepJackyRosen,"[3884, 5594, 3, 42, 0, 104, 41, 261, 24, 5702,..."
4,Democrat,RepJackyRosen,"[52, 22, 4, 4837, 4, 10, 1628, 80, 3, 24, 110,..."


In [10]:
nn_tweets_train.head()


Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"[51, 4, 222, 1204, 161, 3, 3312, 2, 191, 3, 77..."
1,Democrat,RepDarrenSoto,"[11, 23901, 6, 4415, 5963, 3417, 60, 0, 7109, ..."
2,Democrat,RepDarrenSoto,"[11, 12229, 6, 2, 1962, 10929, 21, 1223, 3012,..."
3,Democrat,RepDarrenSoto,"[11, 0, 6, 1349, 17, 1962, 2, 87, 10, 382, 0, ..."
4,Democrat,RepDarrenSoto,"[11, 0, 6, 1223, 1076, 1813, 13, 2561, 752, 45..."


In [11]:
np.random.seed(42)

X_train = np.array(nn_tweets_train.Tweet)
X_test = np.array(nn_tweets_test.Tweet)
y_train = np.array((nn_tweets_train.Party == 'Democrat').astype(int))
y_test = np.array((nn_tweets_test.Party == 'Democrat').astype(int))

In [12]:
tweet_length = 60
X_train = sequence.pad_sequences(X_train, maxlen=tweet_length)
X_test = sequence.pad_sequences(X_test, maxlen=tweet_length)

### Neural Net — Model 1 — LSTM Recurrent Neural Network

In [13]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=tweet_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
#model.add(Dropout(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
#model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 32)            800000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 30, 32)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 856,405
Trainable params: 856,405
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 72.27%


### Neural Net — Model 2 — VGG Net Basis

In [21]:
print(X_train.shape)
model2 = Sequential()

model2.add(Embedding(top_words, embedding_vector_length, input_length=tweet_length))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(LSTM(100))

model2.add(Dense(1, activation='sigmoid'))

model2.summary()


(69185, 60)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 60, 32)            800000    
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 60, 64)            6208      
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 30, 64)            0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 30, 64)            0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 30, 64)            12352     
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 15, 64)            0         
_________________________________________________________________
dropout_14 (Dropout)         (None, 15, 64)            0        

In [24]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',])
model2.fit(X_train, y_train, epochs=10, batch_size=64)
scores2 = model2.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores2[1]*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 71.40%
