# Udacity MLND Capstone — Tweet Classifier

## Prep
* load libraries
* establish baseline algorithm (Multinomial Naive Bayes)

In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

tweets = pd.read_csv("./ExtractedTweets.csv")

---

## Baseline Algorithm (Multinomial NB)

In [3]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(tweets.Tweet)

print(X_train_counts.shape)

(86460, 126330)


In [4]:

tweet_accounts = tweets.iloc[:, :2].drop_duplicates()
accounts_train, accounts_test = train_test_split(tweet_accounts.Handle, stratify=tweet_accounts.Party, \
                                                 test_size=0.2, random_state=41)

tweets_train = tweets[tweets.Handle.isin(accounts_train)].reset_index().drop('index', axis=1)
tweets_test = tweets[tweets.Handle.isin(accounts_test)].reset_index().drop('index', axis=1)

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

mnb_pipeline = Pipeline([('vec', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

mnb_pipeline.fit(tweets_train.Tweet, tweets_train.Party)

p = mnb_pipeline.predict(tweets_test.Tweet)
np.mean(p == tweets_test.Party)

0.75217089267106629

In [16]:
from sklearn.metrics import f1_score as f1

#tweets_test.Party.values

predicted = pd.factorize(p)[0]
actual = pd.factorize(tweets_test.Party)[0]
print("---")
print(predicted)
print("---")
print(actual)

f1_a = f1(predicted, actual)
print(f1_a)


---
[0 0 1 ..., 1 1 1]
---
[0 0 0 ..., 1 1 1]
0.768632113711


---

## Neural Networks

### Tensor Setup

In [17]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Flatten, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling1D, MaxPooling3D
from keras.layers.embeddings import Embedding

from nltk.tokenize.casual import TweetTokenizer
from nltk import FreqDist

tokenizer = TweetTokenizer(reduce_len=True)

top_words = 25000

tweets_nn = tweets
tweets_nn.Tweet = tweets_nn.Tweet.apply(tokenizer.tokenize)

In [18]:
print("...")

fdist = FreqDist(word for tweet in tweets_nn.Tweet for word in tweet)
print("...fdist")
terms = [term for term, count in fdist.most_common(top_words)]
print("...terms")
tweets_nn.Tweet = tweets_nn.Tweet.apply(\
                    lambda tweet:[terms.index(term)\
                    if term in terms else 0 for term in tweet]
                )
print("...term lambda")

nn_tweeters = tweets_nn.iloc[:,:2].drop_duplicates()

print("^^^")

...
...fdist
...terms
...term lambda
^^^


In [19]:

nn_tweeters_train, nn_tweeters_test = train_test_split(nn_tweeters.Handle,\
                           stratify=nn_tweeters.Party, test_size=0.2, random_state=42)

nn_tweets_train = tweets_nn[tweets_nn.Handle.isin(nn_tweeters_train)].reset_index().drop('index', axis=1)
nn_tweets_test = tweets_nn[tweets_nn.Handle.isin(nn_tweeters_test)].reset_index().drop('index',axis=1)


In [20]:
print(nn_tweets_train.shape)
print(nn_tweets_test.shape)
nn_tweets_test.head()

(69185, 3)
(17275, 3)


Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepJackyRosen,"[383, 3, 24, 479, 15, 19, 0, 4, 0, 0, 20248, 1..."
1,Democrat,RepJackyRosen,"[25, 1609, 2127, 5479, 466, 4, 119, 108, 7, 70..."
2,Democrat,RepJackyRosen,"[168, 275, 3, 0, 6378, 10, 40, 0, 1665, 151, 0..."
3,Democrat,RepJackyRosen,"[3873, 5548, 3, 42, 0, 104, 41, 261, 24, 5634,..."
4,Democrat,RepJackyRosen,"[52, 22, 4, 4819, 4, 10, 1634, 80, 3, 24, 110,..."


In [21]:
nn_tweets_train.head()


Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"[51, 4, 221, 1208, 161, 3, 3304, 2, 191, 3, 77..."
1,Democrat,RepDarrenSoto,"[11, 24397, 6, 4422, 5996, 3374, 60, 0, 6895, ..."
2,Democrat,RepDarrenSoto,"[11, 12592, 6, 2, 1962, 11102, 21, 1219, 3005,..."
3,Democrat,RepDarrenSoto,"[11, 0, 6, 1348, 17, 1962, 2, 87, 10, 382, 0, ..."
4,Democrat,RepDarrenSoto,"[11, 0, 6, 1219, 1079, 1825, 13, 2544, 750, 45..."


In [22]:
np.random.seed(42)

X_train = np.array(nn_tweets_train.Tweet)
X_test = np.array(nn_tweets_test.Tweet)
y_train = np.array((nn_tweets_train.Party == 'Democrat').astype(int))
y_test = np.array((nn_tweets_test.Party == 'Democrat').astype(int))

In [23]:
tweet_length = 60
X_train = sequence.pad_sequences(X_train, maxlen=tweet_length)
X_test = sequence.pad_sequences(X_test, maxlen=tweet_length)

In [69]:
import keras.backend as K

def my_f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return c3

    # How many selected items are relevant?
    precision = c1 / c2

    # How many relevant items are selected?
    recall = c1 / c3

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score


# from Medium blog post: https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2

from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

class MyMetrics(Callback):    
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("\n— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return



### Neural Net — Model 1 — LSTM Recurrent Neural Network

In [70]:
my_metrics1 = MyMetrics()
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=tweet_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
#model.add(Dropout(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
#model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',])
print(model.summary())
#model.fit(X_train, y_train, epochs=3, batch_size=64)
model.fit(X_train, y_train, epochs=3, batch_size=64, callbacks=[my_metrics1,], validation_data=(X_test, y_test))
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 80, 32)            800000    
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 80, 32)            3104      
_________________________________________________________________
max_pooling1d_35 (MaxPooling (None, 40, 32)            0         
_________________________________________________________________
lstm_19 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 101       
Total params: 856,405
Trainable params: 856,405
Non-trainable params: 0
_________________________________________________________________
None
Train on 69185 samples, validate on 17275 samples
Epoch 1/3
— val_f1: 0.723588 — val_precision: 0.729001 — val_recall 0.718255
Epoc

### Neural Net — Model 2 — VGG Net Basis

In [65]:

print(X_train.shape)

model2 = Sequential()

model2.add(Embedding(top_words, embedding_vector_length, input_length=tweet_length))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(LSTM(100))

model2.add(Dense(1, activation='sigmoid'))

model2.summary()


(69185, 80)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 80, 32)            800000    
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 80, 64)            6208      
_________________________________________________________________
max_pooling1d_25 (MaxPooling (None, 40, 64)            0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 40, 64)            0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 40, 64)            12352     
_________________________________________________________________
max_pooling1d_26 (MaxPooling (None, 20, 64)            0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 20, 64)            0        

In [66]:
my_metrics2 = MyMetrics()

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',])

model2.fit(X_train, y_train, epochs=1, batch_size=64, callbacks=[my_metrics2,], validation_data=(X_test, y_test))

scores2 = model2.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores2[1]*100))

Train on 69185 samples, validate on 17275 samples
Epoch 1/1
Accuracy: 72.29%


### Neural Net 3 — Bigger VGG Net

In [67]:

embedding_vector_length_longer = 64
tweet_length = 80
X_train = sequence.pad_sequences(X_train, maxlen=tweet_length)
X_test = sequence.pad_sequences(X_test, maxlen=tweet_length)
print(X_train.shape)

model3 = Sequential()

model3.add(Embedding(top_words, embedding_vector_length_longer, input_length=tweet_length))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

#model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
#model3.add(MaxPooling1D(pool_size=2))
#model3.add(Dropout(0.2))


model3.add(LSTM(400, dropout=0.2, recurrent_dropout=0.2))
model3.add(Dense(1, activation='sigmoid'))

model3.summary()

(69185, 80)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 80, 64)            1600000   
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 80, 256)           196864    
_________________________________________________________________
max_pooling1d_30 (MaxPooling (None, 40, 256)           0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 40, 256)           0         
_________________________________________________________________
conv1d_31 (Conv1D)           (None, 40, 256)           786688    
_________________________________________________________________
max_pooling1d_31 (MaxPooling (None, 20, 256)           0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 20, 256)           0        

In [68]:
my_metrics3 = MyMetrics()

model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',])


model3.fit(X_train, y_train, epochs=1, batch_size=64, callbacks=[my_metrics3,], validation_data=(X_test, y_test))


scores3 = model3.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores3[1]*100))

Train on 69185 samples, validate on 17275 samples
Epoch 1/1
Accuracy: 72.17%
