# Udacity MLND Capstone — Tweet Classifier

## Prep
* load libraries
* establish baseline algorithm (Multinomial Naive Bayes)

In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

tweets = pd.read_csv("./ExtractedTweets.csv")

---

## Baseline Algorithm (Multinomial NB)

In [3]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(tweets.Tweet)

print(X_train_counts.shape)

(86460, 126330)


In [4]:

tweet_accounts = tweets.iloc[:, :2].drop_duplicates()
accounts_train, accounts_test = train_test_split(tweet_accounts.Handle, stratify=tweet_accounts.Party, \
                                                 test_size=0.2, random_state=41)

tweets_train = tweets[tweets.Handle.isin(accounts_train)].reset_index().drop('index', axis=1)
tweets_test = tweets[tweets.Handle.isin(accounts_test)].reset_index().drop('index', axis=1)

In [36]:
print(tweets_train.shape)
print(tweets_test.shape)
tweets_train.head()
print(tweets_train.size)

(69186, 3)
(17274, 3)
207558


In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

mnb_pipeline = Pipeline([('vec', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

mnb_pipeline.fit(tweets_train.Tweet, tweets_train.Party)

p = mnb_pipeline.predict(tweets_test.Tweet)
np.mean(p == tweets_test.Party)

0.75217089267106629

In [6]:
from sklearn.metrics import f1_score as f1

#tweets_test.Party.values

predicted = pd.factorize(p)[0]
actual = pd.factorize(tweets_test.Party)[0]
print("---")
print(predicted)
print("---")
print(actual)

f1_a = f1(predicted, actual)
print(f1_a)


---
[0 0 1 ..., 1 1 1]
---
[0 0 0 ..., 1 1 1]
0.768632113711


### Constant Classifier
* for comparison

In [10]:
from sklearn.dummy import DummyClassifier

dmb_pipeline = Pipeline([('vec', CountVectorizer()), ('tfidf', TfidfTransformer()),\
                         ('clf', DummyClassifier(strategy='constant', constant='Republican'))])

dmb_pipeline.fit(tweets_train.Tweet, tweets_train.Party)

p = dmb_pipeline.predict(tweets_test.Tweet)
np.mean(p == tweets_test.Party)

0.5207826791710084

---

## Neural Networks

### Tensor Setup

In [11]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Flatten, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling1D, MaxPooling3D
from keras.layers.embeddings import Embedding

from nltk.tokenize.casual import TweetTokenizer
from nltk import FreqDist

tokenizer = TweetTokenizer(reduce_len=True)

top_words = 25000

tweets_nn = tweets
tweets_nn.Tweet = tweets_nn.Tweet.apply(tokenizer.tokenize)

In [12]:
print("...")

fdist = FreqDist(word for tweet in tweets_nn.Tweet for word in tweet)
print("...fdist")
terms = [term for term, count in fdist.most_common(top_words)]
print("...terms")
tweets_nn.Tweet = tweets_nn.Tweet.apply(\
                    lambda tweet:[terms.index(term)\
                    if term in terms else 0 for term in tweet]
                )
print("...term lambda")

nn_tweeters = tweets_nn.iloc[:,:2].drop_duplicates()

print("^^^")

...
...fdist
...terms
...term lambda
^^^


In [13]:

nn_tweeters_train, nn_tweeters_test = train_test_split(nn_tweeters.Handle,\
                           stratify=nn_tweeters.Party, test_size=0.2, random_state=42)

nn_tweets_train = tweets_nn[tweets_nn.Handle.isin(nn_tweeters_train)].reset_index().drop('index', axis=1)
nn_tweets_test = tweets_nn[tweets_nn.Handle.isin(nn_tweeters_test)].reset_index().drop('index',axis=1)


In [14]:
print(nn_tweets_train.shape)
print(nn_tweets_test.shape)
nn_tweets_test.head()

(69185, 3)
(17275, 3)


Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepJackyRosen,"[383, 3, 24, 479, 15, 19, 0, 4, 0, 0, 20652, 1..."
1,Democrat,RepJackyRosen,"[25, 1610, 2129, 5443, 468, 4, 119, 108, 7, 69..."
2,Democrat,RepJackyRosen,"[168, 275, 3, 0, 6290, 10, 40, 0, 1668, 150, 0..."
3,Democrat,RepJackyRosen,"[3891, 5617, 3, 42, 0, 104, 41, 261, 24, 5690,..."
4,Democrat,RepJackyRosen,"[52, 22, 4, 4835, 4, 10, 1628, 80, 3, 24, 110,..."


In [32]:
print(tweets_train.shape)
tweets_train.head()

##dmb_nn_pipeline = Pipeline([('vec', CountVectorizer()), ('tfidf', TfidfTransformer()),\
##                         ('clf', DummyClassifier(strategy='constant', constant='Republican'))])

##dmb_nn_pipeline.fit(nn_tweets_train.Tweet, nn_tweets_train.Party)

##p = dmb_nn_pipeline.predict(nn_tweets_test.Tweet)
##np.mean(p == tweets_test.Party)

dc = DummyClassifier(strategy='constant', constant=0)
dc.fit(nn_tweets_train.Tweet, nn_tweets_train.Party)

(69186, 3)


ValueError: setting an array element with a sequence.

In [15]:
nn_tweets_train.head()


Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"[51, 4, 221, 1210, 161, 3, 3292, 2, 191, 3, 77..."
1,Democrat,RepDarrenSoto,"[11, 23147, 6, 4373, 5894, 3412, 60, 0, 7159, ..."
2,Democrat,RepDarrenSoto,"[11, 11862, 6, 2, 1967, 11542, 21, 1222, 3004,..."
3,Democrat,RepDarrenSoto,"[11, 0, 6, 1352, 17, 1967, 2, 87, 10, 381, 0, ..."
4,Democrat,RepDarrenSoto,"[11, 0, 6, 1222, 1077, 1830, 13, 2562, 752, 45..."


In [16]:
np.random.seed(42)

X_train = np.array(nn_tweets_train.Tweet)
X_test = np.array(nn_tweets_test.Tweet)
y_train = np.array((nn_tweets_train.Party == 'Democrat').astype(int))
y_test = np.array((nn_tweets_test.Party == 'Democrat').astype(int))

In [17]:
tweet_length = 60
X_train = sequence.pad_sequences(X_train, maxlen=tweet_length)
X_test = sequence.pad_sequences(X_test, maxlen=tweet_length)

In [18]:
import keras.backend as K

# by epoch
def my_f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return c3

    # How many selected items are relevant?
    precision = c1 / c2

    # How many relevant items are selected?
    recall = c1 / c3

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score


# from Medium blog post: https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2

from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

class MyMetrics(Callback):    
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return


In [19]:
# actual metric
# from so: https://stackoverflow.com/a/45305384/54423
def my_f1_metric(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


def my_recall_metric(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def my_precision_metric(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

### Neural Net — Model 1 — LSTM Recurrent Neural Network

In [20]:
#my_metrics1 = MyMetrics()
embedding_vector_length = 32

model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=tweet_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 32)            800000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 30, 32)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 856,405
Trainable params: 856,405
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
import tensorflow as tf
print(y_train)
print("---")
print(y_train.shape)
#y_test.shape
print(tf.rank(y_train))
print(tf.rank(y_test))
print(y_test.shape)
print(y_train.dtype)

print(y_train.shape[0])
zeros = tf.zeros(y_train.shape[0])

print(zeros.shape)
print(tf.rank(zeros))

[1 1 1 ..., 0 0 0]
---
(69185,)
Tensor("Rank:0", shape=(), dtype=int32)
Tensor("Rank_1:0", shape=(), dtype=int32)
(17275,)
int64
69185
(69185,)
Tensor("Rank_2:0", shape=(), dtype=int32)


In [22]:

#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', 'binary_accuracy', my_f1_metric, my_recall_metric, my_precision_metric])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy', my_f1_metric,\
                                                                    my_recall_metric, my_precision_metric])
model.fit(X_train, y_train, epochs=3, batch_size=64)
#model.fit(X_train, y_train, epochs=3, batch_size=64, callbacks=[my_metrics1,], validation_data=(X_test, y_test))


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa4529fbc50>

In [28]:
scores = model.evaluate(X_test, y_test, verbose=0)

print("Accuracy: %.2f%%" % (scores[1]*100))

print(scores)

Accuracy: 71.80%
[1.3168071156990337, 0.71803183791261338, 0.38815476948894051, 0.3337109779103965, 0.4755344221940434]


### Neural Net — Model 2 — VGG Net Basis

In [51]:

print(X_train.shape)

model2 = Sequential()

model2.add(Embedding(top_words, embedding_vector_length, input_length=tweet_length))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.2))

model2.add(LSTM(100))

model2.add(Dense(1, activation='sigmoid'))

model2.summary()


(69185, 80)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 80, 32)            800000    
_________________________________________________________________
conv1d_38 (Conv1D)           (None, 80, 64)            6208      
_________________________________________________________________
max_pooling1d_38 (MaxPooling (None, 40, 64)            0         
_________________________________________________________________
dropout_26 (Dropout)         (None, 40, 64)            0         
_________________________________________________________________
conv1d_39 (Conv1D)           (None, 40, 64)            12352     
_________________________________________________________________
max_pooling1d_39 (MaxPooling (None, 20, 64)            0         
_________________________________________________________________
dropout_27 (Dropout)         (None, 20, 64)            0        

In [52]:
my_metrics2 = MyMetrics()

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', my_f1_metric])

model2.fit(X_train, y_train, epochs=3, batch_size=64)
#model2.fit(X_train, y_train, epochs=3, batch_size=64, callbacks=[my_metrics2,], validation_data=(X_test, y_test))

scores2 = model2.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores2[1]*100))
print(scores2)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 72.72%
[0.61286864787900841, 0.72723589002137246, 0.37038056455024243]


### Neural Net 3 — Bigger VGG Net

In [53]:

embedding_vector_length_longer = 64
tweet_length = 80
X_train = sequence.pad_sequences(X_train, maxlen=tweet_length)
X_test = sequence.pad_sequences(X_test, maxlen=tweet_length)
print(X_train.shape)

model3 = Sequential()

model3.add(Embedding(top_words, embedding_vector_length_longer, input_length=tweet_length))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))

#model3.add(Conv1D(filters=256, kernel_size=12, padding='same', activation='relu'))
#model3.add(MaxPooling1D(pool_size=2))
#model3.add(Dropout(0.2))


model3.add(LSTM(400, dropout=0.2, recurrent_dropout=0.2))
model3.add(Dense(1, activation='sigmoid'))

model3.summary()

(69185, 80)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 80, 64)            1600000   
_________________________________________________________________
conv1d_43 (Conv1D)           (None, 80, 256)           196864    
_________________________________________________________________
max_pooling1d_43 (MaxPooling (None, 40, 256)           0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 40, 256)           0         
_________________________________________________________________
conv1d_44 (Conv1D)           (None, 40, 256)           786688    
_________________________________________________________________
max_pooling1d_44 (MaxPooling (None, 20, 256)           0         
_________________________________________________________________
dropout_32 (Dropout)         (None, 20, 256)           0        

In [54]:
#my_metrics3 = MyMetrics()

model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', my_f1_metric])
model3.fit(X_train, y_train, epochs=3, batch_size=64)

#model3.fit(X_train, y_train, epochs=3, batch_size=64, callbacks=[my_metrics3,], validation_data=(X_test, y_test))


scores3 = model3.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores3[1]*100))
print(scores3)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 71.65%
[0.76729745943363425, 0.71646888565568612, 0.38666918192862082]
