## Analyzing Tweets

We can gather a sample of Twitter data using the Twitter API (https://dev.twitter.com).  To do so, we'll need to create a Twitter application and get credentials for it.  You can do this manually at https://apps.twitter.com.  Once you have an app, go to the "Keys and Access Tokens" tab to find your credentials.

In [None]:
import random
import twitter
import emoji
import itertools
import pandas as pd
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
import keras.callbacks
import json

import os
import nb_utils
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Merge, LSTM, Embedding, GlobalMaxPooling1D
from keras.models import Model
from keras.layers.merge import Concatenate, Average

from gensim.models import Word2Vec

In [None]:
# Fill these in!  Export the four TWITTER_* environment variables before
# starting the notebook instead of pasting credentials into the file, where
# they would be saved and shared along with it.  A variable that is not set
# yields an empty string, which the API will reject at authentication time.

CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', '')

In [None]:
# Bundle the four credentials defined above into an OAuth object for the
# `twitter` client library.
auth=twitter.OAuth(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    token=ACCESS_TOKEN,
    token_secret=ACCESS_SECRET,
)

# Handle on the `statuses` attribute of an authenticated streaming client.
status_stream = twitter.TwitterStream(auth=auth).statuses

# Take the first five items of the sample stream and show the text of those
# that carry a non-empty 'text' field.  The filter runs after the slice, so
# fewer than five texts may be shown.
[x['text'] for x in itertools.islice(status_stream.sample(), 0, 5) if x.get('text')]

In [None]:
# Rebind `status_stream` to the `statuses` attribute of a newly constructed
# streaming client (same construction as in the previous cell).
status_stream = twitter.TwitterStream(auth=auth).statuses

def english_has_emoji(tweet):
    """Tell whether *tweet* is tagged as English and contains an emoji.

    A tweet whose ``'lang'`` field is missing or anything other than ``'en'``
    is rejected.  Otherwise its ``'text'`` field (treated as empty when
    absent) is scanned for a character found in ``emoji.UNICODE_EMOJI``.
    """
    is_english = tweet.get('lang') == 'en'
    if not is_english:
        return False
    for symbol in tweet.get('text', ''):
        if symbol in emoji.UNICODE_EMOJI:
            return True
    return False

# Keep pulling from the sample stream until 100 tweets accepted by
# `english_has_emoji` have been collected; `%time` is an IPython magic, so
# this line only runs inside IPython/Jupyter.
%time tweets = list(itertools.islice(filter(english_has_emoji, status_stream.sample()), 0, 100))

In [None]:
# For each collected tweet that uses exactly one distinct emoji character,
# record (text with that character removed, the character itself).
stripped = []
for tweet in tweets:
    text = tweet['text']
    emojis = {ch for ch in text if ch in emoji.UNICODE_EMOJI}
    if len(emojis) != 1:
        # Tweets with no emoji, or with several different ones, are skipped.
        continue
    (only_emoji,) = emojis
    stripped.append((text.replace(only_emoji, ''), only_emoji))
len(stripped)

## Using the CNN

Let's see how the CNN from the previous chapter performs on this data.

In [None]:
# Load the tweet table from CSV and show how many rows each value of the
# 'emoji' column has.
all_tweets = pd.read_csv('data/emojis.csv')
all_tweets['emoji'].value_counts()

In [None]:
# Keep only the rows whose emoji occurs in more than 1000 rows, then show the
# counts of the emojis that remain.
tweets = all_tweets.groupby('emoji').filter(lambda c:len(c) > 1000)
tweets['emoji'].value_counts()

In [None]:
# Show the longest tweet text (by number of characters).
max(tweets['text'], key=len)

In [None]:
# Character vocabulary: every distinct character that appears in a tweet,
# in sorted order, with its position serving as the one-hot index.
chars = sorted(set(chain.from_iterable(tweets['text'])))
char_to_idx = dict(zip(chars, range(len(chars))))
max_sequence_len = max(map(len, tweets['text']))

# Label vocabulary, built the same way from the 'emoji' column.
emojis = sorted(set(tweets['emoji']))
emoji_to_idx = dict(zip(emojis, range(len(emojis))))
emojis[:10]

# Hold out 10% of the rows for testing.
train_tweets, test_tweets = train_test_split(tweets, test_size=0.1)


In [None]:
def data_generator(tweets, batch_size):
    """Endlessly yield ``(X, y)`` batches of one-hot encoded tweets.

    ``X`` has shape ``(batch_size, max_sequence_len, len(chars))`` with a 1 at
    ``[row, position, char_to_idx[character]]`` for every character of the
    tweet text; ``y`` holds the ``emoji_to_idx`` id of each row's emoji.

    Args:
        tweets: frame with 'text' and 'emoji' columns.
        batch_size: rows per batch, drawn with ``tweets.sample``.  ``None``
            makes the first batch the whole frame in its existing order;
            ``batch_size`` is then set to the frame's row count, so every
            later batch is a random sample of that size.
    """
    while True:
        if batch_size is not None:
            batch = tweets.sample(batch_size)
        else:
            batch = tweets
            batch_size = batch.shape[0]
        X = np.zeros((batch_size, max_sequence_len, len(chars)))
        y = np.zeros((batch_size,))
        for row_idx, (text, label) in enumerate(zip(batch['text'], batch['emoji'])):
            y[row_idx] = emoji_to_idx[label]
            for position, character in enumerate(text):
                X[row_idx, position, char_to_idx[character]] = 1
        yield X, y

# Pull a single batch of ten rows to check the generator's output.
next(data_generator(tweets, 10))

In [None]:
def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    """Build and compile a character-level CNN classifier.

    The input is a ``(max_sequence_len, num_chars)`` one-hot sequence.  It
    passes through two Conv1D (kernel width 6) + MaxPooling1D(4) stages with
    128 and 256 filters, is flattened, and is classified by a 128-unit ReLU
    layer followed by a softmax over ``num_labels`` classes.

    Returns:
        The model, compiled with sparse categorical cross-entropy, the
        'rmsprop' optimizer and the 'acc' metric.
    """
    char_input = Input(shape=(max_sequence_len, num_chars), name='char_cnn_input')

    features = char_input
    for n_filters in (128, 256):
        features = Conv1D(n_filters, 6, activation='relu', padding='valid')(features)
        features = MaxPooling1D(4)(features)

    features = Flatten()(features)
    features = Dense(128, activation='relu')(features)
    preds = Dense(num_labels, activation='softmax', name='char_cnn_predictions')(features)

    model = Model(char_input, preds)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['acc'],
    )
    return model

# Instantiate the character CNN for the vocabulary, sequence length and label
# set computed above, and print its layer summary.
char_cnn_model = create_char_cnn_model(len(char_to_idx), max_sequence_len, len(emojis))
char_cnn_model.summary()

In [None]:
# Early-stopping callback watching the training loss: min_delta=0.03 is the
# smallest change counted as an improvement, patience=2 the number of epochs
# without one that is tolerated.
early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=2,
                              verbose=0, mode='auto')

BATCH_SIZE = 512
char_cnn_model.fit_generator(
    data_generator(train_tweets, batch_size=BATCH_SIZE),
    epochs=20,
    # `/` yields a float in Python 3, but Keras documents an integer number of
    # steps; round up so the final partial batch's worth of rows is covered.
    steps_per_epoch=int(np.ceil(len(train_tweets) / BATCH_SIZE)),
    verbose=2,
    callbacks=[early]
)

In [None]:
# Evaluate on random batches drawn from the held-out tweets.
char_cnn_model.evaluate_generator(
    data_generator(test_tweets, batch_size=BATCH_SIZE),
    # `/` yields a float in Python 3, but Keras documents an integer number of
    # steps; round up so the final partial batch's worth of rows is covered.
    steps=int(np.ceil(len(test_tweets) / BATCH_SIZE))
)

In [None]:
# Persist what is needed to reuse the character model: the label alphabet,
# the character index and the sequence length as JSON, plus the full model
# and its weights.
# open() fails when the target directory is missing, so create it up front.
os.makedirs('zoo/07', exist_ok=True)
with open('zoo/07/emoji_chars.json', 'w') as fout:
    json.dump({
        'emojis': ''.join(emojis),
        'char_to_idx': char_to_idx,
        'max_sequence_len': max_sequence_len,
    }, fout)
char_cnn_model.save('zoo/07/char_cnn_model.h5')
char_cnn_model.save_weights('zoo/07/char_cnn_model_weights.h5')

In [None]:
# Compare true and predicted emojis for 100 randomly chosen test tweets.
pd.options.display.max_colwidth = 128
inspect_tweets = test_tweets.sample(100)
# batch_size=None makes the generator's first batch the whole sample, in
# order, so the predictions line up with the rows of `inspect_tweets`.
predicted = char_cnn_model.predict_generator(
    data_generator(inspect_tweets, batch_size=None),
    steps=1,
)
predicted_emojis = []
for scores in predicted:
    predicted_emojis.append(emojis[np.argmax(scores)])
show = pd.DataFrame(
    {
        'text': inspect_tweets['text'],
        'true': inspect_tweets['emoji'],
        'pred': predicted_emojis,
    },
    columns=['text', 'true', 'pred'],
)
show.head(10)

In [None]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Merge, LSTM
from keras.models import Model
from keras.layers.merge import Concatenate

def create_char_cnn_model2(num_chars, max_sequence_len, num_labels, drop_out=0.25):
    """Build and compile a multi-window character CNN classifier.

    Three parallel branches (kernel widths 4, 5 and 6) each apply two
    Conv1D / MaxPooling1D(4) / Dropout stages with 128 and 256 filters to the
    one-hot input.  The branch outputs are concatenated along axis 1, passed
    through dropout, flattened and classified by a 128-unit ReLU layer and a
    softmax over ``num_labels`` classes.

    Args:
        num_chars: width of the one-hot character encoding.
        max_sequence_len: number of character positions per input.
        num_labels: number of output classes.
        drop_out: rate used by every Dropout layer.

    Returns:
        The model, compiled with sparse categorical cross-entropy, the
        'rmsprop' optimizer and the 'acc' metric.
    """
    char_input = Input(shape=(max_sequence_len, num_chars), name='char_cnn_input')

    def conv_branch(window):
        # Two conv -> pool -> dropout stages sharing one kernel width.
        branch = char_input
        for n_filters in (128, 256):
            branch = Conv1D(n_filters, window, activation='relu', padding='valid')(branch)
            branch = MaxPooling1D(4)(branch)
            branch = Dropout(drop_out)(branch)
        return branch

    branches = [conv_branch(window) for window in (4, 5, 6)]
    merged = Concatenate(axis=1)(branches)

    features = Dropout(drop_out)(merged)
    features = Flatten()(features)
    features = Dense(128, activation='relu')(features)
    preds = Dense(num_labels, activation='softmax', name='char_cnn_predictions')(features)

    model = Model(char_input, preds)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['acc'],
    )
    return model

# Instantiate the multi-window character CNN (default dropout rate) and print
# its layer summary.
char_cnn_model2 = create_char_cnn_model2(len(char_to_idx), max_sequence_len, len(emojis))
char_cnn_model2.summary()

In [None]:
# Train the multi-window character CNN with larger batches, reusing the
# `early` stopping callback.
BATCH_SIZE = 2048
char_cnn_model2.fit_generator(
    data_generator(train_tweets, batch_size=BATCH_SIZE),
    epochs=30,
    # `/` yields a float in Python 3, but Keras documents an integer number of
    # steps; round up so the final partial batch's worth of rows is covered.
    steps_per_epoch=int(np.ceil(len(train_tweets) / BATCH_SIZE)),
    verbose=2,
    callbacks=[early]
)

In [None]:
# Evaluate the multi-window character CNN on random batches of held-out tweets.
char_cnn_model2.evaluate_generator(
    data_generator(test_tweets, batch_size=BATCH_SIZE),
    # `/` yields a float in Python 3, but Keras documents an integer number of
    # steps; round up so the final partial batch's worth of rows is covered.
    steps=int(np.ceil(len(test_tweets) / BATCH_SIZE))
)

In [None]:
"75s - loss: 2.3855 - acc: 0.4368\n[2.8089022636413574, 0.38840296648550726]"

## Featurizing and preparing our data

Just like we did when computing word embeddings, we want to featurize our data so we can classify it effectively.

In [None]:
# Word-level tokenizer with num_words capped at VOCAB_SIZE.  It is fitted on
# the text of every row in `tweets`, i.e. on both the training and the test
# split.
VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(tweets['text'])

In [None]:
# Turn each split's texts into token-id sequences, then pad both splits to
# the length of the longest sequence found in either.
training_tokens = tokenizer.texts_to_sequences(train_tweets['text'])
test_tokens = tokenizer.texts_to_sequences(test_tweets['text'])
max_num_tokens = max(map(len, chain(training_tokens, test_tokens)))
training_tokens, test_tokens = [
    pad_sequences(sequences, maxlen=max_num_tokens)
    for sequences in (training_tokens, test_tokens)
]

In [None]:
# Integer class ids for each split, looked up through `emoji_to_idx`.
training_labels = np.asarray([emoji_to_idx[em] for em in train_tweets['emoji']])
test_labels = np.asarray([emoji_to_idx[em] for em in test_tweets['emoji']])

In [None]:
def load_weights(tokenizer):
    """Build an embedding matrix for *tokenizer* from the saved Word2Vec model.

    Row ``i`` receives the Word2Vec vector of the word whose tokenizer index
    is ``i``.  Rows for words the Word2Vec model does not know, and rows no
    word maps to, stay all-zero.

    Args:
        tokenizer: fitted tokenizer exposing ``num_words`` and ``word_index``.

    Returns:
        Array of shape ``(tokenizer.num_words, vector_size)``.
    """
    # The loaded model and every later lookup must use the same name; reading
    # a different, undefined name here raised NameError on the first call.
    w2v_model = Word2Vec.load('data/twitter_w2v.model')
    # Word lookups go through the model's `wv` attribute.
    vectors = w2v_model.wv
    w2v = np.zeros((tokenizer.num_words, w2v_model.vector_size))
    for word, idx in tokenizer.word_index.items():
        # Indices at or beyond num_words have no row in the matrix.
        if idx >= tokenizer.num_words:
            continue
        if word in vectors:
            w2v[idx] = vectors[word]
    return w2v

In [None]:
# This may take a while to load

# The loop below reads `model`, so the trained Word2Vec model has to be
# loaded in this cell; without it the cell fails with NameError when the
# notebook is run from top to bottom.
model = Word2Vec.load('data/twitter_w2v.model')

# One row per tokenizer index; rows of words unknown to Word2Vec stay zero.
w2v = np.zeros((tokenizer.num_words, model.vector_size))
found = 0
for k, v in tokenizer.word_index.items():
    # Indices at or beyond num_words have no row in the matrix.
    if v >= tokenizer.num_words:
        continue
    # Word lookups go through `model.wv`.
    if k in model.wv:
        w2v[v] = model.wv[k]
        found += 1
found, tokenizer.num_words


# Word Level CNN

As with our previous task, we can try using more powerful models to classify our text.  In this case, the limited training data and text size limit their effectiveness.

In [None]:
def create_cnn_model(vocab_size, embedding_size=None, embedding_weights=None, drop_out=0.2):
    """Build and compile a word-level CNN classifier over token-id sequences.

    Token ids of length ``max_num_tokens`` are embedded, fed through two
    parallel convolution stacks (kernel widths 2 and 3), global-max-pooled,
    concatenated and classified by a 128-unit ELU layer followed by a softmax
    over ``len(emojis)`` classes.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: embedding width; used only when ``embedding_weights``
            is not given.
        embedding_weights: optional 2-D array of initial embedding weights;
            its second dimension sets the embedding width.
        drop_out: kept for signature compatibility; the current architecture
            has no dropout layer and does not read it.

    Returns:
        The model, compiled with sparse categorical cross-entropy, the
        'rmsprop' optimizer and the 'accuracy' metric.

    Raises:
        ValueError: if neither ``embedding_weights`` nor ``embedding_size``
            is supplied.
    """
    if embedding_weights is None and embedding_size is None:
        raise ValueError('either embedding_weights or embedding_size is required')

    message = Input(shape=(max_num_tokens,), dtype='int32', name='cnn_input')

    # The convolution layer in keras does not support masking, so we just allow
    # the embedding layer to learn an explicit value.
    embedding_kwargs = dict(mask_zero=False, input_dim=vocab_size,
                            trainable=True, name='cnn_embedding')
    if embedding_weights is not None:
        embedding_kwargs['output_dim'] = embedding_weights.shape[1]
        embedding_kwargs['weights'] = [embedding_weights]
    else:
        # No pre-trained vectors: start from a freshly initialised table.
        embedding_kwargs['output_dim'] = embedding_size
    embedding = Embedding(**embedding_kwargs)(message)

    global_pools = []
    for window in (2, 3):
        conv_1x = Conv1D(128, window, activation='relu', padding='valid')(embedding)
        max_pool_1x = MaxPooling1D(2)(conv_1x)
        conv_2x = Conv1D(256, window, activation='relu', padding='valid')(max_pool_1x)
        max_pool_2x = MaxPooling1D(2)(conv_2x)
        conv_3x = Conv1D(256, window, activation='relu', padding='valid')(max_pool_2x)

        global_pools.append(GlobalMaxPooling1D()(conv_3x))

    merged = Concatenate(axis=1)(global_pools)
    fc1 = Dense(units=128, activation='elu')(merged)
    preds = Dense(units=len(emojis), activation='softmax', name='cnn_predictions')(fc1)
    model = Model(
        inputs=[message],
        outputs=[preds],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

# Instantiate the word-level CNN with the Word2Vec-initialised embedding
# matrix and print its layer summary.
cnn_model = create_cnn_model(VOCAB_SIZE, embedding_weights=w2v)
cnn_model.summary()

In [None]:
# Train the word-level CNN for five epochs on the padded token sequences and
# their integer labels.
cnn_model.fit(training_tokens, training_labels, epochs=5)

In [None]:
def create_lstm_model(vocab_size, embedding_size=None, embedding_weights=None):
    """Build and compile a word-level LSTM classifier over token-id sequences.

    Variable-length token ids are embedded, summarised by a single 128-unit
    LSTM (final state only) and classified by a softmax over ``len(emojis)``
    classes.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: embedding width; used only when ``embedding_weights``
            is not given.
        embedding_weights: optional 2-D array of initial embedding weights;
            its second dimension sets the embedding width.

    Returns:
        The model, compiled with sparse categorical cross-entropy, the
        'rmsprop' optimizer and the 'accuracy' metric.

    Raises:
        ValueError: if neither ``embedding_weights`` nor ``embedding_size``
            is supplied.
    """
    if embedding_weights is None and embedding_size is None:
        raise ValueError('either embedding_weights or embedding_size is required')

    message = Input(shape=(None,), dtype='int32', name='lstm_input')

    embedding_kwargs = dict(mask_zero=False, input_dim=vocab_size,
                            trainable=True, name='lstm_embedding')
    if embedding_weights is not None:
        embedding_kwargs['output_dim'] = embedding_weights.shape[1]
        embedding_kwargs['weights'] = [embedding_weights]
    else:
        # No pre-trained vectors: start from a freshly initialised table.
        embedding_kwargs['output_dim'] = embedding_size
    embedding = Embedding(**embedding_kwargs)(message)

    lstm_1 = LSTM(units=128, return_sequences=False)(embedding)
    preds = Dense(units=len(emojis), activation='softmax', name='lstm_predictions')(lstm_1)

    model = Model(
        inputs=[message],
        outputs=[preds],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [None]:
# Instantiate the LSTM with the Word2Vec-initialised embedding matrix and
# print its layer summary.
lstm_model = create_lstm_model(VOCAB_SIZE, embedding_weights=w2v)
lstm_model.summary()

In [None]:
# Train the LSTM for up to 12 epochs in batches of 1024, reusing the `early`
# stopping callback.
lstm_model.fit(training_tokens, training_labels, epochs=12, batch_size=1024, callbacks=[early])

In [None]:
# Evaluate the LSTM on the held-out token sequences and labels.
lstm_model.evaluate(test_tokens, test_labels)

## Comparing our models

Let's compare the predictions from our models on a sample of our data.

In [None]:
# With batch_size=None the generator's first batch is the whole frame in its
# existing order, so this one-hot encodes every test tweet; the labels are
# discarded.
# NOTE(review): only the first 100 rows are used by the prediction cell below;
# encoding just `test_tweets[:100]` would need far less memory -- confirm no
# other consumer needs the full array.
test_char_vectors, _ = next(data_generator(test_tweets, None)) 

In [None]:
# For each model, predict on the first 100 test rows and translate every
# row's highest-scoring class id back into its emoji.
predictions = {}
for label, pred in (
    ('lstm', lstm_model.predict(test_tokens[:100])),
    ('char_cnn', char_cnn_model.predict(test_char_vectors[:100])),
    ('cnn', cnn_model.predict(test_tokens[:100])),
):
    predictions[label] = [emojis[np.argmax(scores)] for scores in pred]

In [None]:
# Side-by-side table for the first 100 test rows: tweet text, true emoji and
# one prediction column per model.
pd.options.display.max_colwidth = 128
test_df = test_tweets[:100].reset_index()
columns = {'content': test_df['text'], 'true': test_df['emoji']}
columns.update(predictions)
eval_df = pd.DataFrame(columns)
eval_df[['content', 'true', 'char_cnn', 'cnn', 'lstm']].head(25)

## Qualitative Evaluation

We can examine some of our error cases by hand.  Often, the models agree when they make mistakes, and the mistakes aren't unreasonable: this task would be challenging even for a human.

In [None]:
# Show the first ten rows where the LSTM's prediction differs from the true
# emoji.
eval_df[eval_df['lstm'] != eval_df['true']].head(10)

In [None]:
def combined_data_generator(tweets, tokens, batch_size):
    """Endlessly yield batches that feed all three models at once.

    Each batch is ``batch_size`` row positions drawn without replacement.
    The yielded pair is ``(inputs, y)`` where ``inputs`` maps the models'
    input-layer names to arrays -- the one-hot character tensor for
    'char_cnn_input' and one shared token-id matrix for both 'cnn_input' and
    'lstm_input' -- and ``y`` holds the ``emoji_to_idx`` id of each row.

    Args:
        tweets: frame with 'text' and 'emoji' columns.
        tokens: array of padded token-id rows, positionally aligned with
            ``tweets``.
        batch_size: rows per batch.
    """
    tweets = tweets.reset_index()
    n_rows = len(tweets)
    while True:
        picked = random.sample(range(n_rows), batch_size)
        tweet_batch = tweets.iloc[picked]
        token_batch = tokens[picked]
        char_vec = np.zeros((batch_size, max_sequence_len, len(chars)))
        token_vec = np.zeros((batch_size, max_num_tokens))
        y = np.zeros((batch_size,))
        rows = zip(token_batch, tweet_batch['emoji'], tweet_batch['text'])
        for row_idx, (token_row, label, text) in enumerate(rows):
            y[row_idx] = emoji_to_idx[label]
            for position, character in enumerate(text):
                char_vec[row_idx, position, char_to_idx[character]] = 1
            token_vec[row_idx, :] = token_row
        inputs = {
            'char_cnn_input': char_vec,
            'cnn_input': token_vec,
            'lstm_input': token_vec,
        }
        yield inputs, y

# Pull one batch of five rows and show the shape of the array fed to the
# LSTM input.
d, y = next(combined_data_generator(train_tweets, training_tokens, 5))
d['lstm_input'].shape

In [None]:
def prediction_layer(model):
    """Return the output of the first layer whose name ends in '_predictions'.

    Raises:
        IndexError: if *model* has no such layer.
    """
    for layer in model.layers:
        if layer.name.endswith('_predictions'):
            return layer.output
    raise IndexError('list index out of range')

def create_ensemble(*models):
    """Combine *models* into one model that averages their predictions.

    The ensemble takes every member's input, in the order given, and outputs
    the element-wise average of the members' '_predictions' layer outputs.

    Returns:
        The model, compiled with sparse categorical cross-entropy, the
        'rmsprop' optimizer and the 'accuracy' metric.
    """
    inputs = []
    predictions = []
    for member in models:
        inputs.append(member.input)
        predictions.append(prediction_layer(member))
    merged = Average()(predictions)
    ensemble_model = Model(inputs=inputs, outputs=[merged])
    ensemble_model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    return ensemble_model


# Average the multi-window character CNN, the word-level CNN and the LSTM in
# a single model, then print its layer summary.
ensemble = create_ensemble(char_cnn_model2, cnn_model, lstm_model)
ensemble.summary()

In [None]:
# Train the ensemble on combined character/token batches, reusing the `early`
# stopping callback.
BATCH_SIZE = 512
ensemble.fit_generator(
    combined_data_generator(train_tweets, training_tokens, BATCH_SIZE),
    epochs=20,
    # `/` yields a float in Python 3, but Keras documents an integer number of
    # steps; round up so the final partial batch's worth of rows is covered.
    steps_per_epoch=int(np.ceil(len(train_tweets) / BATCH_SIZE)),
    verbose=2,
    callbacks=[early]
)

In [None]:
# Evaluate the ensemble on random combined batches of held-out tweets.
ensemble.evaluate_generator(
    combined_data_generator(test_tweets, test_tokens, BATCH_SIZE),
    # `/` yields a float in Python 3, but Keras documents an integer number of
    # steps; round up so the final partial batch's worth of rows is covered.
    steps=int(np.ceil(len(test_tweets) / BATCH_SIZE))
)

In [None]:
# Number of rows in the training split.
len(train_tweets)