In [1]:
# Switch for the Google Drive mount; later cells also check this flag.
do_mount = False

if do_mount:
    # Imported only when mounting, so google.colab is not needed otherwise.
    from google.colab import drive

    drive.mount('/content/gdrive')

In [2]:
# Parameters
# File names of the train / validation / test CSV splits; they are appended
# to basepath when the data is read.
TRAIN_INPUT = 'twitgen_train_201906011956.csv'
VALID_INPUT = 'twitgen_valid_201906011956.csv'
TEST_INPUT = 'twitgen_test_201906011956.csv'
# Width of each embedding vector (column count of the embedding matrix);
# the GloVe file named below is the 200d variant.
EMBEDDING_DIM = 200
MAXLEN = 50  # Maximum number of words per tweet that will be processed

In [3]:
# Data directory, GloVe archive and trained-model file used by this notebook.
basepath = '../data/'
glovefile = 'glove.twitter.27B.200d.txt.gz'
glovepath = basepath + glovefile
model_file_name = 'twitgen_lstm_201907261114.h5'

# The timestamp is the last underscore-separated field of the file name,
# once the extension has been removed.
model_stem = model_file_name.rsplit('.', 1)[0]
base_timestamp = model_stem.rsplit('_', 1)[-1]
base_timestamp

'201907261114'

In [4]:
# Runs only when do_mount is set; otherwise nothing is copied or decompressed
# by this cell.
if do_mount:
  # Get the embedding initialization file
  # Copy the gzipped GloVe file from glovepath into the working directory.
  !cp '$glovepath' .
  # Decompress the local copy (the result is the file name without ".gz").
  !gunzip $glovefile
  # List the working directory so the result can be checked by eye.
  !ls -l

In [5]:
import tensorflow as tf
import pandas as pd
import os
import re
import keras
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
from keras.optimizers import Adam, Adagrad
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from datetime import datetime
import string

keras.__version__

Using TensorFlow backend.


'2.2.4'

In [6]:
# Load the three data splits; every file is read with the same options.
def _read_split(file_name):
    """Read one split CSV from basepath, indexed by (id, time) with `time` parsed as dates."""
    return pd.read_csv(basepath + file_name,
                       index_col=['id', 'time'],
                       parse_dates=['time'])


df_train = _read_split(TRAIN_INPUT)
df_valid = _read_split(VALID_INPUT)
df_test = _read_split(TEST_INPUT)
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,male
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1
1083596943807393792,2019-05-27 23:27:08+00:00,"Ah, the Flat Earth gambit.\nWell played.",True
815783987784187904,2019-05-24 15:36:01+00:00,Aw ily,False
3458239641,2019-05-24 19:00:37+00:00,I hate being sick,False
1003729876250226688,2019-05-26 12:53:00+00:00,You still didn't' do shit tho. Slow down the...,True
2360143940,2019-05-28 03:50:46+00:00,Harriet Tubman may not be on the $20 bill... f...,False


In [7]:
# Largest whitespace-delimited word count per data set, as (train, valid, test)
tuple(frame.text.str.split().apply(len).max()
      for frame in (df_train, df_valid, df_test))

(34, 30, 31)

In [8]:
# Text Normalization function

# Taken from 
# https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
# which was taken from https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
# but this version no longer does stemming or stop word elimination

# This is for general text, not Twitter-specific.
# Maybe would get a better classifier if we used a Python translation of this:
# https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
# but that is arguably outside the scope of this project
# and my initial attempts to use Twitter-specific preprocessing have been unsuccessful


# Ordered (pattern, replacement) pairs applied by clean_text. The order is
# significant: each rule runs on the output of the rules before it.
_CLEAN_TEXT_RULES = (
    # Anything outside letters, digits, ^ , ! . / ' and the range '+'..'='
    # becomes a space.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Contractions
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Punctuation: either removed or padded with spaces so it splits off
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # Digits followed by "k" get "000" appended instead of the "k"
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Re-join a few abbreviations that the rules above split apart
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalize one piece of text and return the cleaned string.

    Steps, in order:
      1. ``text.translate(string.punctuation)``
      2. lower-case and collapse all whitespace to single spaces
      3. apply every rule in ``_CLEAN_TEXT_RULES`` with ``re.sub``

    NOTE(review): step 1 does not remove punctuation. ``str.translate`` uses
    its argument as a lookup table indexed by code point, and
    ``string.punctuation`` has 32 characters, so only code points 0-31 are
    affected: each is replaced by the punctuation character at that index
    (for example a newline becomes "+"). Every other character is left alone.
    The call is kept as-is because the tokenizer vocabulary built from this
    output presumably has to match the one the saved model weights were
    trained with -- confirm before changing it.

    The result may keep a single leading or trailing space.
    """
    translated = text.translate(string.punctuation)

    # Lower-case, then split/join to squeeze whitespace before the regex pass
    normalized = " ".join(translated.lower().split())

    for pattern, replacement in _CLEAN_TEXT_RULES:
        normalized = re.sub(pattern, replacement, normalized)

    return normalized


In [9]:
# Process the data for model input
def get_texts_and_labels(df):
    """Return ``(token_lists, labels)`` for one data split.

    The 'text' column is passed through clean_text, split on whitespace and
    cut to at most MAXLEN tokens per row; the 'male' column is returned as a
    plain list.
    """
    cleaned = df['text'].map(clean_text)
    token_lists = [sentence.split()[:MAXLEN] for sentence in cleaned.tolist()]
    return token_lists, df['male'].tolist()

(train_text, train_label), (valid_text, valid_label), (test_text, test_label) = (
    get_texts_and_labels(frame) for frame in (df_train, df_valid, df_test)
)

# Longest token list per split, as (train, valid, test)
tuple(max(map(len, token_lists))
      for token_lists in (train_text, valid_text, test_text))

(47, 42, 42)

In [10]:
# Fit tokenizer on training data
# Only the training split is used here, so the word index is built from
# training tokens alone.
tok = Tokenizer()
tok.fit_on_texts(train_text)
# One more than the number of indexed words. Presumably the extra slot is
# index 0, the value used to pad sequences (Keras documents word_index as
# starting at 1) -- TODO confirm.
vocab_size = len(tok.word_index) + 1

In [11]:
# Tokenize the data
def get_tokenized_texts(texts):
    """Encode token lists with the fitted tokenizer and pad them to MAXLEN.

    Padding is appended after the tokens (``padding='post'``); the padded
    array produced by pad_sequences is returned.
    """
    return pad_sequences(tok.texts_to_sequences(texts),
                         maxlen=MAXLEN,
                         padding='post')

docs_train, docs_valid, docs_test = (
    get_tokenized_texts(token_lists)
    for token_lists in (train_text, valid_text, test_text)
)

# Quick check: container type, row count per split, start of the first row
print(type(docs_train), len(docs_train), len(docs_valid), len(docs_test))
docs_train[0][:10]

<class 'numpy.ndarray'> 34146 10914 10450


array([  956,     1,  4035,  1154, 13312,     8,    98,   732,     0,
           0], dtype=int32)

In [12]:
# Load the whole embedding into memory
# embeddings_index maps each token (first field of a line) to a float32
# vector built from the remaining fields.
embeddings_index = dict()
# glovefile[:-3] drops the ".gz" suffix: the decompressed file is read from
# the working directory. The context manager closes the file even if a line
# fails to parse, and the explicit encoding avoids depending on the locale's
# default when decoding the file.
with open(glovefile[:-3], encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [13]:
# Build the initial embedding weights: one row per tokenizer index.
# Rows for words without a pre-trained vector keep their zero initialisation.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in tok.word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [14]:
# NEURAL NETWORK MODEL


# PARAMETERS
# Batch size shared by the frozen and unfrozen phases (see frozen_batchsize /
# unfrozen_batchsize below).
batchsize = 128

# Number of units for the LSTM layers
lstm_dim = 80
# Number of units in each dense branch fed by the merged LSTM output
residual_connection_width = 40

# Every dropout rate below is a base rate multiplied by this common factor.
dropout_factor = 1.07
spatiotemporal_dropout = 0.25 * dropout_factor
lstm_dropout = 0.3 * dropout_factor
residual_connection_dropout = 0.6 * dropout_factor
final_dropout = 0.7 * dropout_factor

# "Frozen" phase: settings used while the embedding layer is not trainable.
# frozen_epochs is not read by the cells visible here (presumably a training
# setting kept for reference).
base_frozen_lr = 1e-3
base_frozen_decay = 1e-4
frozen_epochs = 35
frozen_batchsize = batchsize

# "Unfrozen" phase: settings used after the embedding layer is made trainable.
# unfrozen_epochs is likewise not read by the cells visible here.
base_unfrozen_lr = 1.3e-4
base_unfrozen_decay = 2.5e-4
unfrozen_epochs = 29
unfrozen_batchsize = batchsize


# The base_* learning rates and decays are expressed for this batch size ...
base_batchsize = 512

# ... and are scaled linearly by (actual batch size / base batch size).
frozen_lr_factor = frozen_batchsize / base_batchsize
unfrozen_lr_factor = unfrozen_batchsize / base_batchsize

frozen_lr = base_frozen_lr * frozen_lr_factor
frozen_decay = base_frozen_decay * frozen_lr_factor

unfrozen_lr = base_unfrozen_lr * unfrozen_lr_factor
unfrozen_decay = base_unfrozen_decay * unfrozen_lr_factor


# One row of MAXLEN integer token ids per example
inputs = layers.Input((MAXLEN,), dtype="int32")


# EMBEDDING BLOCK
# Initialised from embedding_matrix and created non-trainable ("frozen").
raw_embed = layers.Embedding(vocab_size, 
                           EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAXLEN, 
                           trainable=False)(inputs)
# Two dropout layers in sequence at the same rate: the first uses the default
# noise shape, the second passes noise_shape=(None, MAXLEN, 1).
# NOTE(review): per the Keras Dropout docs a size-1 axis in noise_shape shares
# the mask along that axis, so the second layer presumably drops whole
# time steps (all features of a word together) -- confirm.
embed_random_drop = layers.Dropout(rate=spatiotemporal_dropout)(raw_embed)
embed_time_drop = layers.Dropout(rate=spatiotemporal_dropout, 
                       noise_shape=(None, MAXLEN, 1))(embed_random_drop)


# LEFT LSTM BLOCK
# Two stacked LSTMs reading the dropped-out embeddings: a go_backwards layer
# that returns the full sequence, then a second layer that returns only its
# final output.
# NOTE(review): Keras documents go_backwards as returning the reversed
# sequence, so the layer stacked on top presumably receives time-reversed
# steps -- confirm this is intended.

# Backward LSTM layer
lstm_bottom_left = layers.LSTM(lstm_dim, return_sequences=True, 
                               go_backwards=True, dropout=lstm_dropout, 
                               recurrent_dropout=lstm_dropout)(embed_time_drop)
# Same pair of dropout layers as after the embedding (default noise shape,
# then noise_shape=(None, MAXLEN, 1)).
lstm_random_drop_left = layers.Dropout(rate=spatiotemporal_dropout)(lstm_bottom_left)
lstm_time_drop_left = layers.Dropout(rate=spatiotemporal_dropout, 
                            noise_shape=(None,MAXLEN,1))(lstm_random_drop_left)
# Forward LSTM layer
lstm_top_left = layers.LSTM(lstm_dim, return_sequences=False, dropout=lstm_dropout, 
                            recurrent_dropout=lstm_dropout)(lstm_time_drop_left)


# RIGHT LSTM BLOCK
# Mirror image of the left block: the bottom layer runs forward and the top
# layer uses go_backwards.

# Forward LSTM layer
lstm_bottom_right = layers.LSTM(lstm_dim, return_sequences=True, dropout=lstm_dropout, 
                                recurrent_dropout=lstm_dropout)(embed_time_drop)
lstm_random_drop_right = layers.Dropout(rate=spatiotemporal_dropout)(lstm_bottom_right)
lstm_time_drop_right = layers.Dropout(rate=spatiotemporal_dropout, 
                            noise_shape=(None,MAXLEN,1))(lstm_random_drop_right)
# Backward LSTM layer
# Uses lstm_dim like the other three LSTM layers (previously a literal 80,
# which is the same value, so the layer shape is unchanged).
lstm_top_right = layers.LSTM(lstm_dim, return_sequences=False, 
                             go_backwards=True, dropout=lstm_dropout, 
                             recurrent_dropout=lstm_dropout)(lstm_time_drop_right)


# MERGE LEFT AND RIGHT BLOCK
# Joins the final outputs of the two LSTM stacks into one vector.
merged_lstm = layers.merge.concatenate([lstm_top_left, lstm_top_right])


# Two parallel branches with the same structure, each applied to merged_lstm:
# dropout followed by a ReLU dense layer of residual_connection_width units.

# LEFT RESIDUAL BRANCH
dropout_resid = layers.Dropout(rate=residual_connection_dropout)(merged_lstm)
dense_resid = layers.Dense(residual_connection_width, activation='relu')(dropout_resid)

# RIGHT RESIDUAL BRANCH
dropout_resid2 = layers.Dropout(rate=residual_connection_dropout)(merged_lstm)
dense_resid2 = layers.Dense(residual_connection_width, activation='relu')(dropout_resid2)


# FINAL DENSE BLOCK
# The merged LSTM output is concatenated with both dense branches. The layer
# is named explicitly ('final_concat') so it can be looked up by name later.
merged_resid = layers.merge.concatenate([merged_lstm, dense_resid, dense_resid2],
                                       name='final_concat')
dropout = layers.Dropout(rate=final_dropout)(merged_resid)
# Single sigmoid unit: one probability-like output per example
pred = layers.Dense(1, activation='sigmoid')(dropout)


# FINAL MODEL
# Compiled with the frozen-phase learning rate and decay; at this point the
# embedding layer is still non-trainable.
model = Model(inputs=[inputs], outputs=pred)
model.compile(optimizer=Adam(frozen_lr, decay=frozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 200)      6026200     input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 50, 200)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_2 (Dropout)  

In [15]:
# Show the model's layer objects in the order the model stores them
model.layers

[<keras.engine.input_layer.InputLayer at 0x7f1330f84278>,
 <keras.layers.embeddings.Embedding at 0x7f1330f842e8>,
 <keras.layers.core.Dropout at 0x7f1330f848d0>,
 <keras.layers.core.Dropout at 0x7f1344a50400>,
 <keras.layers.recurrent.LSTM at 0x7f1330f84748>,
 <keras.layers.recurrent.LSTM at 0x7f132c09bd68>,
 <keras.layers.core.Dropout at 0x7f1330f84898>,
 <keras.layers.core.Dropout at 0x7f13006ea5f8>,
 <keras.layers.core.Dropout at 0x7f132c09ba90>,
 <keras.layers.core.Dropout at 0x7f13003752e8>,
 <keras.layers.recurrent.LSTM at 0x7f132c09bef0>,
 <keras.layers.recurrent.LSTM at 0x7f13003754a8>,
 <keras.layers.merge.Concatenate at 0x7f1300313780>,
 <keras.layers.core.Dropout at 0x7f12e4484d30>,
 <keras.layers.core.Dropout at 0x7f12d47a8860>,
 <keras.layers.core.Dense at 0x7f12e4484eb8>,
 <keras.layers.core.Dense at 0x7f12e40d52e8>,
 <keras.layers.merge.Concatenate at 0x7f12e412fa20>,
 <keras.layers.core.Dropout at 0x7f12d471ac88>,
 <keras.layers.core.Dense at 0x7f12e44e20f0>]

In [16]:
# Pair every layer name with its trainable flag
[(layer.name, layer.trainable) for layer in model.layers]

[('input_1', False),
 ('embedding_1', False),
 ('dropout_1', True),
 ('dropout_2', True),
 ('lstm_1', True),
 ('lstm_3', True),
 ('dropout_3', True),
 ('dropout_5', True),
 ('dropout_4', True),
 ('dropout_6', True),
 ('lstm_2', True),
 ('lstm_4', True),
 ('concatenate_1', True),
 ('dropout_7', True),
 ('dropout_8', True),
 ('dense_1', True),
 ('dense_2', True),
 ('final_concat', True),
 ('dropout_9', True),
 ('dense_3', True)]

In [17]:
# Unfreeze the embeddings
# Pick the embedding layer by type rather than by position, so a change in
# layer order cannot silently unfreeze a different layer.
embedding_layers = [layer for layer in model.layers
                    if isinstance(layer, layers.Embedding)]
assert len(embedding_layers) == 1, 'expected exactly one Embedding layer'
embedding_layers[0].trainable = True
# Recompile with the unfrozen-phase learning rate and decay. (Keras documents
# that a changed `trainable` flag only takes effect after compile().)
model.compile(optimizer=Adam(unfrozen_lr, decay=unfrozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 200)      6026200     input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 50, 200)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 50, 200)      0           dropout_1[0][0]                  
__________________________________________________________________________________________________
lstm_1 (LS

In [18]:
# Load the saved weights (file under basepath) into the model built above
model.load_weights(basepath+model_file_name)

In [19]:
# Output files are tagged with the timestamp taken from the model file name
timestamp = base_timestamp

In [20]:
# Build a backend function that takes [input batch, learning-phase flag] and
# returns the output of the layer named 'final_concat'.
inp = model.input
layer_name = 'final_concat'
activations = model.get_layer(layer_name).output
get_activations = K.function([inp, K.learning_phase()], [activations])

def get_inference_activations(docs):
    """Return the 'final_concat' activations for ``docs``.

    The learning-phase flag is passed as 0 (Keras documents 0 as test mode).
    """
    inference_phase = 0
    # get_activations was built with a single output tensor
    (layer_output,) = get_activations([docs, inference_phase])
    return layer_output

def save_activations(df, docs, split, timestamp):
    """Write the activations for one split to a CSV file and print its path.

    Args:
        df: frame for the split; must contain a 'text' column, which is left
            out of the output. It is not modified.
        docs: padded token-id array for the same rows, in the same order.
        split: split name used in the file name (e.g. 'train').
        timestamp: string used in the file name.

    The file is ``basepath + 'activ_<split>_<timestamp>.csv'``. It holds the
    frame's index and remaining columns followed by one ``activ<i>`` column
    per activation, stored as the activation times 1e6 converted to int.
    """
    split_activations = get_inference_activations(docs)

    # Scale and convert the whole array once instead of once per column
    scaled = (split_activations * 1e6).astype(int)

    # drop() returns a new frame, so the columns are added to a copy and the
    # caller's frame keeps its original columns.
    out = df.drop(['text'], axis=1)
    for i in range(scaled.shape[1]):
        out['activ' + str(i)] = scaled[:, i]

    activ_file_name = basepath + 'activ_' + split + '_' + timestamp + '.csv'
    out.to_csv(activ_file_name)
    print(activ_file_name)

# Export every split in turn: train, then valid, then test
for split_name, split_frame, split_docs in (
        ('train', df_train, docs_train),
        ('valid', df_valid, docs_valid),
        ('test', df_test, docs_test)):
    save_activations(split_frame, split_docs, split_name, timestamp)


../data/activ_train_201907261114.csv
../data/activ_valid_201907261114.csv
../data/activ_test_201907261114.csv
