In [1]:
do_mount=False
if do_mount:
  from google.colab import drive
  drive.mount('/content/gdrive')

In [2]:
# Parameters
do_save_activations = False

TRAIN_INPUT = 'twitgen_train_201906011956.csv'
VALID_INPUT = 'twitgen_valid_201906011956.csv'
TEST_INPUT = 'twitgen_test_201906011956.csv'
EMBEDDING_DIM = 200
MAXLEN = 50  # Maximum number of words per tweet that will be processed

In [3]:
basepath = '../data/'
glovefile = 'glove.twitter.27B.200d.txt.gz'
glovepath = basepath + glovefile

In [4]:
if do_mount:
  # Get the embedding initialization file
  !cp '$glovepath' .
  !gunzip $glovefile
  !ls -l

In [5]:
import tensorflow as tf
import pandas as pd
import os
import re
import keras
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
from keras.optimizers import Adam, Adagrad
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from datetime import datetime
import string

keras.__version__

Using TensorFlow backend.


'2.2.4'

In [6]:
# Read in the data
df_train = pd.read_csv(basepath+TRAIN_INPUT, index_col=['id','time'], parse_dates=['time'])
df_valid = pd.read_csv(basepath+VALID_INPUT, index_col=['id','time'], parse_dates=['time'])
df_test = pd.read_csv(basepath+TEST_INPUT, index_col=['id','time'], parse_dates=['time'])
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,male
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1
1083596943807393792,2019-05-27 23:27:08+00:00,"Ah, the Flat Earth gambit.\nWell played.",True
815783987784187904,2019-05-24 15:36:01+00:00,Aw ily,False
3458239641,2019-05-24 19:00:37+00:00,I hate being sick,False
1003729876250226688,2019-05-26 12:53:00+00:00,You still didn't' do shit tho. Slow down the...,True
2360143940,2019-05-28 03:50:46+00:00,Harriet Tubman may not be on the $20 bill... f...,False


In [7]:
# Maximum number of words per tweet in each data set
(df_train.text.str.split().apply(len).max(), 
 df_valid.text.str.split().apply(len).max(),
 df_test.text.str.split().apply(len).max())

(34, 30, 31)

In [8]:
# Text Normalization function

# Taken from 
# https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
# which was taken from https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
# but this version no longer does stemming or stop word elmination

# This is for general text, not Twitter-specific.
# Maybe would get a better classifier if we used a Python transaltion of this:
# https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
# but that is arguably outside the scope of this project
# and my initial attempts to use Twitter-specific preprocessing have been unsuccessful


def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text


In [9]:
# Process the data for model input
def get_texts_and_labels(df):
  texts = df['text'].map(lambda x: clean_text(x)).tolist()
  texts = [t.split()[0:MAXLEN] for t in texts]
  labels = df['male'].tolist()
  return texts, labels

train_text, train_label = get_texts_and_labels(df_train)
valid_text, valid_label = get_texts_and_labels(df_valid)
test_text, test_label = get_texts_and_labels(df_test)

max([len(x) for x in train_text]), max([len(x) for x in valid_text]), max([len(x) for x in test_text])

(47, 42, 42)

In [10]:
# Fit tokenizer on training data
tok = Tokenizer()
tok.fit_on_texts(train_text)
vocab_size = len(tok.word_index) + 1

In [11]:
# Tokenize the data
def get_tokenized_texts(texts):
  encoded_docs = tok.texts_to_sequences(texts)
  padded_docs = pad_sequences(encoded_docs, maxlen=MAXLEN, padding='post')
  return padded_docs

docs_train = get_tokenized_texts(train_text)
docs_valid = get_tokenized_texts(valid_text)
docs_test = get_tokenized_texts(test_text)

print(type(docs_train), len(docs_train), len(docs_valid), len(docs_test))
docs_train[0][:10]

<class 'numpy.ndarray'> 34146 10914 10450


array([  956,     1,  4035,  1154, 13312,     8,    98,   732,     0,
           0], dtype=int32)

In [12]:
# Load the whole embedding into memory
embeddings_index = dict()
f = open(glovefile[:-3])
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [13]:
# Create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tok.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [14]:
# NERUAL NETWORK MODEL

# PARAMETERS

batchsize = 512

temporal_dropout = .2
random_dropout = .1
flat_dropout= .3

frozen_lr = 1e-3
frozen_decay = 1e-4
frozen_epochs = 12

unfrozen_lr = 3e-4
unfrozen_decay = 3e-5
unfrozen_epochs = 35
unfrozen_batchsize = batchsize


inputs = layers.Input((MAXLEN,), dtype="int32")

# EMBEDDING BLOCK
raw_embed = layers.Embedding(vocab_size, 
                           EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAXLEN, 
                           trainable=False)(inputs)
embed_random_drop = layers.Dropout(rate=random_dropout)(raw_embed)
embed_time_drop = layers.Dropout(rate=temporal_dropout, 
                       noise_shape=(None, MAXLEN, 1))(embed_random_drop)

pooled = layers.GlobalMaxPooling1D()(embed_time_drop)
drop_pooled = layers.Dropout(rate=flat_dropout)(pooled)
pred = layers.Dense(1, activation='sigmoid')(drop_pooled)

# FINAL MODEL
model = Model(inputs=[inputs], outputs=pred)
model.compile(optimizer=Adam(frozen_lr, decay=frozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 200)           6026200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 200)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 200)           0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dropout_3 (Dropout)  

In [15]:
model.layers

[<keras.engine.input_layer.InputLayer at 0x7f402249c6a0>,
 <keras.layers.embeddings.Embedding at 0x7f40c46b4c50>,
 <keras.layers.core.Dropout at 0x7f4030b02518>,
 <keras.layers.core.Dropout at 0x7f40c46693c8>,
 <keras.layers.pooling.GlobalMaxPooling1D at 0x7f40c4669748>,
 <keras.layers.core.Dropout at 0x7f4030b02a90>,
 <keras.layers.core.Dense at 0x7f402249ce48>]

In [16]:
# Fit the frozen model

model.fit(docs_train, train_label, epochs=frozen_epochs, 
          validation_data=(docs_valid, valid_label), batch_size=batchsize)

Instructions for updating:
Use tf.cast instead.
Train on 34146 samples, validate on 10914 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f4022480a58>

In [17]:
[(l.name, l.trainable) for l in model.layers]

[('input_1', False),
 ('embedding_1', False),
 ('dropout_1', True),
 ('dropout_2', True),
 ('global_max_pooling1d_1', True),
 ('dropout_3', True),
 ('dense_1', True)]

In [18]:
# Unfreeze the embeddings
model.layers[1].trainable = True
model.compile(optimizer=Adam(unfrozen_lr, decay=unfrozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 200)           6026200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 200)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 200)           0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total para

In [19]:
# Fit the unfrozen model

model.fit(docs_train, train_label, epochs=unfrozen_epochs, 
          validation_data=(docs_valid, valid_label), batch_size=unfrozen_batchsize)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 34146 samples, validate on 10914 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f4022483630>