<a href="https://colab.research.google.com/github/andyharless/twit_demog/blob/master/twitgen_glovinit_best_dl_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Connect to data files
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Parameters
TRAIN_INPUT = 'twitgen_train_201906011956.csv'
VALID_INPUT = 'twitgen_valid_201906011956.csv'
TEST_INPUT = 'twitgen_test_201906011956.csv'
EMBEDDING_DIM = 200
MAXLEN = 50  # Maximum number of words per tweet that will be processed

In [3]:
import tensorflow as tf
import pandas as pd
import os
import re
import keras
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
from keras.optimizers import Adam, Adagrad
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

keras.__version__

Using TensorFlow backend.


'2.2.4'

In [0]:
basepath = '/content/gdrive/My Drive/twitgen/'
glovefile = 'glove.twitter.27B.200d.txt.gz'
glovepath = basepath + glovefile

In [5]:
# Get the embedding initialization file
!cp '$glovepath' .
!gunzip $glovefile
!ls -l

total 2009380
drwx------ 3 root root       4096 Jun  8 19:28 gdrive
-rw------- 1 root root 2057590469 Jun  8 19:28 glove.twitter.27B.200d.txt
drwxr-xr-x 1 root root       4096 May 31 16:17 sample_data


In [6]:
# Read in the data
df_train = pd.read_csv(basepath+TRAIN_INPUT, index_col=['id','time'], parse_dates=['time'])
df_valid = pd.read_csv(basepath+VALID_INPUT, index_col=['id','time'], parse_dates=['time'])
df_test = pd.read_csv(basepath+TEST_INPUT, index_col=['id','time'], parse_dates=['time'])
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,male
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1
1083596943807393792,2019-05-27 23:27:08+00:00,"Ah, the Flat Earth gambit.\nWell played.",True
815783987784187904,2019-05-24 15:36:01+00:00,Aw ily,False
3458239641,2019-05-24 19:00:37+00:00,I hate being sick,False
1003729876250226688,2019-05-26 12:53:00+00:00,You still didn't' do shit tho. Slow down the...,True
2360143940,2019-05-28 03:50:46+00:00,Harriet Tubman may not be on the $20 bill... f...,False


In [7]:
# Maximum number of words per tweet in each data set
(df_train.text.str.split().apply(len).max(), 
 df_valid.text.str.split().apply(len).max(),
 df_test.text.str.split().apply(len).max())

(34, 30, 31)

In [0]:
# Text Normalization function

# Taken from 
# https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
# which was taken from https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
# but this version no longer does stemming or stop word elmination

# This is for general text, not Twitter-specific.
# Probably would get a better classifier if we used a Python transaltion of this:
# https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
# but that is arguably outside the scope of this project

def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text


In [9]:
# Process the data for model input
def get_texts_and_labels(df):
  texts = df['text'].map(lambda x: clean_text(x)).tolist()
  texts = [t.split()[0:MAXLEN] for t in texts]
  labels = df['male'].tolist()
  return texts, labels

train_text, train_label = get_texts_and_labels(df_train)
valid_text, valid_label = get_texts_and_labels(df_valid)
test_text, test_label = get_texts_and_labels(df_test)

max([len(x) for x in train_text]), max([len(x) for x in valid_text]), max([len(x) for x in test_text])

(47, 42, 42)

In [0]:
# Fit tokenizer on training data
tok = Tokenizer()
tok.fit_on_texts(train_text)
vocab_size = len(tok.word_index) + 1

In [11]:
# Tokenize the data
def get_tokenized_texts(texts):
  encoded_docs = tok.texts_to_sequences(texts)
  padded_docs = pad_sequences(encoded_docs, maxlen=MAXLEN, padding='post')
  return padded_docs

docs_train = get_tokenized_texts(train_text)
docs_valid = get_tokenized_texts(valid_text)
docs_test = get_tokenized_texts(test_text)

print(type(docs_train), len(docs_train), len(docs_valid), len(docs_test))
docs_train[0][:10]

<class 'numpy.ndarray'> 34146 10914 10450


array([  956,     1,  4035,  1154, 13312,     8,    98,   732,     0,
           0], dtype=int32)

In [12]:
# Load the whole embedding into memory
embeddings_index = dict()
f = open(glovefile[:-3])
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [0]:
# Create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tok.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [14]:
# NERUAL NETWORK MODEL


# PARAMETERS
batchsize = 512

lstm_dim = 80
residual_connection_width = 40

spatiotemporal_dropout = 0.20
lstm_dropout = 0.35
residual_connection_dropout = 0.50
final_dropout = 0.70

frozen_lr = 1e-3
frozen_decay = 1e-5
frozen_epochs = 2
frozen_batchsize = batchsize

unfrozen_lr = 2.7e-4
unfrozen_decay = 5.9e-5
unfrozen_epochs = 75 
unfrozen_batchsize = batchsize

inputs = layers.Input((MAXLEN,), dtype="int32")


# EMBEDDING BLOCK
raw_embed = layers.Embedding(vocab_size, 
                           EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAXLEN, 
                           trainable=False)(inputs)
embed_random_drop = layers.Dropout(rate=spatiotemporal_dropout)(raw_embed)
embed_time_drop = layers.Dropout(rate=spatiotemporal_dropout, 
                       noise_shape=(None, MAXLEN, 1))(embed_random_drop)


# LEFT LSTM BLOCK

# Backward LSTM layer
lstm_bottom_left = layers.LSTM(lstm_dim, return_sequences=True, 
                               go_backwards=True, dropout=lstm_dropout, 
                               recurrent_dropout=lstm_dropout)(embed_time_drop)
lstm_random_drop_left = layers.Dropout(rate=spatiotemporal_dropout)(lstm_bottom_left)
lstm_time_drop_left = layers.Dropout(rate=spatiotemporal_dropout, 
                            noise_shape=(None,MAXLEN,1))(lstm_random_drop_left)
# Forward LSTM layer
lstm_top_left = layers.LSTM(lstm_dim, return_sequences=False, dropout=lstm_dropout, 
                            recurrent_dropout=lstm_dropout)(lstm_time_drop_left)


# RIGHT LSTM BLOCK

# Forward LSTM layer
lstm_bottom_right = layers.LSTM(lstm_dim, return_sequences=True, dropout=lstm_dropout, 
                                recurrent_dropout=lstm_dropout)(embed_time_drop)
lstm_random_drop_right = layers.Dropout(rate=spatiotemporal_dropout)(lstm_bottom_right)
lstm_time_drop_right = layers.Dropout(rate=spatiotemporal_dropout, 
                            noise_shape=(None,MAXLEN,1))(lstm_random_drop_right)
# Backward LSTM layer
lstm_top_right = layers.LSTM(80, return_sequences=False, 
                             go_backwards=True, dropout=lstm_dropout, 
                             recurrent_dropout=lstm_dropout)(lstm_time_drop_right)


# MERGE LEFT AND RIGHT BLOCK
merged_lstm = layers.merge.concatenate([lstm_top_left, lstm_top_right])


# RESIDUAL BLOCK
dropout_resid = layers.Dropout(rate=residual_connection_dropout)(merged_lstm)
dense_resid = layers.Dense(residual_connection_width, activation='relu')(dropout_resid)


# FINAL DENSE BLOCK
merged_resid = layers.merge.concatenate([merged_lstm, dense_resid])
dropout = layers.Dropout(rate=final_dropout)(merged_resid)
pred = layers.Dense(1, activation='sigmoid')(dropout)


# FINAL MODEL
model = Model(inputs=[inputs], outputs=pred)
model.compile(optimizer=Adam(frozen_lr, decay=frozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 200)      6026200     input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 50, 200)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_2 (Dropout)  

In [15]:
# Fit the frozen model

model.fit(docs_train, train_label, epochs=frozen_epochs, 
          validation_data=(docs_valid, valid_label), batch_size=batchsize)

Instructions for updating:
Use tf.cast instead.
Train on 34146 samples, validate on 10914 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f58932c66a0>

In [16]:
# Fit the unfrozen model

model.layers[0].trainable = True
model.compile(optimizer=Adam(unfrozen_lr, decay=unfrozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.fit(docs_train, train_label, epochs=unfrozen_epochs, 
          validation_data=(docs_valid, valid_label), batch_size=unfrozen_batchsize)

Train on 34146 samples, validate on 10914 samples
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x7f5858cddc18>