In [1]:
# Switch for Google Colab runs: when True, Google Drive is mounted here and a
# later cell copies and unpacks the GloVe archive into the working directory.
do_mount = False

if do_mount:
    # Imported only on demand so the notebook still runs outside Colab
    from google.colab import drive

    drive.mount('/content/gdrive')

In [2]:
# Parameters

# Input CSV files (their names are appended to `basepath` when they are read)
TRAIN_INPUT = 'twitgen_train_201906011956.csv'
CORPUS = 'twitgen_big_corpus_201907251843.csv'

# Maximum number of words per tweet that will be processed
MAXLEN = 50

# Length of each word-embedding vector
EMBEDDING_DIM = 200

# Width of the 'pooled' layer: the max-pooled and the average-pooled embeddings
# (EMBEDDING_DIM values each) are concatenated
ACTIVATION_DIM = 2 * EMBEDDING_DIM

In [3]:
# Directory prefix used for the input CSVs, the GloVe archive, the saved model
# weights and the activation file written at the end
basepath = '../data/'

# Gzipped GloVe vectors used to initialise the embedding layer
glovefile = 'glove.twitter.27B.200d.txt.gz'
glovepath = '{}{}'.format(basepath, glovefile)

# Weights file loaded into the model before activations are extracted
model_file_name = 'PoolModel_201907041941.h5'

In [4]:
# NOTE: when do_mount is False this cell does nothing, so the uncompressed GloVe
# file must already be present in the working directory for the loader cell
# (which opens `glovefile[:-3]`) to succeed.
if do_mount:
  # Get the embedding initialization file
  # Copy the gzipped archive from `glovepath` into the working directory
  !cp '$glovepath' .
  # Decompress it in place, leaving the file named `glovefile` without '.gz'
  !gunzip $glovefile
  # List the working directory so the extracted file can be checked by eye
  !ls -l

In [5]:
import tensorflow as tf
import pandas as pd
import os
import re
import keras
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
from keras.optimizers import Adam, Adagrad
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from datetime import datetime
import string

keras.__version__

Using TensorFlow backend.


'2.2.4'

In [6]:
# Read in the data
def _read_tweets(filename):
    """Load one tweet CSV from `basepath`, indexed by (id, time) with `time` parsed as dates."""
    return pd.read_csv(basepath + filename,
                       index_col=['id', 'time'],
                       parse_dates=['time'])


df_train = _read_tweets(TRAIN_INPUT)
df_corpus = _read_tweets(CORPUS)

# Preview the first rows of the corpus
df_corpus.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,male
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1
1703564846,2019-05-21 17:50:48+00:00,I prefer tubs.. but nice try jiggly,True
211806644,2019-05-21 17:50:48+00:00,It's the shop. I'd never have paid for 2 year...,False
971515498411241472,2019-05-21 17:50:48+00:00,Why can’t attend Cannes 🎬,False
2430359987,2019-05-21 17:50:49+00:00,"Raspberry gin and tonic, nomnom 😋 beautiful ev...",False
2860188236,2019-05-21 17:50:49+00:00,This is 🏴󠁧󠁢󠁥󠁮󠁧󠁿🇬🇧🏴󠁧󠁢󠁥󠁮󠁧󠁿🇬🇧🏴󠁧󠁢󠁥󠁮󠁧󠁿 here we g...,True


In [7]:
# Maximum number of words per tweet in each data set
# (whitespace-separated words of the raw text, i.e. before clean_text is applied)
tuple(frame.text.str.split().apply(len).max() for frame in (df_train, df_corpus))

(34, 37)

In [8]:
# Text Normalization function

# Taken from 
# https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
# which was taken from https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
# but this version no longer does stemming or stop word elimination

# This is for general text, not Twitter-specific.
# We might get a better classifier if we used a Python translation of this:
# https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
# but that is arguably outside the scope of this project
# and my initial attempts to use Twitter-specific preprocessing have been unsuccessful


def clean_text(text):
    """Normalize a single text string before it is split into tokens.

    The text is lower-cased and its whitespace normalized, characters outside a
    small allowed set become spaces, common English contractions are expanded,
    selected punctuation/operator characters are dropped or padded with spaces,
    and a handful of special-case rewrites are applied. No stemming or stop-word
    removal is performed.

    Returns the normalized string; it can carry a single leading or trailing space.
    """
    # NOTE(review): despite the original "remove punctuation" intent, this call
    # does not strip punctuation. str.translate() receives a plain string as its
    # table, so only characters with code points 0-31 are remapped, each to the
    # punctuation character at that index (tab -> '*', newline -> '+', ...);
    # everything else is left alone. Kept unchanged because the saved model
    # weights were presumably produced with exactly this preprocessing -- confirm
    # before changing it.
    text = text.translate(string.punctuation)

    # Lower-case and squeeze every run of whitespace into one space
    text = " ".join(text.lower().split())

    # Ordered (pattern, replacement) rules; each rule sees the output of the
    # previous one, so the order matters.
    rules = (
        # Anything outside the allowed set becomes a space. Inside the class,
        # `+-=` is a character range, so ':', ';' and '<' survive as well.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        # Contractions
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Punctuation and operators: dropped, or padded so they become tokens
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "10k" -> "10000"
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        # Special cases
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): "\0" is an octal escape, so this matches a NUL character
        # followed by "s" rather than the text "0s" -- probably unintended; confirm.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Finally collapse the whitespace runs introduced above
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text


In [9]:
# Process the data for model input
def get_texts_and_labels(df):
    """Return (token lists, labels) for a tweet DataFrame.

    Every entry of df['text'] is passed through clean_text, split on whitespace
    and cut down to its first MAXLEN tokens. The labels are the values of
    df['male'] as a plain list.
    """
    cleaned = df['text'].map(clean_text).tolist()
    texts = [tweet.split()[:MAXLEN] for tweet in cleaned]
    labels = df['male'].tolist()
    return texts, labels

train_text, train_label = get_texts_and_labels(df_train)
corpus_text, corpus_label = get_texts_and_labels(df_corpus)

# Longest cleaned tweet, in tokens, for each data set (never more than MAXLEN,
# because get_texts_and_labels truncates)
tuple(max(map(len, token_lists)) for token_lists in (train_text, corpus_text))

(47, 50)

In [10]:
# Fit tokenizer on training data
tok = Tokenizer()
tok.fit_on_texts(train_text)

# One more than the number of distinct words, so that index 0 (the value
# pad_sequences pads with by default) also has a row in the embedding matrix
vocab_size = 1 + len(tok.word_index)

In [11]:
# Tokenize the data
def get_tokenized_texts(texts):
    """Encode token lists with the fitted tokenizer and pad them at the end to MAXLEN."""
    return pad_sequences(tok.texts_to_sequences(texts),
                         maxlen=MAXLEN,
                         padding='post')

docs_corpus = get_tokenized_texts(corpus_text)

# Sanity check: the container type, then the first ten token ids of the first tweet
print(type(docs_corpus))
docs_corpus[0, :10]

<class 'numpy.ndarray'>


array([    2,  1984, 24950,    31,   206,   305, 24951,     0,     0,
           0], dtype=int32)

In [12]:
# Load the whole embedding into memory
# The file read is `glovefile` without its '.gz' suffix, i.e. the decompressed
# archive in the working directory. Each line is parsed as a word followed by
# its vector components.
embeddings_index = dict()
# `with` closes the handle even if a line fails to parse, and the explicit
# encoding keeps the read independent of the platform's default text encoding.
with open(glovefile[:-3], encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [13]:
# Create a weight matrix for words in training docs
# Row i receives the pre-trained vector of the word whose tokenizer index is i;
# words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tok.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [14]:
# NEURAL NETWORK MODEL
# Builds the pooling classifier: frozen pre-trained embeddings -> two dropout
# stages -> global max and average pooling (concatenated as 'pooled') -> dropout
# -> a single sigmoid unit.

# PARAMETERS
# NOTE(review): the batch-size and epoch settings below are not referenced by any
# cell visible in this notebook; presumably they mirror the training notebook.

batchsize = 512

# Dropout rates: whole time steps, individual embedding values, pooled vector
temporal_dropout = .2
random_dropout = .1
flat_dropout= .3

# Optimizer settings used while the embedding layer is frozen
frozen_lr = 1e-3
frozen_decay = 1e-4
frozen_epochs = 12

# Optimizer settings used once the embedding layer has been unfrozen
unfrozen_lr = 3e-4
unfrozen_decay = 3e-5
unfrozen_epochs = 25
unfrozen_batchsize = batchsize


# One row of MAXLEN integer token ids per tweet
inputs = layers.Input((MAXLEN,), dtype="int32")

# EMBEDDING BLOCK
# The embedding is initialised from embedding_matrix and starts out frozen
raw_embed = layers.Embedding(vocab_size, 
                           EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAXLEN, 
                           trainable=False)(inputs)
# Element-wise dropout on the embedded sequence
embed_random_drop = layers.Dropout(rate=random_dropout)(raw_embed)
# noise_shape=(None, MAXLEN, 1) shares one dropout mask across the embedding
# axis, so entire time steps (words) are dropped together
embed_time_drop = layers.Dropout(rate=temporal_dropout, 
                       noise_shape=(None, MAXLEN, 1))(embed_random_drop)

# Pool over the time axis in two ways and join the results; the layer is named
# 'pooled' so that its output can be looked up by name later on
max_pooled = layers.GlobalMaxPooling1D()(embed_time_drop)
av_pooled = layers.GlobalAveragePooling1D()(embed_time_drop)
pooled = layers.merge.concatenate([max_pooled,av_pooled], name='pooled')
drop_pooled = layers.Dropout(rate=flat_dropout)(pooled)

# Single sigmoid output unit
pred = layers.Dense(1, activation='sigmoid')(drop_pooled)

# FINAL MODEL
model = Model(inputs=[inputs], outputs=pred)
# Compiled with the frozen-phase optimizer settings
model.compile(optimizer=Adam(frozen_lr, decay=frozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 200)      6026200     input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 50, 200)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_2 (Dropout)  

In [15]:
# Show the layer objects in order; a later cell unfreezes the embeddings by
# position (model.layers[1])
model.layers

[<keras.engine.input_layer.InputLayer at 0x7fcaa896f470>,
 <keras.layers.embeddings.Embedding at 0x7fcaa896fb70>,
 <keras.layers.core.Dropout at 0x7fcaa89832b0>,
 <keras.layers.core.Dropout at 0x7fcaa5345860>,
 <keras.layers.pooling.GlobalMaxPooling1D at 0x7fcaa5345518>,
 <keras.layers.pooling.GlobalAveragePooling1D at 0x7fcaa53459b0>,
 <keras.layers.merge.Concatenate at 0x7fcaa8983320>,
 <keras.layers.core.Dropout at 0x7fcaa40ed2e8>,
 <keras.layers.core.Dense at 0x7fcaa40ede48>]

In [16]:
# Name and trainable flag of every layer, in model order
[(layer.name, layer.trainable) for layer in model.layers]

[('input_1', False),
 ('embedding_1', False),
 ('dropout_1', True),
 ('dropout_2', True),
 ('global_max_pooling1d_1', True),
 ('global_average_pooling1d_1', True),
 ('pooled', True),
 ('dropout_3', True),
 ('dense_1', True)]

In [17]:
# Unfreeze the embeddings
# model.layers[1] is the layer directly after the input, i.e. the Embedding layer
embedding_layer = model.layers[1]
embedding_layer.trainable = True

# Recompile with the unfrozen-phase optimizer settings after changing the flag
unfrozen_optimizer = Adam(unfrozen_lr, decay=unfrozen_decay)
model.compile(optimizer=unfrozen_optimizer,
              loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 200)      6026200     input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 50, 200)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 50, 200)      0           dropout_1[0][0]                  
__________________________________________________________________________________________________
global_max

In [18]:
# Load the saved weights from `basepath` into the model built above
weights_path = basepath + model_file_name
model.load_weights(weights_path)

In [19]:
# Number of tweets passed through the network per call when extracting activations
chunk_size = 1 << 15  # 32768

# Minute-resolution run stamp (YYYYmmddHHMM) that goes into the output file name
timestamp = datetime.now().strftime('%Y%m%d%H%M')

In [20]:
# Backend function that maps (token ids, learning-phase flag) to the output of
# the layer named 'pooled'
layer_name = 'pooled'
inp = model.input
activations = model.get_layer(layer_name).output
get_activations = K.function(inputs=[inp, K.learning_phase()],
                             outputs=[activations])

def get_inference_activations(docs):
    """Return the 'pooled' layer output for `docs`, evaluated with learning phase 0."""
    inference_phase = 0
    (pooled_output,) = get_activations([docs, inference_phase])
    return pooled_output

def save_activations(df, docs, split, timestamp):
    """Compute 'pooled'-layer activations for the rows of `docs` and write them to CSV.

    Args:
        df: DataFrame with one row per tweet, matching `docs` row for row.
            NOTE: it is modified in place -- one integer column 'activ<i>' is
            added (or overwritten) per activation unit.
        docs: 2-D array of padded token ids; rows 0 .. len(df)-1 are used.
        split: data-set label embedded in the output file name.
        timestamp: run stamp embedded in the output file name.

    Side effects:
        Writes basepath + 'pool_activ_<split>_<timestamp>.csv' containing every
        column of `df` except 'text', and prints that file name.
    """
    full_len = df.shape[0]

    # Run the network chunk_size rows at a time and join the pieces once at the
    # end, instead of re-copying the growing result on every iteration. The empty
    # float64 seed keeps the result dtype, and the (0, ACTIVATION_DIM) shape for
    # empty input, the same as accumulating onto an empty array.
    chunks = [np.empty(shape=[0, ACTIVATION_DIM])]
    for start in range(0, full_len, chunk_size):
        end = min(start + chunk_size, full_len)
        chunks.append(get_inference_activations(docs[start:end, :]))
    tweet_activations = np.concatenate(chunks)

    # Stored as integers in units of 1e-6; astype(int) truncates toward zero
    for i in range(tweet_activations.shape[1]):
        df['activ' + str(i)] = (tweet_activations[:, i] * 1e6).astype(int)

    activ_file_name = basepath + 'pool_activ_' + split + '_' + timestamp + '.csv'
    df.drop(['text'], axis=1).to_csv(activ_file_name)
    print(activ_file_name)

# Extract and save the pooled activations for the whole corpus
save_activations(df_corpus, docs_corpus, split='corpus', timestamp=timestamp)


../data/pool_activ_corpus_201907301317.csv
