In [1]:
do_mount=False
have_glove_embeddings=True
if do_mount:
  from google.colab import drive
  drive.mount('/content/gdrive')

In [2]:
# Parameters
do_save_activations = False

TRAIN_INPUT = 'twitgen_train_201906011956.csv'
VALID_INPUT = 'twitgen_valid_201906011956.csv'
TEST_INPUT = 'twitgen_test_201906011956.csv'
EMBEDDING_DIM = 200
MAXLEN = 25  # Maximum number of words per tweet that will be processed

In [3]:
basepath = '../data/'
if have_glove_embeddings:
    embed_file = 'glove_train_embeddings.pkl.gz'
else:
    glovepath = basepath + glovefile
    glovefile = 'glove.twitter.27B.200d.txt.gz'

In [4]:
if do_mount and not have_glove_embeddings:
  # Get the embedding initialization file
  !cp '$glovepath' .
  !gunzip $glovefile
  !ls -l

In [5]:
import tensorflow as tf
import pandas as pd
import os
import re
import keras
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
from keras.optimizers import Adam, Adagrad
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from datetime import datetime
from collections import Counter
import string

keras.__version__

Using TensorFlow backend.


'2.2.4'

In [6]:
# Read in the data
df_train = pd.read_csv(basepath+TRAIN_INPUT, index_col=['id','time'], parse_dates=['time'])
df_valid = pd.read_csv(basepath+VALID_INPUT, index_col=['id','time'], parse_dates=['time'])
df_test = pd.read_csv(basepath+TEST_INPUT, index_col=['id','time'], parse_dates=['time'])
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,male
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1
1083596943807393792,2019-05-27 23:27:08+00:00,"Ah, the Flat Earth gambit.\nWell played.",True
815783987784187904,2019-05-24 15:36:01+00:00,Aw ily,False
3458239641,2019-05-24 19:00:37+00:00,I hate being sick,False
1003729876250226688,2019-05-26 12:53:00+00:00,You still didn't' do shit tho. Slow down the...,True
2360143940,2019-05-28 03:50:46+00:00,Harriet Tubman may not be on the $20 bill... f...,False


In [7]:
# Maximum number of words per tweet in each data set
(df_train.text.str.split().apply(len).max(), 
 df_valid.text.str.split().apply(len).max(),
 df_test.text.str.split().apply(len).max())

(34, 30, 31)

In [8]:
# Text Normalization function

# Taken from 
# https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
# which was taken from https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
# but this version no longer does stemming or stop word elmination

# This is for general text, not Twitter-specific.
# Maybe would get a better classifier if we used a Python transaltion of this:
# https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
# but that is arguably outside the scope of this project
# and my initial attempts to use Twitter-specific preprocessing have been unsuccessful


def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text


In [9]:
# Process the data for model input
def get_texts_and_labels(df):
  texts = df['text'].map(lambda x: clean_text(x)).tolist()
  texts = [t.split()[0:MAXLEN] for t in texts]
  labels = df['male'].tolist()
  return texts, labels

train_text, train_label = get_texts_and_labels(df_train)
valid_text, valid_label = get_texts_and_labels(df_valid)
test_text, test_label = get_texts_and_labels(df_test)

max([len(x) for x in train_text]), max([len(x) for x in valid_text]), max([len(x) for x in test_text])

(25, 25, 25)

In [10]:
# Fit tokenizer on training data
tok = Tokenizer()
tok.fit_on_texts(train_text)
vocab_size = len(tok.word_index) + 1
vocab_size

30027

In [11]:
def flatten_text(text):
    return([word for doc in text for word in doc])

In [12]:
flat_train = flatten_text(train_text)
len(flat_train)

393894

In [13]:
train_counts = Counter(flat_train)
len(train_counts)

30026

In [41]:
train_counts.most_common(50)

[('the', 11704),
 ('i', 10906),
 ('to', 8710),
 ('!', 8138),
 ('a', 7826),
 ('you', 6848),
 ('and', 6220),
 ('+', 5574),
 ('of', 5082),
 ('is', 5052),
 ('it', 4788),
 ('in', 4439),
 ('for', 4260),
 ('that', 3679),
 ('this', 3466),
 ('my', 3315),
 ('not', 3122),
 ('on', 2946),
 ('-', 2510),
 ('are', 2506),
 ('me', 2458),
 ('have', 2301),
 ('be', 2183),
 (':', 2181),
 ('s', 2179),
 ('with', 2139),
 ('so', 2031),
 ('t', 1896),
 ('your', 1862),
 ('just', 1807),
 ('but', 1746),
 ('do', 1718),
 ('was', 1706),
 ('we', 1700),
 ('what', 1661),
 ('he', 1635),
 ('at', 1600),
 ('all', 1558),
 ('like', 1540),
 ('can', 1401),
 ('they', 1395),
 ('one', 1326),
 ('if', 1267),
 ('out', 1227),
 ('will', 1172),
 ('from', 1161),
 ('as', 1149),
 ('by', 1123),
 ('am', 1102),
 ('more', 1093)]

In [15]:
len(train_text), len(train_text[0])

(34146, 8)

In [16]:
train_text[0]

['ah', 'the', 'flat', 'earth', 'gambit', '+', 'well', 'played']

In [17]:
# Tokenize the data
def get_tokenized_texts(texts):
  encoded_docs = tok.texts_to_sequences(texts)
  padded_docs = pad_sequences(encoded_docs, maxlen=MAXLEN, padding='post')
  return padded_docs

docs_train = get_tokenized_texts(train_text)
docs_valid = get_tokenized_texts(valid_text)
docs_test = get_tokenized_texts(test_text)

print(type(docs_train), len(docs_train), len(docs_valid), len(docs_test))
docs_train[0][:10]

<class 'numpy.ndarray'> 34146 10914 10450


array([  952,     1,  4021,  1149, 13275,     8,    98,   731,     0,
           0], dtype=int32)

In [18]:
if have_glove_embeddings:
    embed_df = pd.read_pickle('../data/glove_train_embeddings.pkl.gz')
    embeddings_index = {word:embedding.values for word,embedding in embed_df.iterrows()}
else:
    # Load the whole embedding into memory
    embeddings_index = dict()
    f = open(glovefile[:-3])
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 25458 word vectors.


In [19]:
len(embeddings_index)

25458

In [20]:
it = iter(embeddings_index)
for i in range(3):
    word = next(it)
    emb = embeddings_index[word]
    print(word, type(emb), emb.shape)

: <class 'numpy.ndarray'> (200,)
rt <class 'numpy.ndarray'> (200,)
! <class 'numpy.ndarray'> (200,)


In [21]:
# Create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tok.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [22]:
embedding_matrix.shape

(30027, 200)

In [23]:
sums = np.sum(embedding_matrix,axis=1)
sums.shape

(30027,)

In [24]:
len(sums)-np.sum(sums==0.)

25379

In [67]:
def normalize(vector):
    norm = np.linalg.norm(vector)
    if norm==0:
        return(vector)
    else:
        return(vector/norm)
normed_embeddings_index = {word:normalize(embeddings_index[word]) for word in embeddings_index}
words = list(normed_embeddings_index.keys())
type(words), len(words), words[:3]

(list, 25458, [':', 'rt', '!'])

In [68]:
e = normed_embeddings_index[':']
type(e), len(e), e[:3]

(numpy.ndarray, 200, array([0.12839531, 0.07914481, 0.01314618]))

In [69]:
normed_embeddings = [normed_embeddings_index[word] for word in words]
type(normed_embeddings), len(normed_embeddings), type(normed_embeddings[0]), len(normed_embeddings[0])

(list, 25458, numpy.ndarray, 200)

In [70]:
normed_embeddings_matrix = np.array([normed_embeddings_index[word] for word in words])
normed_embeddings_matrix.shape

(25458, 200)

In [71]:
normed_embedding = normed_embeddings_index[':'].reshape((-1,1))
normed_embedding.shape

(200, 1)

In [72]:
len(words)

25458

In [87]:
def get_cosine_similarities(word):
    normed_embedding = normed_embeddings_index[word]
    return(np.dot(normed_embeddings_matrix,normed_embedding))

In [53]:
len(normed_embeddings_index)

25458

In [74]:
cs1 = get_cosine_similarities(':')
cs1.shape

(25458, 200) (200,)


(25458,)

In [75]:
cs1[:5]

array([1.        , 0.73673689, 0.54241606, 0.51215136, 0.49458044])

In [76]:
words[:5]

[':', 'rt', '!', 'i', 'a']

In [77]:
sorted = np.argsort(cs1)
sorted[:5]

array([25371, 25333, 25418, 22810, 25363])

In [78]:
len(sorted_list), type(sorted_list[0])

(25458, int)

In [79]:
[(words[i],cs1[i]) for i in np.argsort(cs1)][-2]

('rt', 0.7367368930897619)

In [80]:
normed_embeddings_matrix.shape

(25458, 200)

In [38]:
# NERUAL NETWORK MODEL

# PARAMETERS

batchsize = 512

temporal_dropout = .2
random_dropout = .1
flat_dropout = .3
#max_dropout= .3
#av_dropout = .3

base_frozen_lr = 1e-3
base_frozen_decay = 1e-4
frozen_epochs = 12
frozen_batchsize = batchsize

base_unfrozen_lr = 3e-4
base_unfrozen_decay = 3e-5
unfrozen_epochs = 20
unfrozen_batchsize = batchsize


base_batchsize = 512

frozen_lr_factor = frozen_batchsize / base_batchsize
unfrozen_lr_factor = unfrozen_batchsize / base_batchsize

frozen_lr = base_frozen_lr * frozen_lr_factor
frozen_decay = base_frozen_decay * frozen_lr_factor

unfrozen_lr = base_unfrozen_lr * unfrozen_lr_factor
unfrozen_decay = base_unfrozen_decay * unfrozen_lr_factor



inputs = layers.Input((MAXLEN,), dtype="int32")

# EMBEDDING BLOCK
raw_embed = layers.Embedding(vocab_size, 
                           EMBEDDING_DIM,
                           weights=[embedding_matrix], 
                           input_length=MAXLEN, 
                           trainable=False)(inputs)
embed_random_drop = layers.Dropout(rate=random_dropout)(raw_embed)
embed_time_drop = layers.Dropout(rate=temporal_dropout, 
                       noise_shape=(None, MAXLEN, 1))(embed_random_drop)

#inversion = layers.Lambda(lambda x: -x)(embed_time_drop)

max_pooled = layers.GlobalMaxPooling1D()(embed_time_drop)
#min_pooled = layers.GlobalMaxPooling1D()(inversion)
av_pooled = layers.GlobalAveragePooling1D()(embed_time_drop)

#drop_max_pooled = layers.Dropout(rate=max_dropout)(max_pooled)
#drop_av_pooled = layers.Dropout(rate=av_dropout)(av_pooled)

#pooled = layers.merge.concatenate([max_pooled,min_pooled,av_pooled])
pooled = layers.merge.concatenate([max_pooled,av_pooled])
#drop_pooled = layers.merge.concatenate([drop_max_pooled,drop_av_pooled])
drop_pooled = layers.Dropout(rate=flat_dropout)(pooled)
pred = layers.Dense(1, activation='sigmoid')(drop_pooled)

# FINAL MODEL
model = Model(inputs=[inputs], outputs=pred)
model.compile(optimizer=Adam(frozen_lr, decay=frozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])

model.fit(docs_train, train_label, epochs=frozen_epochs, 
          validation_data=(docs_valid, valid_label), batch_size=frozen_batchsize)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 34146 samples, validate on 10914 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f4d498c7438>

In [39]:
# Unfreeze the embeddings
model.layers[1].trainable = True
model.compile(optimizer=Adam(unfrozen_lr, decay=unfrozen_decay), 
              loss='binary_crossentropy', metrics=['acc'])
model.fit(docs_train, train_label, epochs=unfrozen_epochs, 
          validation_data=(docs_valid, valid_label), batch_size=unfrozen_batchsize)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 34146 samples, validate on 10914 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4d499894a8>

In [81]:
embeddings = model.layers[1].get_weights()[0]
words_embeddings = {w:embeddings[idx] for w, idx in tok.word_index.items()}

In [82]:
normed_words_embeddings = {word:normalize(words_embeddings[word]) for word in words_embeddings}
new_words = list(normed_words_embeddings.keys())
new_normed_embeddings = [normed_words_embeddings[word] for word in new_words]
normed_words_embeddings_matrix = np.array([normed_words_embeddings[word] for word in new_words])

In [83]:
normed_embeddings_matrix.shape

(25458, 200)

In [84]:
def get_new_cosine_similarities(word):
    normed_embedding = normed_words_embeddings[word]
    return(np.dot(normed_words_embeddings_matrix,normed_embedding))

In [124]:
def get_gender_prediction_all(word):
    t  = get_tokenized_texts([" ".join(25*[word])])
    return(model.predict(t)[0][0])

In [154]:
n = 10000
for word, count in train_counts.most_common(n):
    if word in embeddings_index and word in words_embeddings:
        cs1 = get_cosine_similarities(word)
        ordered = [(words[i],cs1[i]) for i in np.argsort(cs1)]
        closest, cs_1 = ordered[-2]
        others = ordered[-5:-2]
        cs2 = get_new_cosine_similarities(word)
        new_closest, cs_2 = [(new_words[i],cs2[i]) for i in np.argsort(cs2)][-2]
        if closest != new_closest and cs_1 > .8:
            pred0 = int(100*get_gender_prediction_all(word))
            pred1 = int(100*get_gender_prediction_all(closest))
            if (abs(pred0-pred1) > 20):
                print( '\nWord: ', word.ljust(10), '{:4d}'.format(pred0), 
                      '       Closest: ',  closest.ljust(10), '{:4d}'.format(pred1))
                      #'  Closest in gender ID: ', new_closest)


Word:  more          1        Closest:  than         97

Word:  she           0        Closest:  said         75

Word:  need          2        Closest:  get          54

Word:  got          95        Closest:  get          54

Word:  every        76        Closest:  this         12

Word:  week         59        Closest:  weeks         6

Word:  amazing       0        Closest:  awesome      40

Word:  which        59        Closest:  also         36

Word:  since        92        Closest:  already      13

Word:  anything     22        Closest:  nothing      93

Word:  tonight      61        Closest:  night         1

Word:  wrong        87        Closest:  thing        38

Word:  wanna         0        Closest:  gonna        49

Word:  says         80        Closest:  tells        14

Word:  taking        5        Closest:  take         83

Word:  seems        90        Closest:  seemed       44

Word:  until        18        Closest:  till         57

Word:  thinking      2        

In [140]:
get_both('awesome'), get_both('amazing')

((0.40717125, 0.51067287), (0.0016183348, 0.34136224))

In [125]:
def get_gender_prediction_one(word):
    t  = get_tokenized_texts([word])
    return(model.predict(t)[0][0])

In [129]:
def get_both(word):
    return(get_gender_prediction_all(word), get_gender_prediction_one(word))

In [130]:
get_both('love')

(0.0018903546, 0.3798359)

In [131]:
get_both('until')

(0.18754022, 0.47223723)

In [132]:
get_both('honor')

(0.78357667, 0.59650254)

In [133]:
get_both('because')

(0.05627837, 0.4640955)

In [134]:
get_both('not')

(0.6312561, 0.5251856)

In [135]:
get_both('omg')

(2.8351282e-05, 0.2348928)

In [136]:
get_both('says'), get_both('tells')

((0.8086454, 0.5406278), (0.14933832, 0.46669975))

In [137]:
get_both('latest'), get_both('recent')

((0.99757904, 0.66125166), (0.48146516, 0.48407304))

In [138]:
get_both('since'), get_both('already')

((0.9237992, 0.57901967), (0.13683257, 0.47456348))

In [139]:
get_both('is'), get_both('was')

((0.27680013, 0.46570095), (0.16106057, 0.47281244))