In [None]:
from keras import backend
from keras.layers import Conv1D, Dense, Input, Lambda, LSTM, GRU, RNN, CuDNNGRU, CuDNNLSTM, Dropout, Bidirectional
from keras.layers.merge import concatenate
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.utils import to_categorical, plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from string import punctuation
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import gensim.downloader as api

In [None]:
# Fetch the pre-trained word vectors through gensim's downloader.
# NOTE(review): the model name suggests 100-dimensional GloVe vectors trained on Twitter — confirm.
glove_model_name = "glove-twitter-100"
vectors = api.load(glove_model_name)

In [None]:
def process_data(blogs, lower=True, remove_punc=True, split=True, remove_stop=False, stop_words=None):
    """Normalise a collection of blog texts.

    Parameters
    ----------
    blogs : iterable of str
        Raw blog texts.
    lower : bool
        Lowercase every text.
    remove_punc : bool
        Delete every character found in ``string.punctuation`` and strip
        leading/trailing whitespace.
    split : bool
        Return each blog as a list of whitespace-separated tokens instead of
        a single string.
    remove_stop : bool
        Drop stopwords. Matching is exact, so with ``lower=False`` capitalised
        stopwords are kept.
    stop_words : iterable of str, optional
        Words to drop when ``remove_stop`` is true. Defaults to the NLTK
        English stopword list.

    Returns
    -------
    list
        A list of token lists when ``split`` is true, otherwise a list of
        strings. If every flag is false the input is returned untouched.
    """
    # lowercase all text
    if lower:
        blogs = [blog.lower() for blog in blogs]
    # remove punctuation (build the translation table once, not per blog)
    if remove_punc:
        strip_punctuation = str.maketrans('', '', punctuation)
        blogs = [blog.translate(strip_punctuation).strip() for blog in blogs]
    # remove stopwords while each blog is still a string, so this step works
    # regardless of whether the caller also asked for tokenised output
    if remove_stop:
        stop = set(stopwords.words('english')) if stop_words is None else set(stop_words)
        blogs = [' '.join(token for token in blog.split() if token not in stop) for blog in blogs]
    # split into individual tokens
    if split:
        blogs = [blog.split() for blog in blogs]
    return blogs

In [None]:
# Read the raw blog/gender table; the path is relative to the notebook's working directory.
blog_csv_path = "blog-gender-dataset_csv.csv"
blog_data = pd.read_csv(blog_csv_path)

In [None]:
# Lowercase the blog texts only: punctuation is kept and each blog stays a single string
# (tokenisation happens later, when the texts are mapped to embedding ids).
blog_data["Blog"] = process_data(blog_data["Blog"].astype(str), split=False, remove_punc=False)

In [None]:
def process_gender(gender):
    """Encode gender labels as integers.

    Labels are compared after stripping surrounding whitespace and
    lowercasing, so ``'M'``, ``' m'``, ``'Male '`` and ``'MALE'`` all map to 1.
    Every other value (including empty or missing-looking labels) maps to 0.

    Parameters
    ----------
    gender : iterable
        Raw labels; each item is converted with ``str`` before matching.

    Returns
    -------
    list of int
        One 0/1 code per input label, in input order.
    """
    male_labels = {'m', 'male'}
    return [1 if str(g).strip().lower() in male_labels else 0 for g in gender]

In [None]:
# Replace the textual gender labels with their 0/1 codes (1 = male, 0 = anything else).
blog_data["Gender"] = process_gender(blog_data["Gender"].astype(str))

In [None]:
# Embedding matrix = pre-trained table plus one extra all-zero row at index MAX_TOKENS.
# Later cells use that extra row for out-of-vocabulary words and context boundaries.
# NOTE(review): `syn0` is the legacy gensim attribute name; newer releases expose
# the same array as `.vectors` — confirm against the installed gensim version.
MAX_TOKENS, embedding_dim = vectors.syn0.shape

embeddings = np.zeros((MAX_TOKENS + 1, embedding_dim), dtype = "float32")
embeddings[:MAX_TOKENS] = vectors.syn0

In [None]:
# Turn every blog into three id sequences: the document itself plus its
# left- and right-shifted copies used as recurrent contexts.
# NOTE(review): contexts are capped at max_len_contexts + 1 ids, but all three
# sequences are later padded/truncated to 500 — positions past the cap get
# padding as context. Confirm this asymmetry is intended.
max_len_contexts=199
input_docs, input_lc, input_rc = [], [], []
vocab = vectors.vocab
for text in blog_data.Blog:
    # Unknown words share the extra embedding row MAX_TOKENS.
    ids = [vocab[word].index if word in vocab else MAX_TOKENS for word in text.split()]
    input_docs.append(np.array(ids))
    # Left context of position i is token i-1: shift right, starting with the boundary id.
    input_lc.append(np.array([MAX_TOKENS] + ids[:max_len_contexts]))
    # Right context of position i is token i+1: shift left, ending with the boundary id.
    input_rc.append(np.array(ids[1:max_len_contexts + 1] + [MAX_TOKENS]))

In [None]:
max_len=500
# Pad/truncate every sequence at the end to a common length.
# Pad with MAX_TOKENS, the dedicated all-zero embedding row: id 0 is a real
# vocabulary entry, so padding with 0 would feed that word's vector into
# every padded position.
pad_options = dict(maxlen=max_len, padding='post', truncating='post', value=MAX_TOKENS)
input_docs = sequence.pad_sequences(input_docs, **pad_options)
input_lc = sequence.pad_sequences(input_lc, **pad_options)
input_rc = sequence.pad_sequences(input_rc, **pad_options)

In [None]:
# Hold out 10% of the blogs for testing, keeping the gender ratio equal in both
# parts (stratify). A fixed random_state makes the split — and therefore every
# downstream metric — reproducible across kernel restarts.
splits = train_test_split(
    input_docs, input_lc, input_rc, blog_data.Gender,
    test_size=0.1, shuffle=True, stratify=blog_data.Gender, random_state=42,
)
# train_test_split returns (train, test) pairs in the order the arrays were passed.
(input_docs_train, input_docs_test,
 input_lc_train, input_lc_test,
 input_rc_train, input_rc_test,
 genders_train, genders_test) = splits

In [None]:
def RCNN(embeddings, MAX_TOKENS, embedding_dim, hidden_1=100, hidden_2=100, activity_l2=0.007):
    """Build and compile the recurrent-convolutional binary classifier.

    The model takes three variable-length int32 id sequences (document, left
    context, right context), embeds them with one shared trainable embedding
    layer, runs a forward LSTM over the left context and a backward LSTM over
    the right context, concatenates both with the document embedding, applies
    a kernel-size-1 tanh convolution, max-pools over time and classifies with
    a small dense head ending in a sigmoid unit.

    Parameters
    ----------
    embeddings : array
        Initial embedding weights with ``MAX_TOKENS + 1`` rows and
        ``embedding_dim`` columns.
    MAX_TOKENS : int
        Vocabulary size excluding the extra row.
    embedding_dim : int
        Width of each embedding vector.
    hidden_1 : int
        Units in each of the two LSTMs.
    hidden_2 : int
        Filters of the 1x1 convolution (and size of the pooled vector).
    activity_l2 : float
        L2 activity-regularisation factor shared by both LSTMs and the convolution.

    Returns
    -------
    keras.models.Model
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        accuracy metric.

    Notes
    -----
    Uses ``CuDNNLSTM``, so a CUDA-capable GPU is required. The "See equation"
    comments presumably refer to the RCNN paper this architecture follows —
    TODO confirm the exact reference.
    """
    document = Input(shape = (None, ), dtype = "int32")
    left_context = Input(shape = (None, ), dtype = "int32")
    right_context = Input(shape = (None, ), dtype = "int32")

    # One embedding layer shared by all three inputs, initialised from the pre-trained matrix.
    embedder = Embedding(MAX_TOKENS + 1, embedding_dim, weights = [embeddings], trainable = True)
    doc_embedding = embedder(document)
    l_embedding = embedder(left_context)
    r_embedding = embedder(right_context)

    doc_embedding = Dropout(0.1)(doc_embedding)
    l_embedding = Dropout(0.1)(l_embedding)
    r_embedding = Dropout(0.1)(r_embedding)

    forward = CuDNNLSTM(hidden_1, kernel_initializer='glorot_normal', return_sequences = True, activity_regularizer=l2(activity_l2))(l_embedding) # See equation (1).
    backward = CuDNNLSTM(hidden_1, kernel_initializer='glorot_normal', return_sequences = True, go_backwards = True, activity_regularizer=l2(activity_l2))(r_embedding) # See equation (2).

    # go_backwards emits outputs in reversed time order; flip them back so that
    # step i of `backward` lines up with step i of `forward` and the document.
    backward = Lambda(lambda x: backend.reverse(x, axes = 1))(backward)
    together = concatenate([forward, doc_embedding, backward], axis = 2)

    rcnn = Conv1D(hidden_2, kernel_size=1, padding='same', activation = "tanh", activity_regularizer=l2(activity_l2))(together) # See equation (4).

    # Max over the time axis yields one fixed-size vector per document.
    pool_rcnn = Lambda(lambda x: backend.max(x, axis = 1), output_shape = (hidden_2, ))(rcnn) # See equation (5).

    # NOTE(review): the two Dense layers below have no activation, so together
    # they form a single affine map — confirm that this is intended.
    dense = Dense(64, activity_regularizer=l2())(pool_rcnn)

    dense = Dropout(0.2)(dense)

    dense = Dense(32)(dense)

    # The input width is inferred from `dense`; no input_dim is needed in the functional API.
    output = Dense(1, activation = "sigmoid")(dense) # See equations (6) and (7).

    model = Model(inputs = [document, left_context, right_context], outputs = output)
    model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

    return model

In [None]:
# Build the compiled network from the pre-trained embedding matrix, then print its layer table.
model = RCNN(embeddings=embeddings, MAX_TOKENS=MAX_TOKENS, embedding_dim=embedding_dim)
model.summary()

In [None]:
# Train on the three aligned inputs; the last 10% of the training arrays is
# held out by Keras as validation data, and the rest is shuffled each epoch.
history = model.fit(
    [input_docs_train, input_lc_train, input_rc_train],
    genders_train,
    epochs=200,
    verbose=1,
    batch_size=64,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# Older Keras releases log accuracy as 'acc'/'val_acc', newer ones as
# 'accuracy'/'val_accuracy'; use whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Skip the first epochs so the early, large losses do not flatten the rest of
# the curve, but plot against the real epoch numbers so the x-axis stays correct.
skip_epochs = 10
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_shown = range(skip_epochs, len(train_loss))
plt.plot(epochs_shown, train_loss[skip_epochs:])
plt.plot(epochs_shown, val_loss[skip_epochs:])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Score the held-out split; the cell displays whatever evaluate returns
# (the model is compiled with a loss and an accuracy metric).
test_inputs = [input_docs_test, input_lc_test, input_rc_test]
model.evaluate(test_inputs, genders_test)

In [None]:
# Write a diagram of the layer graph next to the notebook.
diagram_path = 'model.png'
plot_model(model, to_file=diagram_path)