## Reddit r/worldnews: Word Embeddings

In [109]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import re
import string
import unicodedata
import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
from numpy import array,asarray,zeros

import tensorflow as tf
import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras.layers.core import Activation, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Concatenate, Input
from keras.layers.embeddings import Embedding
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint
from keras.optimizers import SGD,Adam
from keras.layers import BatchNormalization
from keras import backend as K 

import warnings
warnings.filterwarnings('ignore')


In [110]:
# import cleaned dataframe
comments_final = pd.read_pickle('data/comments_final_.pkl')

In [5]:
# import metric functions
from model_nn_metrics import *

I investigate performance using a pretrained word embedding framework and convolutional neural network (CNN) model.

## Transfer Learning: Word2Vec Pretrained Embeddings

In [129]:
# label target
y = comments_final['Removed'].to_numpy()

# train-test split
X_train, X_test, y_train, y_test = train_test_split(
     comments_final,y, test_size = 0.2, random_state = 0, stratify = y)

In [130]:
x_tr, y_tr = X_train['body_clean_stop'].values, y_train
x_val, y_val = X_test['body_clean_stop'].values, y_test

In [131]:
# tokenize sentences
tokenizer = Tokenizer(num_words=100000)

# prepare vocabulary
tokenizer.fit_on_texts(list(x_tr))

# convert text into integer sequences
x_tr_seq  = tokenizer.texts_to_sequences(x_tr) 
x_val_seq = tokenizer.texts_to_sequences(x_val)

# pad to get sequences of same length
x_tr_seq  = pad_sequences(x_tr_seq, maxlen=300)
x_val_seq = pad_sequences(x_val_seq, maxlen=300)

In [132]:
# load the pretrained embedding into memory
# https://wikipedia2vec.github.io/wikipedia2vec/pretrained/
w2v_embeddings_index = dict()
f = open('data/enwiki_20180420_300d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    w2v_embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(w2v_embeddings_index))

Loaded 4529833 word vectors.


In [133]:
size_of_vocabulary=len(tokenizer.word_index) + 1 # add one for padding

# create a weight matrix for words 
w2v_embedding_matrix = np.zeros((size_of_vocabulary, 300))

for word, i in tokenizer.word_index.items():
    w2v_embedding_vector = w2v_embeddings_index.get(word)
    if w2v_embedding_vector is not None:
        w2v_embedding_matrix[i] = w2v_embedding_vector
print(size_of_vocabulary)

132213


In [134]:
w2v_embedding_matrix.shape

(132213, 300)

#### CNN: Pretrained Wiki Embeddings, Text-only, 1 Layer, Default Parameters

In [137]:
# define the model
model = Sequential()
embedding_layer = Embedding(size_of_vocabulary, 300, weights=[w2v_embedding_matrix], input_length=300, trainable=False)
model.add(embedding_layer)
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss=f1_loss, metrics=[f1_loss,f1_beta])

# implement early stopping and track val f1 loss
es = EarlyStopping(monitor='val_f1_loss', mode='min', verbose=1,patience=3)
mc = ModelCheckpoint('models/best_cnn_pretr_emb.h5', monitor='val_f1_loss', mode='min', save_best_only=True,verbose=1)  

# print summary of model
print(model.summary())

batch_size = 2500
epochs = 20

# fit model
history = model.fit(np.array(x_tr_seq),np.array(y_tr),batch_size=batch_size,epochs=epochs,validation_data=(np.array(x_val_seq),y_test),verbose=1,callbacks=[mc,es])


model = load_model('models/best_cnn_pretr_emb.h5',custom_objects = {'f1_beta':f1_beta,'f1_loss':f1_loss})

# predict probabilities for test set
yhat_probs = model.predict(x=np.array(x_val_seq), verbose=0)

# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
yhat_probs[yhat_probs>=0.5] = 1
yhat_probs[yhat_probs<0.5] = 0

# accuracy = (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_probs)
print('Accuracy: %f' % accuracy)

# precision (for positive class) = tp / (tp + fp)
precision = precision_score(y_test, yhat_probs)
print('Precision: %f' % precision)

# recall (for positive class) = tp / (tp + fn)
recall = recall_score(y_test, yhat_probs)
print('Recall: %f' % recall)

# f1 (for positive class) = 2 * precision * recall/(precision + recall)
f1 = f1_score(y_test, yhat_probs)
print('F1 score (positive class): %f' % f1)

# f1 for both classes
f1_both = f1_score(y_test,yhat_probs,average=None)
print(f'F1 for both classes: {f1_both}')

# weighted precision
precision_w = precision_score(y_test,yhat_probs,average='weighted')
print(f'Weighted Precision: {precision_w}')

# weighted recall
recall_w = recall_score(y_test,yhat_probs,average='weighted')
print(f'Weighted Recall: {recall_w}')

# weighted F1
f1_w = f1_score(y_test,yhat_probs,average='weighted')
print(f'Weighted F1: {f1_w}')


Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_51 (Embedding)     (None, 300, 300)          39663900  
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 296, 128)          192128    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 128)               0         
_________________________________________________________________
dense_127 (Dense)            (None, 128)               16512     
_________________________________________________________________
dense_128 (Dense)            (None, 1)                 129       
Total params: 39,872,669
Trainable params: 208,769
Non-trainable params: 39,663,900
_________________________________________________________________
None
Epoch 1/20
Epoch 00001: val_f1_loss improved from inf to 0.77253, saving model to models/best_cn

### Pretrained w2v: Text + Non-text features (Wikipedia)


In [171]:
comments_final.columns

Index(['index', 'score', 'subreddit', 'parent_id', 'id', 'created_utc',
       'Removed', 'body', 'author', 'body_no_quotes', 'body_norm',
       'body_norm_mod', 'body_clean_no_stop', 'body_clean_stop', 'run_rem',
       'run_tot', 'run_prop_rem', 'run_prev_rem', 'run_prev_tot',
       'run_prop_prev_rem', 'parent_id_2', 'parent_prefix', 'child_rem_flag',
       'sec_child_rem_flag', 'third_child_rem_flag', 'fourth_child_rem_flag',
       'fifth_child_rem_flag'],
      dtype='object')

In [184]:
# specify final set of text & non-text features
final_features = ['body_clean_stop','run_prop_prev_rem','child_rem_flag','sec_child_rem_flag','third_child_rem_flag', 'fourth_child_rem_flag','fifth_child_rem_flag','run_prev_rem','run_prev_tot']
X = comments_final[final_features]

y = comments_final['Removed']

In [185]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y)

In [186]:
# split text features
X1_train = X_train['body_clean_stop']
X1_test = X_test['body_clean_stop']

In [187]:
# tokenize sentences
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(X1_train)

# convert text into integer sequences
X1_train = tokenizer.texts_to_sequences(X1_train)
X1_test = tokenizer.texts_to_sequences(X1_test)

vocab_size = len(tokenizer.word_index) + 1 # add one for padding

maxlen = 200

# pad to get sequences of same length
X1_train = pad_sequences(X1_train, padding='post', maxlen=maxlen)
X1_test = pad_sequences(X1_test, padding='post', maxlen=maxlen)

In [188]:
# split non-text features
X2_train = X_train[['run_prop_prev_rem','child_rem_flag','sec_child_rem_flag','third_child_rem_flag', 'fourth_child_rem_flag','fifth_child_rem_flag','run_prev_rem','run_prev_tot']].values
X2_test = X_test[['run_prop_prev_rem','child_rem_flag','sec_child_rem_flag','third_child_rem_flag', 'fourth_child_rem_flag','fifth_child_rem_flag','run_prev_rem','run_prev_tot']].values

In [189]:
# normalize non-text features
scaler = MinMaxScaler()

X2_train_s = scaler.fit_transform(X2_train)
X2_test_s = scaler.transform(X2_test)

#### CNN: Non-text features & Pretrained w2v embeddings, 1 Layer, Default Parameters


In [193]:
# define the model
input_1 = Input(shape=(maxlen,))
input_2 = Input(shape=(8,))
embedding_layer = Embedding(size_of_vocabulary, 300, weights=[w2v_embedding_matrix], trainable=False)(input_1)
Conv_1 = Conv1D(128, 5, activation='relu')(embedding_layer)
Conv_2 = GlobalMaxPooling1D()(Conv_1)
concat_layer = Concatenate()([Conv_2, input_2])
dense_layer = Dense(128, activation='relu')(concat_layer)
dense_layer2 = Dense(64, activation='relu')(dense_layer)
output = Dense(1, activation='sigmoid')(dense_layer2)
model = Model(inputs=[input_1, input_2], outputs=output)

# print summary of model
model.compile(loss=f1_loss, optimizer='adam', metrics=[f1_loss,f1_beta])
print(model.summary())

# implement early stopping and track val f1 loss
es = EarlyStopping(monitor='val_f1_loss', mode='min', verbose=1,patience=5)  
mc = ModelCheckpoint('models/best_cnn_w2v_emb_nontext.h5', monitor='val_f1_loss', mode='min', save_best_only=True,verbose=1)  

batch_size = 2500
epochs = epochs

# fit the model
history = model.fit(x=[X1_train, X2_train_s], y=y_train, validation_data=([X1_test,X2_test_s],y_test), epochs=epochs, batch_size=batch_size, verbose=1,callbacks=[es,mc])

model = load_model('models/best_cnn_w2v_emb_nontext.h5',custom_objects = {'f1_beta':f1_beta,'f1_loss':f1_loss})

# predict probabilities for test data
yhat_probs = model.predict(x=[X1_test,X2_test_s], verbose=0)

# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
yhat_probs[yhat_probs>=0.5] = 1
yhat_probs[yhat_probs<0.5] = 0
 
# accuracy = (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_probs)
print('Accuracy: %f' % accuracy)

# precision (for positive class) = tp / (tp + fp)
precision = precision_score(y_test, yhat_probs)
print('Precision: %f' % precision)

# recall (for positive class) = tp / (tp + fn)
recall = recall_score(y_test, yhat_probs)

print('Recall: %f' % recall)

# f1 (for positive class) = 2 * precision * recall/(precision + recall)
f1 = f1_score(y_test, yhat_probs)
print('F1 score (positive class): %f' % f1)

# f1 (for both classes)
f1_both = f1_score(y_test,yhat_probs,average=None)
print(f'F1 for both classes: {f1_both}')

# weighted precision
precision_w = precision_score(y_test,yhat_probs,average='weighted')
print(f'Weighted Precision: {precision_w}')

# weighted recall
recall_w = recall_score(y_test,yhat_probs,average='weighted')
print(f'Weighted Recall: {recall_w}')

# weighted F1
f1_w = f1_score(y_test,yhat_probs,average='weighted')
print(f'Weighted F1: {f1_w}')


Model: "functional_81"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_83 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_68 (Embedding)        (None, 200, 300)     39663900    input_83[0][0]                   
__________________________________________________________________________________________________
conv1d_25 (Conv1D)              (None, 196, 128)     192128      embedding_68[0][0]               
__________________________________________________________________________________________________
global_max_pooling1d_25 (Global (None, 128)          0           conv1d_25[0][0]                  
______________________________________________________________________________________

## Summary

For text-only features,  using a CNN and pretrained embedding layer, we get a weighted F1 of 0.961 and F1 (positive class) of 0.304. Compared to the best text-only TF-IDF model (random forest using bigrams with weighted F1 of 0.960 and F1 (positive class) of 0.201), these figures represent a 0.1% improvement for weighted F1 and 51.2% improvement for F1 (positive class), respectively. For text and non-text features, also using a CNN and pretrained embedding layer, we get a weighted F1 of 0.972 and F1 (positive class) of 0.505. These models outperform my "best" text- and non-text TF-IDF model (logistic regression using unigrams, where weighted F1 is 0.971, and F1 for the positive class is 0.444) by 0.1% and 13.7%, respectively.