In [29]:
'''
Train google BERT on Quora Insincere Questions Classification
https://www.kaggle.com/c/quora-insincere-questions-classification.
The model's task is to predict whether a question is sincere (label=0) or
insincere (label=1)
'''
import os
import numpy as np
import pandas as pd

from keras import Model
from keras.layers import Lambda, Dense
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

from batch_generator.batch_generator import BatchGenerator
from load_pretrained_bert import load_google_bert

import string

## Data Cleaning & Loading

In [15]:
raw_df = pd.read_csv('Political-media-DFE.csv',encoding='latin')
df = raw_df[['bias','message','embed','label','source','text']]
df.head()

Unnamed: 0,bias,message,embed,label,source,text
0,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",From: Trey Radel (Representative from Florida),twitter,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...
1,partisan,attack,"<blockquote class=""twitter-tweet"" width=""450"">...",From: Mitch McConnell (Senator from Kentucky),twitter,VIDEO - #Obamacare: Full of Higher Costs and ...
2,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",From: Kurt Schrader (Representative from Oregon),twitter,Please join me today in remembering our fallen...
3,neutral,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",From: Michael Crapo (Senator from Idaho),twitter,RT @SenatorLeahy: 1st step toward Senate debat...
4,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",From: Mark Udall (Senator from Colorado),twitter,.@amazon delivery #drones show need to update ...


In [16]:
df['bias'].value_counts()

neutral     3689
partisan    1311
Name: bias, dtype: int64

In [17]:
def remove_punctuations(text):
    '''Removes punctuation from strings'''
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [18]:
df['text'] = df.loc[:,'text'].apply(remove_punctuations)
df['label'] = df['label'].str.replace('From: ','')
df['purpose_and_bias'] = df['message'] + '_' + df['bias']
df['text'] = df['text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [19]:
df.head()

Unnamed: 0,bias,message,embed,label,source,text,purpose_and_bias
0,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Trey Radel (Representative from Florida),twitter,rt nowthisnews rep trey radel r fl slams obama...,policy_partisan
1,partisan,attack,"<blockquote class=""twitter-tweet"" width=""450"">...",Mitch McConnell (Senator from Kentucky),twitter,video obamacare full of higher costs and bro...,attack_partisan
2,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Kurt Schrader (Representative from Oregon),twitter,please join me today in remembering our fallen...,support_neutral
3,neutral,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Michael Crapo (Senator from Idaho),twitter,rt senatorleahy 1st step toward senate debate ...,policy_neutral
4,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Mark Udall (Senator from Colorado),twitter,amazon delivery drones show need to update law...,policy_partisan


In [20]:
congressmen_df = pd.read_csv('congressmen_2015.csv')
congressmen_df.head()

Unnamed: 0,First,Last,congressman,affiliation
0,Gregorio,Sablan,Gregorio Sablan (Representative from NA),d
1,Robert,Aderholt,Robert Aderholt (Representative from Alabama),r
2,Lamar,Alexander,Lamar Alexander (Senator from Tennessee),r
3,Justin,Amash,Justin Amash (Representative from Michigan),r
4,Mark,Amodei,Mark Amodei (Representative from Nevada),r


In [21]:
df = df.merge(congressmen_df, how='left',left_on='label',right_on='congressman')
df.loc[df.bias == 'partisan', 'target'] = df['affiliation']
df.loc[df.bias == 'neutral', 'target'] = df['bias']
df.dropna(axis=0,inplace=True)
df = df[df['target'] != 'i']

In [22]:
df.head()

Unnamed: 0,bias,message,embed,label,source,text,purpose_and_bias,First,Last,congressman,affiliation,target
0,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Trey Radel (Representative from Florida),twitter,rt nowthisnews rep trey radel r fl slams obama...,policy_partisan,Trey,Radel,Trey Radel (Representative from Florida),r,r
1,partisan,attack,"<blockquote class=""twitter-tweet"" width=""450"">...",Mitch McConnell (Senator from Kentucky),twitter,video obamacare full of higher costs and bro...,attack_partisan,Mitch,McConnell,Mitch McConnell (Senator from Kentucky),r,r
2,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Kurt Schrader (Representative from Oregon),twitter,please join me today in remembering our fallen...,support_neutral,Kurt,Schrader,Kurt Schrader (Representative from Oregon),d,neutral
3,neutral,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Michael Crapo (Senator from Idaho),twitter,rt senatorleahy 1st step toward senate debate ...,policy_neutral,Michael,Crapo,Michael Crapo (Senator from Idaho),r,neutral
4,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Mark Udall (Senator from Colorado),twitter,amazon delivery drones show need to update law...,policy_partisan,Mark,Udall,Mark Udall (Senator from Colorado),d,d


In [23]:
df['target'].value_counts()

neutral    3631
r           791
d           490
Name: target, dtype: int64

In [24]:
def replace_contraction(text):
    contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'can not'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    return text
def replace_links(text, filler=' '):
        text = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
                      filler, text).strip()
        return text
def remove_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text

In [25]:
import re
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = replace_contraction(text)
    text = replace_links(text, "link")
    text = remove_numbers(text)
    text = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]',"",text)
    text = text.lower()
    return text

In [33]:
df.loc[df['target'] == 'neutral', 'target'] = 0
df.loc[df['target'] == 'r', 'target'] = 1
df.loc[df['target'] == 'd', 'target'] = 2
df.head()

Unnamed: 0,bias,message,embed,label,source,text,purpose_and_bias,First,Last,congressman,affiliation,target
0,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Trey Radel (Representative from Florida),twitter,rt nowthisnews rep trey radel r fl slams obama...,policy_partisan,Trey,Radel,Trey Radel (Representative from Florida),r,1
1,partisan,attack,"<blockquote class=""twitter-tweet"" width=""450"">...",Mitch McConnell (Senator from Kentucky),twitter,video obamacare full of higher costs and bro...,attack_partisan,Mitch,McConnell,Mitch McConnell (Senator from Kentucky),r,1
2,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Kurt Schrader (Representative from Oregon),twitter,please join me today in remembering our fallen...,support_neutral,Kurt,Schrader,Kurt Schrader (Representative from Oregon),d,0
3,neutral,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Michael Crapo (Senator from Idaho),twitter,rt senatorleahy 1st step toward senate debate ...,policy_neutral,Michael,Crapo,Michael Crapo (Senator from Idaho),r,0
4,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Mark Udall (Senator from Colorado),twitter,amazon delivery drones show need to update law...,policy_partisan,Mark,Udall,Mark Udall (Senator from Colorado),d,2


## Incorporating BERT 

In [46]:
BERT_PRETRAINED_DIR = 'multi_cased_L-12_H-768_A-12/'
SEQ_LEN = 70
BATCH_SIZE = 12
LR = 1e-5

In [37]:
X = df['text'].apply(cleanText).values
Y = df['target'].values
print(X[0])  # How did Quebec nationalists see their province as a nation in the 1960s?
print(Y[0])  # 0

rt nowthisnews rep trey radel r fl slams obamacare politics httpstcozvywmgyih
1


In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

In [39]:
print(len(X_train))
print(len(X_test))
print(len(Y_train))
print(len(Y_test))

3684
1228
3684
1228


In [48]:
train_gen = BatchGenerator(X_train,
                           vocab_file=os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt'),
                           seq_len=SEQ_LEN,
                           labels=Y_train,
                           do_lower_case=False,
                           batch_size=BATCH_SIZE)
valid_gen = BatchGenerator(X_test,
                           vocab_file=os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt'),
                           seq_len=SEQ_LEN,
                           labels=Y_test,
                           do_lower_case=False,
                           batch_size=BATCH_SIZE)

100%|██████████| 3684/3684 [00:02<00:00, 1265.01it/s]
100%|██████████| 3684/3684 [00:00<00:00, 53884.70it/s]
100%|██████████| 1228/1228 [00:00<00:00, 1366.40it/s]
100%|██████████| 1228/1228 [00:00<00:00, 40200.16it/s]


In [49]:
g_bert = load_google_bert(base_location=BERT_PRETRAINED_DIR, use_attn_mask=False, max_len=SEQ_LEN)
g_bert.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
bert/embeddings/LayerNorm/beta  ->  layer_normalization_1/beta:0
bert/embeddings/LayerNorm/gamma  ->  layer_normalization_1/gamma:0
bert/embeddings/position_embeddings  ->  PositionEmbedding/embeddings:0
bert/embeddings/token_type_embeddings  ->  SegmentEmbedding/embeddings:0
bert/embeddings/word_embeddings  ->  TokenEmbedding/embeddings:0
bert/encoder/layer_0/attention/output/LayerNorm/beta  ->  layer_0/ln_1/beta:0
bert/encoder/layer_0/attention/output/LayerNorm/gamma  ->  layer_0/ln_1/gamma:0
bert/encoder/layer_0/attention/output/dense/bias  ->  layer_0/c_attn_proj/bias:0
bert/encoder/layer_0/attention/output/dense/kernel  ->  layer_0/c_attn_proj/kernel:0
bert/encoder/layer_0/attention/self/key/bias  ->  layer_0/c_attn/bias:0
bert/encoder/layer_0/attention/self/key/kernel  ->  layer_0/c_attn/kernel:0

bert/encoder/layer_4/attention/output/LayerNorm/beta  ->  layer_4/ln_1/beta:0
bert/encoder/layer_4/attention/output/LayerNorm/gamma  ->  layer_4/ln_1/gamma:0
bert/encoder/layer_4/attention/output/dense/bias  ->  layer_4/c_attn_proj/bias:0
bert/encoder/layer_4/attention/output/dense/kernel  ->  layer_4/c_attn_proj/kernel:0
bert/encoder/layer_4/attention/self/key/bias  ->  layer_4/c_attn/bias:0
bert/encoder/layer_4/attention/self/key/kernel  ->  layer_4/c_attn/kernel:0
bert/encoder/layer_4/attention/self/query/bias  ->  layer_4/c_attn/bias:0
bert/encoder/layer_4/attention/self/query/kernel  ->  layer_4/c_attn/kernel:0
bert/encoder/layer_4/attention/self/value/bias  ->  layer_4/c_attn/bias:0
bert/encoder/layer_4/attention/self/value/kernel  ->  layer_4/c_attn/kernel:0
bert/encoder/layer_4/intermediate/dense/bias  ->  layer_4/c_fc/bias:0
bert/encoder/layer_4/intermediate/dense/kernel  ->  layer_4/c_fc/kernel:0
bert/encoder/layer_4/output/LayerNorm/beta  ->  layer_4/ln_2/beta:0
bert/encoder

layer_9/c_ffn_proj (Conv1D)     (None, 70, 768)      2360064     layer_9/gelu[0][0]               
__________________________________________________________________________________________________
layer_9/ln_2_drop (Dropout)     (None, 70, 768)      0           layer_9/c_ffn_proj[0][0]         
__________________________________________________________________________________________________
layer_9/ln_2_add (Add)          (None, 70, 768)      0           layer_9/ln_1[0][0]               
                                                                 layer_9/ln_2_drop[0][0]          
__________________________________________________________________________________________________
layer_9/ln_2 (LayerNormalizatio (None, 70, 768)      1536        layer_9/ln_2_add[0][0]           
__________________________________________________________________________________________________
layer_10/c_attn (Conv1D)        (None, 70, 2304)     1771776     layer_9/ln_2[0][0]               
__________

In [50]:
# Choose Layer 0 as containing the features relevant for classification; see BERT paper for further explanation on
# this choice.
classification_features = Lambda(lambda x: x[:, 0, :])(g_bert.output)
out = Dense(3, activation='softmax')(classification_features)

In [57]:
model = Model(g_bert.inputs, out)
model.compile(optimizer=Adam(LR), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
segment_input (InputLayer)      (None, 70)           0                                            
__________________________________________________________________________________________________
position_input (InputLayer)     (None, 70)           0                                            
__________________________________________________________________________________________________
token_input (InputLayer)        (None, 70)           0                                            
__________________________________________________________________________________________________
SegmentEmbedding (Embedding)    (None, 70, 768)      1536        segment_input[0][0]              
__________________________________________________________________________________________________
PositionEm

In [58]:
model.fit_generator(train_gen,
                    epochs=1,
                    verbose=1,
                    validation_data=valid_gen,
                    shuffle=True)

Epoch 1/1


KeyboardInterrupt: 

In [None]:
Y_valid_predictions = model.predict_generator(valid_gen, verbose=1)
Y_test = Y_test[:len(Y_valid_predictions)]
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(Y_test, (Y_valid_predictions > thresh).astype(int))
    print(f"F1 score at threshold {thresh} is {f1}")

'''
After 1 Epoch
F1 score at threshold 0.32 is 0.687372802960222
Note that the results may vary slightly from run to run due to the non-deterministic nature of tensorflow/keras.
'''