In [45]:
# Import packages
import numpy as np
import pandas as pd
import random
import math
from collections import Counter
from stemming.porter2 import stem
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Brian
[nltk_data]     Yeung\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Preparation

In [46]:
# Import data
bodies = pd.read_csv('fnc-1/train_bodies.csv')
stances = pd.read_csv('fnc-1/train_stances.csv')

### Training/validation split: 90/10

In [47]:
r = random.Random()
r.seed(12345)
body_ids = bodies['Body ID'].tolist()
r.shuffle(body_ids)
train_ids = body_ids[:int(len(body_ids)*0.9)]
validation_ids = body_ids[int(len(body_ids)*0.9):]

train_bodies = bodies[bodies['Body ID'].isin(train_ids)].reset_index(drop=True)
train_stances = stances[stances['Body ID'].isin(train_ids)].reset_index(drop=True)
validation_bodies = bodies[bodies['Body ID'].isin(validation_ids)].reset_index(drop=True)
validation_stances = stances[stances['Body ID'].isin(validation_ids)].reset_index(drop=True)

In [49]:
# Summary statistics of stance distribution in training set
train_stances.groupby('Stance')['Body ID'].count()/len(train_stances)*100

Stance
agree         7.294118
disagree      1.667037
discuss      18.182020
unrelated    72.856826
Name: Body ID, dtype: float64

In [50]:
# Summary statistics of stance distribution in validation set
validation_stances.groupby('Stance')['Body ID'].count()/len(validation_stances)*100

Stance
agree         7.964242
disagree      1.808208
discuss      14.587566
unrelated    75.639984
Name: Body ID, dtype: float64

# Vector representation

In [51]:
# Function to convert a document into lower case and remove all symbols
def clean_doc(string):
    cleaned_string = string.lower().replace(',', ' ').replace('.', ' ').replace(';', ' ').replace(':', ' ').replace('(', '').replace(')', '').replace('[','').replace(']','').replace('\'','').replace('\"','').replace('‘','').replace('’','').replace('“','').replace('”','').replace('/','').replace('?','').replace('!','').replace('%','').replace('&','').replace('-','').replace('$','').replace('—','')
    return cleaned_string

In [52]:
# Extract unique words for all Headline and articleBody
train_stances['Headline words'] = train_stances['Headline'].apply(lambda x: set(clean_doc(x).split()))
train_bodies['articleBody words'] = train_bodies['articleBody'].apply(lambda x: set(clean_doc(x).split()))
validation_stances['Headline words'] = validation_stances['Headline'].apply(lambda x: set(clean_doc(x).split()))
validation_bodies['articleBody words'] = validation_bodies['articleBody'].apply(lambda x: set(clean_doc(x).split()))

#### Stemming (optional)

In [53]:
train_stances_stemmed = train_stances.copy()
train_bodies_stemmed = train_bodies.copy()
validation_stances_stemmed = validation_stances.copy()
validation_bodies_stemmed = validation_bodies.copy()

In [55]:
# Extract stemmed words
stem_words_train_bodies = []
stem_words_train_headline = []
stem_words_validation_bodies = []
stem_words_validation_headline = []

for i in range(len(train_bodies_stemmed)):
    stem_words_train_bodies.append([stem(x) for x in clean_doc(train_bodies_stemmed.loc[i,'articleBody']).split()])
for i in range(len(train_stances_stemmed)):
    stem_words_train_headline.append([stem(x) for x in clean_doc(train_stances_stemmed.loc[i,'Headline']).split()])
for i in range(len(validation_bodies_stemmed)):
    stem_words_validation_bodies.append([stem(x) for x in clean_doc(validation_bodies_stemmed.loc[i,'articleBody']).split()])
for i in range(len(validation_stances_stemmed)):
    stem_words_validation_headline.append([stem(x) for x in clean_doc(validation_stances_stemmed.loc[i,'Headline']).split()])

train_bodies_stemmed['articleBody words'] = stem_words_train_bodies
train_stances_stemmed['Headline words'] = stem_words_train_headline
validation_bodies_stemmed['articleBody words'] = stem_words_validation_bodies
validation_stances_stemmed['Headline words'] = stem_words_validation_headline

#### Remove stop words (optional)

In [23]:
train_stances_filtered = train_stances_stemmed.copy()
train_bodies_filtered = train_bodies_stemmed.copy()
validation_stances_filtered = validation_stances_stemmed.copy()
validation_bodies_filtered = validation_bodies_stemmed.copy()

In [26]:
# Remove stop words
stop_words = set(stopwords.words('english'))
train_stances_filtered['Headline words'] = train_stances_filtered['Headline words'].apply(lambda x: [w for w in x if not w in stop_words])
train_bodies_filtered['articleBody words'] = train_bodies_filtered['articleBody words'].apply(lambda x: [w for w in x if not w in stop_words])
validation_stances_filtered['Headline words'] = validation_stances_filtered['Headline words'].apply(lambda x: [w for w in x if not w in stop_words])
validation_bodies_filtered['articleBody words'] = validation_bodies_filtered['articleBody words'].apply(lambda x: [w for w in x if not w in stop_words])

#### Word occurrences count

In [27]:
# Extract a master list of all words. Initiate a list to hold all words
all_words = set([])
# Loop through all Body and Headline texts and append new words to the master list
for i in range(len(train_stances)):
    all_words.update(train_stances.loc[i,'Headline words'])
for i in range(len(train_bodies)):
    all_words.update(train_bodies.loc[i,'articleBody words'])
for i in range(len(validation_stances)):
    all_words.update(validation_stances.loc[i,'Headline words'])
for i in range(len(validation_bodies)):
    all_words.update(validation_bodies.loc[i,'articleBody words'])

In [28]:
# Do the same but on stemmed words
all_words_stemmed = set([])
# Loop through all Body and Headline texts and append new words to the master list
for i in range(len(train_stances_stemmed)):
    all_words_stemmed.update(train_stances_stemmed.loc[i,'Headline words'])
for i in range(len(train_bodies_stemmed)):
    all_words_stemmed.update(train_bodies_stemmed.loc[i,'articleBody words'])
for i in range(len(validation_stances_stemmed)):
    all_words_stemmed.update(validation_stances_stemmed.loc[i,'Headline words'])
for i in range(len(validation_bodies_stemmed)):
    all_words_stemmed.update(validation_bodies_stemmed.loc[i,'articleBody words'])

In [29]:
# Do the same but on stemmed words with stop words filtered out
all_words_filtered = set([])
# Loop through all Body and Headline texts and append new words to the master list
for i in range(len(train_stances_stemmed)):
    all_words_filtered.update(train_stances_filtered.loc[i,'Headline words'])
for i in range(len(train_bodies_stemmed)):
    all_words_filtered.update(train_bodies_filtered.loc[i,'articleBody words'])
for i in range(len(validation_stances_stemmed)):
    all_words_filtered.update(validation_stances_filtered.loc[i,'Headline words'])
for i in range(len(validation_bodies_stemmed)):
    all_words_filtered.update(validation_bodies_filtered.loc[i,'articleBody words'])

In [30]:
# Define function to convert documents in dataset into vectors of word occurences
def word_counter(df, words_vector):
    import time
    start = time.time()
    
    # Initiate the master array to hold vector representations for each document
    master_vector = []
    for i in range(len(df)):
        # Initiate an empty array for each document
        doc_vector = []
        counter = Counter(df.loc[i])
        for word in words_vector:
            # Check number of times word in master list appears in document
            doc_vector.append(counter[word])
        master_vector.append(doc_vector)
    
    end = time.time()
    print(end - start)
    
    return master_vector

In [86]:
# Compute word counts in each headlines & article bodies - for training and validation dataset
train_bodies_vector = []
train_bodies_vector = word_counter(train_bodies['articleBody words'], all_words)
validation_bodies_vector = []
validation_bodies_vector = word_counter(validation_bodies['articleBody words'], all_words)
train_stances_vector = []
train_stances_vector = word_counter(train_stances['Headline words'], all_words)
validation_stances_vector = []
validation_stances_vector = word_counter(validation_stances['Headline words'], all_words)

24.174811601638794
3.442002773284912
776.4593210220337
94.89768719673157


In [57]:
# Compute word counts (stemming discarded) in each headlines & article bodies - for training and validation dataset
train_bodies_vector_stemmed = []
train_bodies_vector_stemmed = word_counter(train_bodies_stemmed['articleBody words'], all_words_stemmed)
validation_bodies_vector_stemmed = []
validation_bodies_vector_stemmed = word_counter(validation_bodies_stemmed['articleBody words'], all_words_stemmed)
train_stances_vector_stemmed = []
train_stances_vector_stemmed = word_counter(train_stances_stemmed['Headline words'], all_words_stemmed)
validation_stances_vector_stemmed = []
validation_stances_vector_stemmed = word_counter(validation_stances_stemmed['Headline words'], all_words_stemmed)

9.685055494308472
1.1514663696289062
280.6340591907501
30.7632737159729


In [31]:
# Compute word counts (stop words discarded) in each headlines & article bodies - for training and validation dataset
train_bodies_vector_filtered = []
train_bodies_vector_filtered = word_counter(train_bodies_filtered['articleBody words'], all_words_filtered)
validation_bodies_vector_filtered = []
validation_bodies_vector_filtered = word_counter(validation_bodies_filtered['articleBody words'], all_words_filtered)
train_stances_vector_filtered = []
train_stances_vector_filtered = word_counter(train_stances_filtered['Headline words'], all_words_filtered)
validation_stances_vector_filtered = []
validation_stances_vector_filtered = word_counter(validation_stances_filtered['Headline words'], all_words_filtered)

10.730015754699707
0.9939184188842773
262.15616488456726
27.414483070373535


# Feature Engineering, Modelling and Results

### With stemming

In [58]:
# Feature Engineering
import feature_engineering
train_stemmed, validation_stemmed = feature_engineering.feature_engineering(train_bodies_stemmed, train_stances_stemmed, train_bodies_vector_stemmed, train_stances_vector_stemmed, validation_bodies_stemmed, validation_stances_stemmed, validation_bodies_vector_stemmed, validation_stances_vector_stemmed)

###### Training dataset feature engineering #####
Time elapsed for cosine similarity calculation: 265.7892382144928
Time elapsed for KL Divergence: 6.93135666847229
Time elapsed for KL Divergence with Dirichlet Smoothing: 44.43669843673706
###### Validation dataset feature engineering #####
Time elapsed for KL Divergence: 0.6870007514953613
Time elapsed for KL Divergence with Dirichlet Smoothing: 4.4889702796936035


In [59]:
# Modelling: Linear Regression, Logistic Regression & Random Forest
import modelling
modelling.modelling(train_stemmed, validation_stemmed)

##### Linear Regression #####
Confusion matrix: 
[[  74  154   93   71]
 [  10   31   19   29]
 [  92  227  228  171]
 [  43  194   80 3406]]
Precision recall F1: 
(array([ 0.33789954,  0.05115512,  0.54285714,  0.92629861]), array([ 0.18877551,  0.34831461,  0.31754875,  0.91485361]), array([ 0.24222586,  0.08920863,  0.40070299,  0.92054054]), array([ 392,   89,  718, 3723], dtype=int64))
##### Logistic Regression #####
Confusion matrix: 
[[  63  166   96   67]
 [  10   31   19   29]
 [  89  237  231  161]
 [  40  228   84 3371]]
Precision recall F1 score: 
(array([ 0.31188119,  0.04682779,  0.5372093 ,  0.92916207]), array([ 0.16071429,  0.34831461,  0.32172702,  0.90545259]), array([ 0.21212121,  0.08255659,  0.40243902,  0.91715413]), array([ 392,   89,  718, 3723], dtype=int64))
##### Random Forest #####
Confusion matrix: 
[[  63  166   96   67]
 [  10   31   19   29]
 [  89  237  231  161]
 [  40  228   84 3371]]
Precision recall F1 score: 
(array([ 0.38904899,  0.58823529,  0.5

### Stemming discarded

In [60]:
# Feature Engineering
import feature_engineering
train, validation = feature_engineering.feature_engineering(train_bodies, train_stances, train_bodies_vector, train_stances_vector, validation_bodies, validation_stances, validation_bodies_vector, validation_stances_vector)

###### Training dataset feature engineering #####
Time elapsed for cosine similarity calculation: 385.39739990234375
Time elapsed for KL Divergence: 6.87669825553894
Time elapsed for KL Divergence with Dirichlet Smoothing: 43.66496229171753
###### Validation dataset feature engineering #####
Time elapsed for KL Divergence: 0.6919715404510498
Time elapsed for KL Divergence with Dirichlet Smoothing: 4.375027179718018


In [61]:
# Modelling: Linear Regression, Logistic Regression & Random Forest
import modelling
modelling.modelling(train, validation)

##### Linear Regression #####
Confusion matrix: 
[[  58   57  212   65]
 [   5   12   47   25]
 [  58   72  433  155]
 [   3   41   81 3598]]
Precision recall F1: 
(array([ 0.46774194,  0.06593407,  0.56015524,  0.93624772]), array([ 0.14795918,  0.13483146,  0.60306407,  0.96642493]), array([ 0.2248062 ,  0.08856089,  0.58081824,  0.95109701]), array([ 392,   89,  718, 3723], dtype=int64))
##### Logistic Regression #####
Confusion matrix: 
[[  26  138  151   77]
 [   1   28   33   27]
 [  27  168  356  167]
 [   0   61   36 3626]]
Precision recall F1 score: 
(array([ 0.48148148,  0.07088608,  0.61805556,  0.93045933]), array([ 0.06632653,  0.31460674,  0.49582173,  0.97394574]), array([ 0.11659193,  0.11570248,  0.55023184,  0.95170604]), array([ 392,   89,  718, 3723], dtype=int64))
##### Random Forest #####
Confusion matrix: 
[[  26  138  151   77]
 [   1   28   33   27]
 [  27  168  356  167]
 [   0   61   36 3626]]
Precision recall F1 score: 
(array([ 0.38123167,  0.3       ,  0.5

### Stemming and stop words discarded

In [62]:
# Feature Engineering
import feature_engineering
train, validation = feature_engineering.feature_engineering(train_bodies_filtered, train_stances_filtered, train_bodies_vector_filtered, train_stances_vector_filtered, validation_bodies_filtered, validation_stances_filtered, validation_bodies_vector_filtered, validation_stances_vector_filtered)

###### Training dataset feature engineering #####
Time elapsed for cosine similarity calculation: 275.06541991233826
Time elapsed for KL Divergence: 6.799327373504639
Time elapsed for KL Divergence with Dirichlet Smoothing: 35.84059405326843
###### Validation dataset feature engineering #####
Time elapsed for KL Divergence: 0.6805007457733154
Time elapsed for KL Divergence with Dirichlet Smoothing: 3.660275936126709


In [63]:
# Modelling: Linear Regression, Logistic Regression & Random Forest
import modelling
modelling.modelling(train, validation)

##### Linear Regression #####
Confusion matrix: 
[[ 143  137   30   82]
 [  29   31    5   24]
 [ 240  211  123  144]
 [   6   21    0 3696]]
Precision recall F1: 
(array([ 0.34210526,  0.0775    ,  0.77848101,  0.9366447 ]), array([ 0.36479592,  0.34831461,  0.17130919,  0.99274778]), array([ 0.35308642,  0.12678937,  0.28082192,  0.96388056]), array([ 392,   89,  718, 3723], dtype=int64))
##### Logistic Regression #####
Confusion matrix: 
[[  96  117   88   91]
 [  18   25   19   27]
 [ 157  152  250  159]
 [   2   19    2 3700]]
Precision recall F1 score: 
(array([ 0.35164835,  0.0798722 ,  0.69637883,  0.93034951]), array([ 0.24489796,  0.28089888,  0.34818942,  0.99382219]), array([ 0.2887218 ,  0.12437811,  0.46425255,  0.96103896]), array([ 392,   89,  718, 3723], dtype=int64))
##### Random Forest #####
Confusion matrix: 
[[  96  117   88   91]
 [  18   25   19   27]
 [ 157  152  250  159]
 [   2   19    2 3700]]
Precision recall F1 score: 
(array([ 0.46496815,  0.76190476,  0.6