In [1]:
import numpy as np
import copy
from tqdm import tqdm
import pandas as pd
import re
import gensim

from keras import optimizers
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.layers import *
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from score import report_score
from sklearn.metrics import accuracy_score
from tensorflow.keras import layers
from tensorflow.keras import regularizers



In [2]:
datadir="fnc-1"
w2v_path = './data/GoogleNews-vectors-negative300.bin'
save_path = "./saved/"
batch_size = 128
max_sent_length = 250
random_state = 37
lstm_hidden_dim = 100
epoch = 10

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [4]:
raw_train_bodies = pd.read_csv(datadir + '/train_bodies.csv')   
raw_train_stances = pd.read_csv(datadir + '/train_stances.csv')
raw_test_bodies = pd.read_csv(datadir + '/competition_test_bodies.csv') 
raw_test_stances = pd.read_csv(datadir + '/competition_test_stances.csv') 

In [5]:
stance_to_int = {"agree":0, "discuss": 1, "disagree": 2, "unrelated": 3}
int_to_stance = {0:"agree", 1:"discuss", 2:"disagree", 3: "unrelated"}

In [6]:
actual_test_stances = raw_test_stances['Stance']
raw_train_stances['Stance'] = raw_train_stances['Stance'].apply(lambda x: stance_to_int[x])
raw_test_stances['Stance'] = raw_test_stances['Stance'].apply(lambda x: stance_to_int[x])

In [7]:
train_df = raw_train_stances.join(raw_train_bodies.set_index('Body ID'), on='Body ID')
test_df = raw_test_stances.join(raw_test_bodies.set_index('Body ID'), on='Body ID')

In [8]:
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

# Pre-processing words
clean_train_headline = [text_to_word_sequence(clean(head)) for head in train_df['Headline']]
clean_train_bodies = [text_to_word_sequence(clean(body)) for body in train_df['articleBody']]
clean_test_headline = [text_to_word_sequence(clean(head)) for head in test_df['Headline']]
clean_test_bodies = [text_to_word_sequence(clean(body)) for body in test_df['articleBody']]

In [9]:
wordlist = []
for i in range(len(clean_train_headline)):
    wordlist.append(clean_train_headline[i])
for i in range(len(clean_train_bodies)):
    wordlist.append(clean_train_bodies[i])
for i in range(len(clean_test_headline)):
    wordlist.append(clean_test_headline[i])
for i in range(len(clean_test_bodies)):
    wordlist.append(clean_test_bodies[i])

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(wordlist)
len(tokenizer.word_index)

29451

In [11]:
train_lines = [] 
for i in range(len(clean_train_headline)):
    headline =  clean_train_headline[i]
    body = clean_train_bodies[i]
    newline = headline+body
    train_lines.append(newline)

test_lines = [] 
for i in range(len(clean_test_headline)):
    headline =  clean_test_headline[i]
    body = clean_train_bodies[i]
    newline = headline+body
    test_lines.append(newline)

In [12]:
X_train = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in train_lines])
raw_X_train = pad_sequences(X_train, maxlen=max_sent_length, padding='post', truncating='post')
raw_y_train = train_df['Stance']

In [13]:
X_test = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in test_lines])
X_test = pad_sequences(X_test, maxlen=max_sent_length, padding='post', truncating='post')
y_test = test_df['Stance']

In [14]:
X_train, X_vali, y_train, y_vali = train_test_split(raw_X_train, raw_y_train, random_state = random_state, test_size=0.2)

In [15]:
# Convert y to onehot
y_train_onehot = np_utils.to_categorical(y_train)
y_vali_onehot = np_utils.to_categorical(y_vali)
y_test_onehot = np_utils.to_categorical(y_test)

### Basic Model

In [16]:
embedding_dim = 300
embeddings = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, embedding_dim))

for word, i in tokenizer.word_index.items():
    try:
        embeddings_vector = embeddings[word]
        embeddings_matrix[i] = embeddings_vector
    except KeyError:
        pass
        
del embeddings


In [17]:
def lstm_model(n_classes):
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                            output_dim=embedding_dim,
                            weights = [embeddings_matrix],
                            trainable=False, name='embedding_layer',
                            mask_zero=True))

    model.add(Bidirectional(LSTM(lstm_hidden_dim, return_sequences=False, name='lstm_layer',
                    
                    kernel_regularizer =tf.keras.regularizers.L2(l2=1e-3))))
    model.add(Dropout(rate=0.8, name='dropout'))
    model.add(Activation(activation='relu', name='activation_1'))
    model.add(Dense(n_classes, activation='softmax', name='output_layer'))
    return model

### Basic model trained over to four-classfier

In [18]:
basic_model = lstm_model(n_classes=4)
basic_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [19]:
print(basic_model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, None, 300)         8835600   
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               320800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 200)               0         
_________________________________________________________________
output_layer (Dense)         (None, 4)                 804       
Total params: 9,157,204
Trainable params: 321,604
Non-trainable params: 8,835,600
_________________________________________________________________
None


In [20]:
history = basic_model.fit(X_train, y_train_onehot,
          batch_size=batch_size,
          epochs=epoch,
          validation_data=(X_vali, y_vali_onehot))
# basic_model.save(save_path+"basic_mode")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Accuracies on the validation set
preds_vali = np.argmax(basic_model.predict(X_vali), axis = -1)
true_valid = np.argmax(y_vali_onehot, axis = -1)

total_relatedness = len(true_valid)
total_opinion = 0
correct_relatedness = 0
correct_opinion = 0

for i in range(len(true_valid)):
    true = true_valid[i]
    pred = preds_vali[i]
    if true==3:
        if pred==3:
            correct_relatedness+=1
    else:
        total_opinion+=1
        if pred!=3:
            correct_relatedness+=1
        if pred==true:
            correct_opinion+=1

print(f"The Relatedness Accuracy is {correct_relatedness/total_relatedness}")
print(f"The Opinion Accuracy is {correct_opinion/total_opinion}")

The Relatedness Accuracy is 0.8571285642821411
The Opinion Accuracy is 0.44963503649635034


In [22]:
preds = basic_model.predict(X_test)
outputs = [int_to_stance[np.argmax(p, axis = -1)] for p in preds]
report_score(actual_test_stances,outputs)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     4     |     0     |    39     |   1860    |
-------------------------------------------------------------
| disagree  |     2     |     0     |    17     |    678    |
-------------------------------------------------------------
|  discuss  |     6     |     0     |    253    |   4205    |
-------------------------------------------------------------
| unrelated |    21     |     0     |    838    |   17490   |
-------------------------------------------------------------
Score: 4645.5 out of 11651.25	(39.87125844866431%)


39.87125844866431

### Relatedness Classifier

In [23]:
# agree, disagree, discuss are 1, unrelated is 0
int_to_relatedness={0:1,1:1,2:1,3:0}
str_to_relatedness = {'unrelated':0 , 'related':1}

In [24]:
relatedness_y_train = y_train.apply(lambda x: int_to_relatedness[x])
relatedness_y_train_onehot = np_utils.to_categorical(relatedness_y_train)

relatedness_y_vali = y_vali.apply(lambda x: int_to_relatedness[x])
relatedness_y_vali_onehot = np_utils.to_categorical(relatedness_y_vali)

relatedness_y_test = y_test.copy()
relatedness_y_test = relatedness_y_test.apply(lambda x: int_to_relatedness[x])

In [25]:
relatedness_model = lstm_model(n_classes=2)
relatedness_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [26]:
history = relatedness_model.fit(X_train, relatedness_y_train_onehot ,batch_size=batch_size,epochs=epoch, 
                                validation_data=(X_vali, relatedness_y_vali_onehot))
preds = relatedness_model.predict(X_test)
outputs = [np.argmax(p, axis = -1) for p in preds]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
accuracy_score(relatedness_y_test, outputs)

0.6708771101404792

### Opinion Classfier

#### Dataset that all unrelated column is removed

In [28]:
train_drop_index = train_df[train_df['Stance']==3].index
opinion_train_df = train_df.drop(train_df[train_df['Stance']==3].index)

test_drop_index = test_df[test_df['Stance']==3].index
opinion_test_df = test_df.drop(test_df[test_df['Stance']==3].index)

In [29]:
opinion_train_headline = [text_to_word_sequence(clean(head)) for head in opinion_train_df['Headline']]
opinion_train_bodies = [text_to_word_sequence(clean(body)) for body in opinion_train_df['articleBody']]

opinion_test_headline = [text_to_word_sequence(clean(head)) for head in opinion_test_df['Headline']]
opinion_test_bodies = [text_to_word_sequence(clean(body)) for body in opinion_test_df['articleBody']]

In [30]:
opinion_train_lines = [] 
for i in range(len(opinion_train_headline)):
    headline =  opinion_train_bodies[i]
    body = clean_train_bodies[i]
    newline = headline+body
    opinion_train_lines.append(newline)

opinion_test_lines = [] 
for i in range(len(opinion_test_headline)):
    headline =  opinion_test_bodies[i]
    body = clean_train_bodies[i]
    newline = headline+body
    opinion_test_lines.append(newline)

In [31]:
len(opinion_train_lines)

13427

In [32]:
opinion_X_train = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in opinion_train_lines])
opinion_raw_X_train = pad_sequences(opinion_X_train, maxlen=max_sent_length, padding='post', truncating='post')
opinion_raw_y_train = opinion_train_df['Stance']

In [33]:
opinion_X_test = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in opinion_test_lines])
opinion_raw_X_test = pad_sequences(opinion_X_test, maxlen=max_sent_length, padding='post', truncating='post')
opinion_y_test = opinion_test_df['Stance']

In [34]:
# Convert y to onehot
opinion_y_train_onehot = np_utils.to_categorical(opinion_raw_y_train)
opinion_y_test_onehot = np_utils.to_categorical(opinion_y_test)

In [35]:
opinion_X_train, opinion_X_vali, opinion_y_train, opinion_y_vali = train_test_split(opinion_raw_X_train, opinion_y_train_onehot, random_state=random_state, test_size=0.2)

In [36]:
opinion_model = lstm_model(n_classes=3)
opinion_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [37]:
history = opinion_model.fit(opinion_X_train, opinion_y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=(opinion_X_vali, opinion_y_vali))
basic_model.save(save_path+"opinion_model")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ./saved/opinion_model\assets


INFO:tensorflow:Assets written to: ./saved/opinion_model\assets


In [38]:
preds = opinion_model.predict(opinion_raw_X_test)
outputs = [np.argmax(p, axis = -1) for p in preds]
accuracy_score(opinion_y_test, outputs)

0.5995186862967158

### Combine Relatedness model and Opinion Model

In [39]:
class cascaded_model:
    def __init__(self,relatedness_model, opinion_model):
        self.relatedness_model = relatedness_model
        self.opinion_model = opinion_model

    def predict(self, X_test):
        prediction = relatedness_model.predict(X_test)
        prediction = [np.argmax(p, axis = -1) for p in prediction]
        for i in tqdm(range(len(prediction))):
            relatedness = prediction[i]
            if relatedness == 1: #related
                opinion = opinion_model.predict(np.array([X_test[i]]))
                opinion = np.argmax(opinion, axis = -1)
                
                prediction[i] = int(opinion)
            else:
                prediction[i] = 3

        return prediction

In [40]:
cascaded = cascaded_model(relatedness_model, opinion_model)
preds_vali = cascaded.predict(X_vali)
true_valid = np.argmax(y_vali_onehot, axis = -1)
print(f"The stance accuracy of cascaded model is {accuracy_score(true_valid,preds_vali)}")

100%|██████████| 9995/9995 [01:36<00:00, 103.82it/s]

The stance accuracy of cascaded model is 0.832216108054027





: 