In [1]:
import numpy as np
import copy
from tqdm import tqdm
import pandas as pd
import re
import gensim
import keras
from keras import optimizers
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.layers import *
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from score import report_score
from sklearn.metrics import accuracy_score
from tensorflow.keras import layers
from tensorflow.keras import regularizers



In [80]:
datadir="fnc-1"
w2v_path = './data/GoogleNews-vectors-negative300.bin'
save_path = "./saved/"
batch_size = 128
max_sent_length = 250
random_state = 37
lstm_hidden_dim = 200

In [81]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [109]:
raw_train_bodies = pd.read_csv(datadir + '/train_bodies.csv')   
raw_train_stances = pd.read_csv(datadir + '/train_stances.csv')
raw_test_bodies = pd.read_csv(datadir + '/competition_test_bodies.csv') 
raw_test_stances = pd.read_csv(datadir + '/competition_test_stances.csv') 

In [115]:
list(raw_train_stances['Stance']).count('unrelated')

36545

In [84]:
stance_to_int = {"agree":0, "discuss": 1, "disagree": 2, "unrelated": 3}
int_to_stance = {0:"agree", 1:"discuss", 2:"disagree", 3: "unrelated"}

In [86]:
raw_train_stances['Stance'] = raw_train_stances['Stance'].apply(lambda x: stance_to_int[x])
raw_test_stances['Stance'] = raw_test_stances['Stance'].apply(lambda x: stance_to_int[x])

In [87]:
train_df = raw_train_stances.join(raw_train_bodies.set_index('Body ID'), on='Body ID')
test_df = raw_test_stances.join(raw_test_bodies.set_index('Body ID'), on='Body ID')

In [88]:
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

# Pre-processing words
clean_train_headline = [text_to_word_sequence(clean(head)) for head in train_df['Headline']]
clean_train_bodies = [text_to_word_sequence(clean(body)) for body in train_df['articleBody']]
clean_test_headline = [text_to_word_sequence(clean(head)) for head in test_df['Headline']]
clean_test_bodies = [text_to_word_sequence(clean(body)) for body in test_df['articleBody']]

In [89]:
wordlist = []
for i in range(len(clean_train_headline)):
    wordlist.append(clean_train_headline[i])
for i in range(len(clean_train_bodies)):
    wordlist.append(clean_train_bodies[i])
for i in range(len(clean_test_headline)):
    wordlist.append(clean_test_headline[i])
for i in range(len(clean_test_bodies)):
    wordlist.append(clean_test_bodies[i])

In [90]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(wordlist)
len(tokenizer.word_index)

29451

In [91]:
train_lines = [] 
for i in range(len(clean_train_headline)):
    headline =  clean_train_headline[i]
    body = clean_train_bodies[i]
    newline = headline+body
    train_lines.append(newline)

test_lines = [] 
for i in range(len(clean_test_headline)):
    headline =  clean_test_headline[i]
    body = clean_train_bodies[i]
    newline = headline+body
    test_lines.append(newline)

In [92]:
X_train = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in train_lines])
raw_X_train = pad_sequences(X_train, maxlen=max_sent_length, padding='post', truncating='post')
raw_y_train = train_df['Stance']

In [15]:
X_test = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in test_lines])
X_test = pad_sequences(X_test, maxlen=max_sent_length, padding='post', truncating='post')
y_test = test_df['Stance']

In [16]:
# Convert y to onehot
y_train_onehot = np_utils.to_categorical(raw_y_train)
y_test_onehot = np_utils.to_categorical(y_test)

In [17]:
X_train, X_vali, y_train, y_vali = train_test_split(raw_X_train, y_train_onehot, random_state = random_state, test_size=0.2)

In [18]:
embedding_dim = 300
embeddings = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, embedding_dim))

for word, i in tokenizer.word_index.items():
    try:
        embeddings_vector = embeddings[word]
        embeddings_matrix[i] = embeddings_vector
    except KeyError:
        pass
        
del embeddings


In [44]:
def cnn_model(n_classes):
    kernel_sizes = [3, 4, 5]
    num_filters=[80, 80, 80]  
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                            output_dim=embedding_dim,
                            weights = [embeddings_matrix],
                            trainable=False, name='embedding_layer',
                            mask_zero=True))
    model.add(keras.layers.Conv1D(num_filters[0], kernel_sizes[0], padding='valid', activation='relu', kernel_regularizer=regularizers.L2(0.001)))
    model.add(Activation(activation='relu', name='activation_1'))
    model.add(keras.layers.MaxPooling1D(3))
    model.add(keras.layers.Conv1D(num_filters[1], kernel_sizes[1], padding='valid', activation='relu', kernel_regularizer=regularizers.L2(0.001)))
    model.add(Activation(activation='relu', name='activation_2'))
    model.add(keras.layers.MaxPooling1D(3))
    model.add(keras.layers.Conv1D(num_filters[2], kernel_sizes[2], padding='valid', activation='relu', kernel_regularizer=regularizers.L2(0.001)))
    model.add(Activation(activation='relu', name='activation_3'))
    model.add(GlobalMaxPooling1D())
    model.add(keras.layers.Dense(n_classes, activation='softmax', name='output_layer'))
    
    return model

### Basic model trained over to four-classfier

In [42]:
basic_model = cnn_model(n_classes=4)
basic_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [43]:
print(basic_model.summary())

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, None, 300)         8835600   
_________________________________________________________________
conv1d_30 (Conv1D)           (None, None, 80)          72080     
_________________________________________________________________
activation_1 (Activation)    (None, None, 80)          0         
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, None, 80)          0         
_________________________________________________________________
conv1d_31 (Conv1D)           (None, None, 80)          25680     
_________________________________________________________________
activation_2 (Activation)    (None, None, 80)          0         
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, None, 80)        

In [45]:
history = basic_model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=(X_vali, y_vali))
basic_model.save(save_path+"basic_mode")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./saved/basic_mode\assets


In [52]:
preds = basic_model.predict(X_test)
outputs = [int_to_stance[np.argmax(p, axis = -1)] for p in preds]
report_score(actual_test_stances,outputs)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    240    |     0     |    141    |   1522    |
-------------------------------------------------------------
| disagree  |    78     |     0     |    58     |    561    |
-------------------------------------------------------------
|  discuss  |    398    |     1     |    429    |   3636    |
-------------------------------------------------------------
| unrelated |   1827    |     0     |   1751    |   14771   |
-------------------------------------------------------------
Score: 4530.75 out of 11651.25	(38.88638558094625%)


38.88638558094625

### Relatedness Classifier

In [53]:
# agree, disagree, discuss are 1, unrelated is 0
int_to_relatedness={0:1,1:1,2:1,3:0}
str_to_relatedness = {'unrelated':0 , 'related':1}

In [54]:
relatedness_y_train = raw_y_train.copy()
relatedness_y_train = relatedness_y_train.apply(lambda x: int_to_relatedness[x])
relatedness_y_train_onehot = np_utils.to_categorical(relatedness_y_train)

relatedness_y_test = y_test.copy()
relatedness_y_test = relatedness_y_test.apply(lambda x: int_to_relatedness[x])

In [75]:
relatedness_X_train, relatedness_X_vali, relatedness_y_train, relatedness_y_vali = train_test_split(raw_X_train, relatedness_y_train_onehot, random_state=random_state, test_size=0.2)

In [76]:
relatedness_model = cnn_model(n_classes=2)
relatedness_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [78]:
history = relatedness_model.fit(X_train, y_train,batch_size=batch_size,epochs=10, validation_data=(relatedness_X_vali, relatedness_y_vali))
preds = relatedness_model.predict(X_test)
outputs = [np.argmax(p, axis = -1) for p in preds]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [59]:
accuracy_score(relatedness_y_test, outputs)

0.6561602329516389

### Opinion Classfier

#### Dataset that all unrelated column is removed

In [60]:
train_drop_index = train_df[train_df['Stance']==3].index
opinion_train_df = train_df.drop(train_df[train_df['Stance']==3].index)

test_drop_index = test_df[test_df['Stance']==3].index
opinion_test_df = test_df.drop(test_df[test_df['Stance']==3].index)

In [61]:
len(opinion_test_df)

7064

In [62]:
opinion_train_headline = [text_to_word_sequence(clean(head)) for head in opinion_train_df['Headline']]
opinion_train_bodies = [text_to_word_sequence(clean(body)) for body in opinion_train_df['articleBody']]

opinion_test_headline = [text_to_word_sequence(clean(head)) for head in opinion_test_df['Headline']]
opinion_test_bodies = [text_to_word_sequence(clean(body)) for body in opinion_test_df['articleBody']]

In [63]:
opinion_train_lines = [] 
for i in range(len(opinion_train_headline)):
    headline =  opinion_train_bodies[i]
    body = clean_train_bodies[i]
    newline = headline+body
    opinion_train_lines.append(newline)

opinion_test_lines = [] 
for i in range(len(opinion_test_headline)):
    headline =  opinion_test_bodies[i]
    body = clean_train_bodies[i]
    newline = headline+body
    opinion_test_lines.append(newline)

In [64]:
len(opinion_train_lines)

13427

In [65]:
opinion_X_train = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in opinion_train_lines])
opinion_raw_X_train = pad_sequences(opinion_X_train, maxlen=max_sent_length, padding='post', truncating='post')
opinion_raw_y_train = opinion_train_df['Stance']

In [66]:
opinion_X_test = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_length]) for seq in opinion_test_lines])
opinion_raw_X_test = pad_sequences(opinion_X_test, maxlen=max_sent_length, padding='post', truncating='post')
opinion_y_test = opinion_test_df['Stance']

In [67]:
# Convert y to onehot
opinion_y_train_onehot = np_utils.to_categorical(opinion_raw_y_train)
opinion_y_test_onehot = np_utils.to_categorical(opinion_y_test)

In [69]:
opinion_X_train, opinion_X_vali, opinion_y_train, opinion_y_vali = train_test_split(opinion_raw_X_train, opinion_y_train_onehot, random_state=random_state, test_size=0.2)

In [70]:
opinion_model = cnn_model(n_classes=3)
opinion_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [71]:
history = opinion_model.fit(opinion_X_train, opinion_y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=(opinion_X_vali, opinion_y_vali))
basic_model.save(save_path+"opinion_model")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./saved/opinion_model\assets


In [72]:
preds = opinion_model.predict(opinion_raw_X_test)
outputs = [np.argmax(p, axis = -1) for p in preds]
accuracy_score(opinion_y_test, outputs)

0.5549263873159683

### Combine Relatedness model and Opinion Model

In [73]:
class combined_model:
    def __init__(self,relatedness_model, opinion_model):
        self.relatedness_model = relatedness_model
        self.opinion_model = opinion_model

    def predict(self, X_test):
        prediction = relatedness_model.predict(X_test)
        prediction = [np.argmax(p, axis = -1) for p in prediction]
        for i in tqdm(range(len(prediction))):
            relatedness = prediction[i]
            if relatedness == 1: #related
                opinion = opinion_model.predict(np.array([X_test[i]]))
                opinion = np.argmax(opinion, axis = -1)
                
                prediction[i] = int_to_stance[int(opinion)]
            else:
                prediction[i] = 'unrelated'

        return prediction

In [79]:
y_vali

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [74]:
combined = combined_model(relatedness_model, opinion_model)
outputs = combined.predict()
report_score(actual_test_stances,outputs)

 26%|██▌       | 696/2686 [00:13<00:39, 50.88it/s]


KeyboardInterrupt: 