In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stance-detection-dataset/fnc-1-master/scorer.py
/kaggle/input/stance-detection-dataset/fnc-1-master/README.md
/kaggle/input/stance-detection-dataset/fnc-1-master/train_stances.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/test_stances_unlabeled.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/train_bodies.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_stances.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/train_stances.random.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_bodies.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_stances_unlabeled.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/test_bodies.csv


In [4]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
#load data
df_body_train = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/train_bodies.csv")
df_stance_train = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/train_stances.csv")
df_body_test = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_bodies.csv")
df_stance_test = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_stances.csv")


# merge the tables by Body ID
train_df = pd.merge(df_body_train, df_stance_train, on='Body ID', how='inner')
test_df = pd.merge(df_body_test, df_stance_test, on='Body ID', how='inner')

# null_counts_train = train_df.isnull().sum() #no nulls
# null_counts_test = test_df.isnull().sum()  #no nulls

total_rows_train = len(train_df)
total_rows_test = len(test_df)

unique_body_ids_train = train_df['Body ID'].nunique()
unique_body_ids_test = test_df['Body ID'].nunique()

print("TRAIN: Total number of rows: ",total_rows_train,", Unique Body IDs:",unique_body_ids_train)
print("TEST: Total number of rows: ",total_rows_test,", Unique Body IDs:",unique_body_ids_test)

# print(train_df.head())
# print(test_df.head())

# convert the last column i.e. the categorical column to a one hot encoded list. 
train_df['list'] = pd.get_dummies(train_df['Stance'],columns=train_df.columns).astype(int).values.tolist()
new_df_train = train_df[['articleBody','Headline', 'list']].copy()
# Passing colums as train.columns so that the encoding is consistent among train and test
test_df['list'] = pd.get_dummies(test_df['Stance'],columns=train_df.columns).astype(int).values.tolist()
new_df_test = test_df[['articleBody','Headline', 'list']].copy()

TRAIN: Total number of rows:  49972 , Unique Body IDs: 1683
TEST: Total number of rows:  25413 , Unique Body IDs: 904


In [6]:
new_df_train.head()

Unnamed: 0,articleBody,Headline,list
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...","[0, 0, 0, 1]"
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,"[0, 0, 0, 1]"
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,"[0, 0, 0, 1]"
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,"[0, 0, 0, 1]"
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,"[0, 0, 0, 1]"


In [7]:
new_df_test.head()

Unnamed: 0,articleBody,Headline,list
0,Al-Sisi has denied Israeli reports stating tha...,Apple installing safes in-store to protect gol...,"[0, 0, 0, 1]"
1,Al-Sisi has denied Israeli reports stating tha...,El-Sisi denies claims he'll give Sinai land to...,"[1, 0, 0, 0]"
2,Al-Sisi has denied Israeli reports stating tha...,Apple to keep gold Watch Editions in special i...,"[0, 0, 0, 1]"
3,Al-Sisi has denied Israeli reports stating tha...,Apple Stores to Keep Gold “Edition” Apple Watc...,"[0, 0, 0, 1]"
4,Al-Sisi has denied Israeli reports stating tha...,South Korean woman's hair 'eaten' by robot vac...,"[0, 0, 0, 1]"


In [8]:
new_df_train['articleBody'].apply(lambda x: len(str(x).split())).max()

4788

In [9]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 4
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [10]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.article_body = self.data["articleBody"]
        self.headline = self.data["Headline"]
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.article_body)

    def __getitem__(self, index):
        article_body = str(self.article_body[index])
        article_body = " ".join(article_body.split())
        headline = str(self.headline[index])
        headline = " ".join(headline.split())

        inputs = self.tokenizer(
            article_body, 
            headline,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation='only_first', 
            return_overflowing_tokens=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [11]:
# Creating the dataset and dataloader for the neural network

train_dataset=new_df_train.sample(frac=1,random_state=200).reset_index(drop=True)
test_dataset=new_df_test.sample(frac=1,random_state=200).reset_index(drop=True)


# print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (49972, 3)
TEST Dataset: (25413, 3)


In [12]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [13]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        # Freeze the weights of the BERT layer
        for param in self.l1.parameters():
            param.requires_grad = False
#         self.pre_classifier = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.3)
#         self.classifier = torch.nn.Linear(768, 4)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 4)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [14]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [15]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [16]:
loss_tracker = []
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%1000==0:
            loss_tracker.append(loss)
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
accuracy_tracker = []
f1_micro_tracker = []
f1_macro_tracker = []
precision_tracker = []
recall_tracker = []
mcc_tracker = []
logloss_tracker = []
hammingloss_tracker = []

for epoch in range(EPOCHS):
    print("Epoch ",epoch)
    train(epoch)
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    precision_score = metrics.precision_score(targets, outputs, average = 'samples', zero_division = 0)
    recall_score = metrics.recall_score(targets, outputs, average = 'samples')
    # MCC not supported for multiclass
#     mcc_score = metrics.matthews_corrcoef(targets,outputs)
    logloss_score = metrics.log_loss(targets, outputs)
    hammingloss_score = metrics.hamming_loss(targets, outputs)
    
    accuracy_tracker.append(accuracy)
    f1_micro_tracker.append(f1_score_micro)
    f1_macro_tracker.append(f1_score_macro)
    precision_tracker.append(precision_score)
    recall_tracker.append(recall_score)
#     mcc_tracker.append(mcc_score)
    logloss_tracker.append(logloss_score)
    hammingloss_tracker.append(hammingloss_score)
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"Precision = {precision_score}")
    print(f"Recall = {recall_score}")
#     print(f"MCC = {mcc_score}")
    print(f"LogLoss = {logloss_score}")
    print(f"Hamming Loss = {hammingloss_score}")

Epoch  0
Epoch: 0, Loss:  0.6539868116378784
Epoch: 0, Loss:  0.3803189992904663
Epoch: 0, Loss:  0.2370111644268036
Epoch: 0, Loss:  0.32327479124069214
Epoch: 0, Loss:  0.505055844783783
Epoch: 0, Loss:  0.29159337282180786
Epoch: 0, Loss:  0.2822578549385071
Accuracy Score = 0.7099515995750206
F1 Score (Micro) = 0.7538750783371632
F1 Score (Macro) = 0.22110577394373104
Precision = 0.7099909495140283
Recall = 0.7100302994530359
LogLoss = 6.417605167195905
Hamming Loss = 0.11590524534686972
Epoch  1
Epoch: 1, Loss:  0.2268395572900772
Epoch: 1, Loss:  0.3137916028499603
Epoch: 1, Loss:  0.3585245609283447
Epoch: 1, Loss:  0.1453838348388672
Epoch: 1, Loss:  0.13214647769927979
Epoch: 1, Loss:  0.1081865206360817
Epoch: 1, Loss:  0.4101746380329132
Accuracy Score = 0.6718608586156691
F1 Score (Micro) = 0.7477445913987912
F1 Score (Macro) = 0.22463732179206744
Precision = 0.6718608586156691
Recall = 0.6718608586156691
LogLoss = 4.793033791580611
Hamming Loss = 0.11332782434187227
Epoch 

In [None]:
np.savetxt('stance_bert_loss_tracker.txt',[tensor.cpu().detach().numpy() for tensor in loss_tracker],delimiter=',')
np.savetxt('stance_bert_accuracy_tracker.txt',accuracy_tracker,delimiter=',')
np.savetxt('stance_bert_f1_micro_tracker.txt',f1_micro_tracker,delimiter=',')
np.savetxt('stance_bert_f1_macro_tracker.txt',f1_macro_tracker,delimiter=',')
np.savetxt('stance_bert_precision_tracker.txt',precision_tracker,delimiter=',')
np.savetxt('stance_bert_recall_tracker.txt',recall_tracker,delimiter=',')
# np.savetxt('stance_bert_mcc_tracker.txt',mcc_tracker,delimiter=',')
np.savetxt('stance_bert_logloss_tracker.txt',logloss_tracker,delimiter=',')
np.savetxt('stance_bert_f1_hammingloss_tracker.txt',hammingloss_tracker,delimiter=',')

Pkl_Filename = "stance_detection_bert.pkl"  
import pickle
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

In [None]:
# def validation(epoch):
#     model.eval()
#     fin_targets=[]
#     fin_outputs=[]
#     with torch.no_grad():
#         for _, data in enumerate(testing_loader, 0):
#             ids = data['ids'].to(device, dtype = torch.long)
#             mask = data['mask'].to(device, dtype = torch.long)
#             token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#             targets = data['targets'].to(device, dtype = torch.float)
#             outputs = model(ids, mask, token_type_ids)
#             fin_targets.extend(targets.cpu().detach().numpy().tolist())
#             fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
#     return fin_outputs, fin_targets

In [None]:
# for epoch in range(EPOCHS):
#     print("Epoch ",epoch)
#     outputs, targets = validation(epoch)
#     outputs = np.array(outputs) >= 0.5
#     accuracy = metrics.accuracy_score(targets, outputs)
#     f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
#     f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
#     print(f"Accuracy Score = {accuracy}")
#     print(f"F1 Score (Micro) = {f1_score_micro}")
#     print(f"F1 Score (Macro) = {f1_score_macro}")