In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets rank_bm25 sentence_transformers stanza

**Import Relevant Packages**

In [3]:
import json
import pandas as pd
import numpy as np
import time
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util, InputExample
from sentence_transformers import models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from gensim.parsing.preprocessing import preprocess_documents
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig
from gensim.parsing import preprocess_string
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import logging
import json
import torch
from torch.nn import CrossEntropyLoss
import ast
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch.nn import MarginRankingLoss
from tqdm import tqdm
import stanza
import spacy
import random
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Load and convert data to dataframes**

In [4]:
# Load the data
with open('/content/drive/MyDrive/NLP_data/train-claims.json') as f:
    train_data = json.load(f)

with open('/content/drive/MyDrive/NLP_data/dev-claims.json') as f:
    dev_data = json.load(f)

with open('/content/drive/MyDrive/NLP_data/test-claims-unlabelled.json') as f:
    test_data = json.load(f)

with open('/content/drive/MyDrive/NLP_data/evidence.json') as f:
    evidence_data = json.load(f)


print(len(test_data))
print(len(dev_data))

153
154


In [5]:
# function to get the evidence for a claim
def get_evidence(evidence_id):
    return evidence_data[evidence_id]

# function to get the claim text
def get_claim_text(data, claim_id):
    return data[claim_id]['claim_text']

# function to get the label for a claim
def get_label(data, claim_id):
    return data[claim_id]['claim_label']

# function to create a dataframe with the claim, evidence and label
def create_dataframe(data):
    rows = []
    for claim_id in data:
        for evidence_id in data[claim_id]['evidences']:
            row = {
                'claim_text': get_claim_text(data,claim_id),
                'evidence_text': get_evidence(evidence_id=evidence_id),
                'label': get_label(data,claim_id)
            }
            rows.append(row)
    df = pd.concat([pd.DataFrame([row]) for row in rows], ignore_index=True)
    return df

# create the dataframes
train_df = create_dataframe(train_data)
dev_df = create_dataframe(dev_data)
evidences = list(evidence_data.values())
evidence_df = pd.DataFrame(evidence_data.items(), columns=['evidence_id', 'evidence_text'])

#print entire evidence for a claim of the train_df
train_df


Unnamed: 0,claim_text,evidence_text,label
0,Not only is there no scientific evidence that ...,At very high concentrations (100 times atmosph...,DISPUTED
1,Not only is there no scientific evidence that ...,Plants can grow as much as 50 percent faster i...,DISPUTED
2,Not only is there no scientific evidence that ...,Higher carbon dioxide concentrations will favo...,DISPUTED
3,El Niño drove record highs in global temperatu...,While ‘climate change’ can be due to natural f...,REFUTES
4,El Niño drove record highs in global temperatu...,This acceleration is due mostly to human-cause...,REFUTES
...,...,...,...
4117,But abnormal temperature spikes in February an...,The coastline sees significantly mild temperat...,NOT_ENOUGH_INFO
4118,Sending oscillating microwaves from an antenna...,"Dielectric heating, also known as electronic h...",SUPPORTS
4119,Sending oscillating microwaves from an antenna...,An example is absorption or emission of radio ...,SUPPORTS
4120,Sending oscillating microwaves from an antenna...,"Water, fat, and other substances in the food a...",SUPPORTS


**Preprocess Data**

Preprocess the corpus of text documents using the SpaCy library. The motivation behind this code is to prepare the text data for retrieval and improve performance while using BM-25 and TF-IDF. The preprocessed text is more suitable for NLP tasks as it removes unnecessary information and reduces the complexity of the text, making further analysis more efficient and accurate. 

Here's a step-by-step explanation:

1. The `stopwords` from the nltk library are loaded into a set called `stop_words`. These are common words like 'is', 'the', 'and', etc., that are often removed during preprocessing as they do not provide much information for downstream tasks.

2. The SpaCy English model `en_core_web_sm` is loaded into the `nlp` object. This model will be used for tokenization, lemmatization, and stop word and punctuation removal.

3. The `preprocess_text_spacy` function is defined to preprocess a single document. It takes a SpaCy Doc object as input and returns a list of lemmatized tokens. Lemmatization is the process of reducing a word to its base or root form (e.g., 'running' becomes 'run'). The function also converts tokens to lowercase and removes stop words, punctuation, and non-alphabetic tokens.

4. The `preprocess_corpus_spacy` function is defined to preprocess a corpus of documents. It divides the corpus into batches for more efficient processing. The function uses the `nlp.pipe` method to process the documents in batches, which is faster than processing one document at a time. 

Deep Learning methods already tokenise and embedd the corpus of text and they do not require this step.

In [6]:
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text_spacy(doc):
    """
    Preprocess a single document using SpaCy.

    Lemmatizes tokens, converts them to lowercase, and removes stop words, punctuation, 
    and non-alphabetic tokens.

    Args:
        doc: A SpaCy Doc object representing a document.

    Returns:
        lemmatized_tokens: A list of preprocessed, lemmatized tokens from the document.
    """
    lemmatized_tokens = [
        token.lemma_.lower() for token in doc
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    return lemmatized_tokens

def preprocess_corpus_spacy(corpus, nlp=nlp, n_process=2, batch_size=5000):
    """
    Preprocess a corpus of documents using SpaCy's capabilities.

    This function divides the corpus into batches for more efficient processing.

    Args:
        corpus: A list of documents to be preprocessed.
        nlp: A SpaCy Language object for text processing. Defaults to a global `nlp` object.
        n_process: The number of processors to use. Defaults to 2.
        batch_size: The number of documents to process at a time. Defaults to 5000.

    Returns:
        preprocessed_corpus: A list of preprocessed documents, where each document is a list of lemmatized tokens.
    """
    preprocessed_corpus = []

    for batch in tqdm(nlp.pipe(corpus, batch_size=batch_size), total=len(corpus), desc="Processing documents"):
        preprocessed_corpus.append(preprocess_text_spacy(batch))

    return preprocessed_corpus



**Preprocess the data in the format required for BM25 and TF-IDF**

In [None]:
preprocessed_corpus = preprocess_corpus_spacy(evidences)
processed_evidence_df = evidence_df
processed_evidence_df['processed_evidence'] = preprocessed_corpus
processed_evidence_df.to_csv('/content/drive/MyDrive/Colab_Notebooks/NLP-project/data/post-processed-evidence.csv')
# preprocessed_corpus_tfidf = [' '.join(word_list) for word_list in preprocessed_corpus]
# preprocessed_corpus_tfidf

**Please restart the kernal after saving the model and then load the data after saving it takes 20 minutes for preprocessing**

In [7]:
processed_corpus = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NLP-project/data/post-processed-evidence.csv')
preprocessed_corpus = processed_corpus['processed_evidence'].apply(ast.literal_eval).tolist()
preprocessed_corpus_tfidf = processed_corpus['processed_evidence'].apply(ast.literal_eval).apply(' '.join)

*Run the code to get the vectors for both BM25 and TF-IDF*

In [9]:
from rank_bm25 import BM25Okapi
vectorizer = TfidfVectorizer()

bm25 = BM25Okapi(preprocessed_corpus)
vectors = vectorizer.fit_transform(preprocessed_corpus_tfidf)


In [10]:
import torch
from collections import Counter

def get_relevant_evidence_tfidf(claim, vectors, evidences, top_n=7):
    """
    Get the top `top_n` most relevant evidences for the claim, using TF-IDF.

    Args:
        claim: The claim to get evidence for.
        vectors: The TF-IDF vectors for the evidences.
        evidences: The evidences.
        top_n: The number of top-n evidences to return.

    Returns:
        A tuple of (the top `top_n` evidences, the indices of the top `top_n` evidences in the `evidences` list).
    """

    # Process the claim
    processed_claim = preprocess_corpus_spacy(claim)

    # Get the tfidf vector for the claim
    claim_vector = vectorizer.transform([processed_claim])

    # Get the top 5 most similar evidence
    top_n_evidences = cosine_similarity(claim_vector, vectors).argsort()[0][-top_n:]
    # Get the top 5 most similar evidence unprocessed
 
    filtered_items = [item for i, item in enumerate(evidence_data.items()) if i in top_n_evidences]
    most_similar_evidence_keys, most_similar_evidence = zip(*filtered_items)

    most_similar_evidence_keys = list(most_similar_evidence_keys)
    most_similar_evidence = list(most_similar_evidence)
    return most_similar_evidence, most_similar_evidence_keys

def get_relevant_evidence_bm25(claim, evidence_data, bm25, top_n=7):
    """
    Get the top `top_n` most relevant evidences for the claim, using BM25.

    Args:
        claim: The claim to get evidence for.
        evidence_data: The evidence data, in the form of a dictionary mapping from evidence id to evidence text.
        bm25: The BM25 model.
        top_n: The number of top-n evidences to return.

    Returns:
        A tuple of (the top `top_n` evidences, the indices of the top `top_n` evidences in the `evidence_data` dictionary).
    """
    processed_claim = preprocess_corpus_spacy([claim])
  

    # Get BM25 scores for each evidence sentence
    bm25_scores = bm25.get_scores(processed_claim[0]) # needed for spacy
    
    top_n_indices = bm25_scores.argsort()[::-1][:top_n]
    filtered_items = [item for i, item in enumerate(evidence_data.items()) if i in top_n_indices]
    most_similar_evidence_keys, most_similar_evidence = zip(*filtered_items)

    most_similar_evidence_keys = list(most_similar_evidence_keys)
    most_similar_evidence = list(most_similar_evidence)
    
    return most_similar_evidence, most_similar_evidence_keys


claim = 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'
# # Get relevant evidences using BM25
most_similar_evidence_bm25, most_similar_evidence_keys_bm25 = get_relevant_evidence_bm25(claim, evidence_data, bm25, top_n=15)
print(most_similar_evidence_bm25)

Processing documents: 100%|██████████| 1/1 [00:00<00:00, 21.17it/s]


['Scavengers play an important role in the ecosystem by consuming the dead animal and plant material.', 'This same high oxidising potential, however, causes ozone to damage mucous and respiratory tissues in animals, and also tissues in plants, above concentrations of about.', 'The concentration of secondary metabolites such as phenylpropanoids and flavonoids can also be altered in plants exposed to high concentrations of CO 2.', 'These ecosystems hold a globally significant plant and animal biodiversity, with unique habitats and high levels of endemism.', 'Sustainable agriculture is the cultivation of plant and animal materials in a manner that preserves plant and animal ecosystems and that can improve soil health and soil fertility over the long term.', 'At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whitef

**Create the augmeted training data**

We perform data augmentation on a given DataFrame `df`. The motivation behind this code is to increase the amount of training data, which can help improve the performance of machine learning models, especially deep learning models, and prevent overfitting.

Here's a step-by-step explanation:

1. The function `augment_data` is defined to create new instances of data. It takes as input a DataFrame `df`, a number of claims `num_claims`, and a number of evidences `num_evidences`.

2. The function first identifies unique claims that are labeled as "REFUTES" and "DISPUTED" from the DataFrame.

3. For each label ("REFUTES" and "DISPUTED"), it randomly samples a specified number of claims (`num_claims`).

4. For each sampled claim, it retrieves the most relevant evidence using the `get_relevant_evidence_bm25` function. This function likely uses the BM25 algorithm, a popular information retrieval technique, to rank the evidence based on its relevance to the claim.

5. It then selects the top `num_evidences` from the ranked list of evidence and creates a new instance for each evidence. Each new instance consists of the claim, the evidence, and the label.

6. These new instances are appended to a list, which is then converted into a DataFrame `augmented_df`.

7. The function returns the augmented DataFrame.

8. The augmented data is then concatenated with the original training data to create a larger training set `train_df_augmented`.

9. Finally, the augmented training data is saved to a CSV file.

By augmenting the data in this way, the code increases the diversity and amount of training data, which can help the model generalize better to unseen data and prevent overfitting. This is particularly important for deep learning models, which have a large number of parameters and are prone to overfitting if not provided with sufficient training data.

In [None]:
def augment_data(df, num_claims=110, num_evidences=7):
    refutes_claims = df[df["label"] == "REFUTES"]["claim_text"].unique()
    disputed_claims = df[df["label"] == "DISPUTED"]["claim_text"].unique()

    new_instances = []

    for label, claims in [("REFUTES", refutes_claims), ("DISPUTED", disputed_claims)]:
        sampled_claims = np.random.choice(claims, num_claims, replace=False)

        for claim in tqdm(sampled_claims, desc=f"Processing {label} claims"):
            most_similar_evidence_bm25, _ = get_relevant_evidence_bm25(claim, evidence_data, bm25, top_n=15)

            for evidence in most_similar_evidence_bm25[:num_evidences]:
                new_instances.append({
                    "claim_text": claim,
                    "evidence_text": evidence,
                    "label": label
                })

    augmented_df = pd.DataFrame(new_instances)
    return augmented_df

augmented_data = augment_data(train_df)
train_df_augmented = pd.concat([train_df, augmented_data], ignore_index=True)
train_df_augmented.to_csv('/content/drive/MyDrive/Colab_Notebooks/NLP-project/data/train_df_augmented.csv', index=False)

In [11]:
train_df_augmented = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NLP-project/data/train_df_augmented.csv')

**Evidence retrieval with Sentence Transformers**

*Please restart the kernal before training if you use V100 GPU*

**Prepare data for training**

We prepare the training data for a machine learning model. We iterate over the rows of the augmented training DataFrame `train_df_augmented` and creates an `InputExample` object for each row. The `InputExample` object is a data structure used by the Transformers library to encapsulate a pair of related texts and optionally a label.

Here's a step-by-step explanation:

1. An empty list `train_samples` is initialized. This list will hold the `InputExample` objects.

2. The code then iterates over each row in the `train_df_augmented` DataFrame using the `iterrows()` function.

3. For each row, it extracts the claim text (`claim_text`), the evidence text (`evidence_text`), and the label (`label`).

4. It checks if the label is one of 'SUPPORTS', 'REFUTES', or 'DISPUTED'. If the label is not one of these, the row is skipped. This is likely done to ensure that only instances with these specific labels are used for training.

5. If the label is one of the specified labels, it creates an `InputExample` object with the claim text and the evidence text. Note that the label is not included in the `InputExample`. This suggests that the model being trained is an unsupervised model or the labels are used elsewhere in the code.

6. The `InputExample` object is then appended to the `train_samples` list.

The motivation behind this code is to convert the DataFrame into a format that can be directly used by a machine learning model for training. The `InputExample` objects provide a standardized way to pair related texts, which is useful for our text classification task.

In [12]:
train_samples = []

for index, row in train_df_augmented.iterrows():
    claim_text = row['claim_text']
    evidence_text = row['evidence_text']
    label = row['label']

    if label in ['SUPPORTS', 'REFUTES', 'DISPUTED']:
        train_samples.append(InputExample(texts=[claim_text, evidence_text]))
    


In [13]:
from sentence_transformers import datasets

batch_size = 16

loader = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size)


**Training the Sentence Embedding model for better retrieval**

*Training takes 33 minutes approximately*

We set up, train, and save a SentenceTransformer model, which is a type of model used for generating sentence embeddings. We finetune our model using the 'facebook/bart-large-mnli' as the base model. The model is trained using the Multiple Negatives Ranking Loss. This loss function is designed for ranking tasks, where the goal is to learn embeddings that are close for positive pairs and far apart for negative pairs.
The motivation behind this code is to finetune the pre-trained SentenceTransformer model on creating better embeddings or allow the model to improve its internal representation of the climate fact corpus which its not previously trained on. Fine-tuning allows the model to adapt to the specific characteristics of the task, which can lead to better performance compared to using the pre-trained model directly.

In [None]:
from sentence_transformers import SentenceTransformer, losses, models, InputExample, evaluation
from torch.utils.data import DataLoader

# Load a pre-trained SentenceTransformer model
model_name = 'facebook/bart-large-mnli'

# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# Define the loss functions
loss = losses.MultipleNegativesRankingLoss(model)


# Fine-tune the model
num_epochs = 6
warmup_steps = int(len(loader) * num_epochs * 0.1)
model_save_path = '/content/drive/MyDrive/Colab_Notebooks/NLP-project/sentence-transformer-facebook-bart-large-mnli-final'

logging.basicConfig(level=logging.INFO)
model.fit(
    train_objectives=[(loader, loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    show_progress_bar=True
)



**Fine tuning Prediction Model**

We define a custom PyTorch `Dataset` class, `CLIMATEFEVERDataBert`, for preparing the CLIMATEFEVER data for BERT-like models. This class is designed to handle the preprocessing and tokenization of the data, and to provide a convenient interface for loading the data in batches during training.

Here's a step-by-step explanation:

1. The class is initialized with a dictionary of pandas DataFrames and a base model. The dictionary should contain the training and validation data, and the base model is used to initialize the BERT tokenizer.

2. The `init_data` method is called to initialize the training and validation data. This method calls the `load_data` method to load and tokenize the data from the DataFrames.

3. The `load_data` method iterates over the rows of the DataFrame, tokenizes the claim and evidence texts using the BERT tokenizer, and maps the label to an integer. The tokenized inputs and labels are then stored in PyTorch tensors and returned as a `TensorDataset`.

4. The `get_data_loaders` method creates `DataLoader` objects for the training and validation data. These objects can be used to load the data in batches during training.

The motivation behind this code is to encapsulate the data preprocessing and loading steps in a single class, which simplifies the training code and makes it easier to manage the data. BERT models require special tokens to be added to the input text to work correctly. Specifically, each input sequence should start with a `[CLS]` token and end with a `[SEP]` token. The `[CLS]` token is used as an aggregate representation for classification tasks, and the `[SEP]` token is used to separate different sentences or text segments. The `load_data` method of the `CLIMATEFEVERDataBert` class takes care of this step. The `DataLoader` objects allow for efficient loading of the data in batches, which is essential for training large models like BERT. 

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

class CLIMATEFEVERDataBert(Dataset):
    """
    A custom Dataset class for preparing CLIMATEFEVER data for BERT-like models.

    This class takes as input a dictionary of DataFrames and a base model, 
    and creates a PyTorch Dataset suitable for training and validation.

    Attributes:
        label_dict: A dictionary mapping string labels to integers.
        train_df: DataFrame containing the training data.
        val_df: DataFrame containing the validation data.
        tokenizer: A BERT tokenizer.
        train_data: TensorDataset for the training data.
        val_data: TensorDataset for the validation data.
    """

    def __init__(self, ds, base_model):
        """
        Initiate the dataset object with dataframes and base model.

        Args:
            ds: A dictionary of pandas DataFrames.
            base_model: A pretrained model to base the tokenizer on.
        """
        self.label_dict = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}

        self.train_df = ds['train']
        self.val_df = ds['dev']

        self.tokenizer = AutoTokenizer.from_pretrained(base_model, do_lower_case=True)
        self.train_data = None
        self.val_data = None
        self.init_data()

    def init_data(self):
        """
        Initialize the train and validation data.
        """
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)

    def load_data(self, df):
        """
        Load and tokenize data from a DataFrame.

        Args:
            df: A DataFrame containing the data.

        Returns:
            dataset: A TensorDataset containing tokenized inputs and labels.
        """
        MAX_LEN = 512
        token_ids = []
        mask_ids = []
        y = []

        claim_list = df['claim_text']
        evidence_list = df['evidence_text']
        label_list = df['label']

        for (claim, evidence, label) in zip(claim_list, evidence_list, label_list):
            encoded_inputs = self.tokenizer.encode_plus(claim, evidence, add_special_tokens=True, max_length=MAX_LEN,
                                                            truncation=True, padding='max_length', return_tensors='pt')

            pair_token_ids = encoded_inputs['input_ids'][0]
            attention_mask_ids = encoded_inputs['attention_mask'][0]

            token_ids.append(pair_token_ids)
            mask_ids.append(attention_mask_ids)
            y.append(self.label_dict[label])

        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, y)
        print(len(dataset))
        return dataset



    def get_data_loaders(self, batch_size=32, shuffle=True):
        """
        Get DataLoader objects for the training and validation data.

        Args:
            batch_size: Number of samples per batch. Defaults to 32.
            shuffle: Whether to shuffle the data before loading. Defaults to True.

        Returns:
            train_loader, val_loader: DataLoader objects for the training and validation data.
        """
        train_loader = DataLoader(
            self.train_data,
            shuffle=shuffle,
            batch_size=batch_size
        )

        val_loader = DataLoader(
            self.val_data,
            shuffle=shuffle,
            batch_size=batch_size
        )

        return train_loader, val_loader


    


In [None]:
ds = {"train": train_df_augmented, "dev": dev_df}
data = CLIMATEFEVERDataBert(ds, 'distilroberta-base')
train_loader, val_loader = data.get_data_loaders(batch_size=37, shuffle=True)


In [None]:

num_labels = 4
base_model = 'distilroberta-base'
# # Load the model configuration
config = AutoConfig.from_pretrained(base_model, num_labels=num_labels)
# Adjust the dropout rates
#config.hidden_dropout_prob = 0.15  # Adjust the dropout rate for hidden layers
#config.attention_probs_dropout_prob = 0.3  # Adjust the dropout rate for attention probabilities

# Load the pretrained model with the updated configuration
prediction_model = AutoModelForSequenceClassification.from_pretrained(base_model, ignore_mismatched_sizes=True ,num_labels=num_labels)

print(prediction_model)

In [18]:
from transformers import AdamW
# Instantiate the optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, correct_bias=False)



In [19]:
def multi_acc(y_pred, y_test):
    acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
    return acc

In [20]:
def train(model, train_loader, val_loader, optimizer, accumulation_steps=4):
    """
    Train the model on the provided data.

    The training procedure also includes validation steps to check the performance of the model on the validation set.

    Args:
        model: The model to be trained.
        train_loader: DataLoader object containing the training data.
        val_loader: DataLoader object containing the validation data.
        optimizer: The optimizer used to update the model's parameters.
        accumulation_steps: The number of steps for gradient accumulation. Defaults to 4.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    EPOCHS = 1
    for epoch in range(EPOCHS):
        print(f"\nEpoch {epoch+1}/{EPOCHS}")
        # set the model to train mode
        model.train()
        train_losses = []
        train_accuracies = []
        # Reset gradients outside the mini-batch loop
        optimizer.zero_grad()  
        for i, (pair_token_ids, mask_ids, y) in tqdm(enumerate(train_loader), total=len(train_loader)):
            # Move the tensors to the appropriate device
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            labels = y.to(device)

            output = model(pair_token_ids,
                           attention_mask=mask_ids,
                           labels=labels)
            # Store the loss and accuracy
            loss = output.loss
            logits = output.logits

            train_losses.append(loss.item())
            train_accuracies.append(multi_acc(logits, labels).item())

            loss.backward()
            # Update weights after accumulation_steps mini-batches
            if (i + 1) % accumulation_steps == 0:  
                optimizer.step()
                optimizer.zero_grad()

        print(f"Epoch: {epoch + 1}, Training Loss: {sum(train_losses) / len(train_losses)}, Training Accuracy: {sum(train_accuracies) / len(train_accuracies)}")

        # Set the model to evaluation mode
        model.eval()
        val_losses = []
        val_accuracies = []
        for i, (pair_token_ids, mask_ids, y) in tqdm(enumerate(val_loader), total=len(val_loader)):
            # Ensure no gradients are calculated
            with torch.no_grad():
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                labels = y.to(device)

                output = model(pair_token_ids,
                               attention_mask=mask_ids,
                               labels=labels)

                loss = output.loss
                logits = output.logits

                val_losses.append(loss.item())
                val_accuracies.append(multi_acc(logits, labels).item())

        print(f"Epoch: {epoch + 1}, Validation Loss: {sum(val_losses) / len(val_losses)}, Validation Accuracy: {sum(val_accuracies) / len(val_accuracies)}")


    

**Train the model**

*Takes 5 minutes to train*

In [None]:
train(prediction_model, train_loader, val_loader, optimizer)


**Save Prediction model**

In [None]:
path = "/content/drive/MyDrive/Colab_Notebooks/NLP-project/model_checkpoints-distilroberta-climatefever"
torch.save(model.state_dict(),path)

**Load sentence retriever model if not loaded**

In [None]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """
    Apply mean pooling on the model's output.

    Args:
        model_output: The output of the transformer model.
        attention_mask: The attention mask applied during tokenization.

    Returns:
        Mean pooled representation of the model's output.
    """
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    

def find_best_matches(claim, evidences, evidence_keys, top_n: int, model, tokenizer):

    """
    Given a claim and a list of evidences, find the top_n evidences that best match the claim.

    Args:
        claim: A string representing the claim.
        evidences: A list of strings, each representing an evidence.
        evidence_keys: A list of keys corresponding to the evidences.
        top_n: An integer specifying how many top matching evidences to return.
        model: The transformer model used for encoding.
        tokenizer: The tokenizer used for encoding.

    Returns:
        top_evidences: A list of top_n evidences that best match the claim.
        top_keys: The keys corresponding to the top_evidences.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Tokenize and encode the claim
    encoded_claim = tokenizer(claim, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output_claim = model(**encoded_claim)
    claim_embedding = mean_pooling(model_output_claim, encoded_claim['attention_mask'])

    # Tokenize and encode the evidences
    encoded_evidences = tokenizer(evidences, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output_evidences = model(**encoded_evidences)
    evidence_embeddings = mean_pooling(model_output_evidences, encoded_evidences['attention_mask'])

    # Calculate cosine similarities and get the top indices
    similarities = cosine_similarity(claim_embedding.cpu(), evidence_embeddings.cpu())[0]
    top_indices = np.argsort(similarities)[-top_n:][::-1]

    # Get the top evidences and their keys
    top_evidences = [evidences[i] for i in top_indices]
    top_keys = [evidence_keys[i] for i in top_indices]

    return top_evidences, top_keys



retriever_model_7 = '/content/drive/MyDrive/Colab_Notebooks/NLP-project/sentence-transformer-facebook-bart-large-mnli-final' # model saved above and used for final submission



In [11]:
def majority_vote_prediction(claim, relevant_evidences, model, tokenizer):
    """
    Given a claim and a list of relevant evidences, predict the label of the claim based on majority voting.

    Args:
        claim: A string representing the claim.
        relevant_evidences: A list of strings, each representing an evidence.
        model: The transformer model used for prediction.
        tokenizer: The tokenizer used for encoding.

    Returns:
        majority_vote: The label that received the most votes.
    """

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    label_mapping = ['SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO', 'DISPUTED']
    predictions = []
    probabilities_list = []

    # Loop over each evidence and make a prediction
    for evidence in relevant_evidences:
        # Tokenize and encode the claim and evidence pair
        encoded_pair = tokenizer(claim, evidence, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(**encoded_pair)

        scores = outputs.logits
        probabilities = torch.softmax(scores, dim=1).cpu().detach().numpy()[0]
        probabilities_list.append(probabilities)
        # Append the predicted label to the list
        predictions.append(label_mapping[torch.argmax(scores, dim=1).item()])
    # Pick the class with highest vote
    majority_vote = max(set(predictions), key=predictions.count)
    return majority_vote




def get_evidence_make_prediction(claim_data, evidence_data, vectors, bm25, top_n, mode, model, tokenizer, prediction_model, prediction_tokenizer):
    """
    Given claim data and evidence data, find relevant evidences and make predictions.

    Args:
        claim_data: A dictionary where each key is a claim ID and the value is a dictionary containing the claim text.
        evidence_data: A dictionary containing the evidence data.
        vectors: Vector representations of the evidences.
        bm25: An instance of the BM25 model.
        top_n: An integer specifying how many top matching evidences to return.
        mode: A string indicating the method to use for finding relevant evidences. Options are "tfidf", "bm25", and "bm25+pretrained".
        model: The transformer model used for encoding.
        tokenizer: The tokenizer used for encoding.
        prediction_model: The model used for making predictions.
        prediction_tokenizer: The tokenizer used for encoding the claims and evidences for the prediction model.

    Returns:
        results: A dictionary where each key is a claim ID and the value is a dictionary containing the claim text, 
                 the predicted label, and the keys of the relevant evidences."""

    results = {}
    # Loop over each claim in the claim data
    for i, claim_id in enumerate(claim_data):
        print("Claim no: ", i)
        claim = claim_data[claim_id]['claim_text']

        # Find relevant evidences based on the specified mode
        if mode == "tfidf":
            relevant_evidences, relevant_evidence_keys = get_relevant_evidence_tfidf(claim, vectors, evidence_data, top_n)
        elif mode == "bm25":
            relevant_evidences, relevant_evidence_keys = get_relevant_evidence_bm25(claim, evidence_data, bm25, top_n)
        elif mode == "bm25+pretrained":
            # Get relevant evidences using BM25
            top_evidences, top_keys = get_relevant_evidence_bm25(claim, evidence_data, bm25, top_n=50)

            relevant_evidences, relevant_evidence_keys = find_best_matches(claim, top_evidences, top_keys, top_n, model, tokenizer)

        predicted_label = majority_vote_prediction(claim, relevant_evidences, prediction_model, prediction_tokenizer)
      
        results[claim_id] = {
            "claim_text": claim,
            "claim_label": predicted_label,
            "evidences": relevant_evidence_keys
        }
        
    return results

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

**INFERENCE**

*Takes 22-27 minutes with GPU on 154 samples to complete*

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
prediction_path = '/content/drive/MyDrive/Colab_Notebooks/NLP-project/model_checkpoints-distilroberta-base-climatefever'
base_model = 'distilroberta-base'
prediction_model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=4, ignore_mismatched_sizes=True)
prediction_model.load_state_dict(torch.load(prediction_path, map_location=device))
prediction_model.to(device)
prediction_model.eval()
prediction_tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

retriever_model_7 = '/content/drive/MyDrive/Colab_Notebooks/NLP-project/sentence-transformer-facebook-bart-large-mnli'
model = AutoModel.from_pretrained(retriever_model_7)
tokenizer = AutoTokenizer.from_pretrained(retriever_model_7)
predicted_results_test = get_evidence_make_prediction(test_data, evidence_data, vectors, bm25, 3, "bm25+pretrained", model,tokenizer,  prediction_model,prediction_tokenizer)

with open('test-claims-predictions.json', 'w') as fp:
    json.dump(predicted_results_test, fp)