# AIR - Exercise in Google Colab

## Colab Preparation

Open via google drive -> right click: open with Colab

**Get a GPU**

Toolbar -> Runtime -> Change Runtime Type -> GPU

**Mount Google Drive**

* Download data and clone your github repo to your Google Drive folder
* Use Google Drive as the connection between GitHub and Colab (you could also use direct GitHub access, but re-submitting credentials might be annoying)
* Commit to Github locally from the synced drive

**Keep Alive**

While training, Google Colab tends to disconnect you. This might help: https://medium.com/@shivamrawat_756/how-to-prevent-google-colab-from-disconnecting-717b88a128c0

**Get Started**

Run the following script to mount Google Drive and install the needed Python packages. PyTorch comes pre-installed.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the data
# and the cloned repository stored on the drive become reachable from here.
from google.colab import drive
drive.mount('/content/drive')

# Install the project's Python dependencies (notebook shell escape).
# NOTE(review): the relative path is resolved against the notebook's current
# working directory - confirm it points at the repository's requirements.txt.
!pip install -r ../requirements.txt

In [None]:
import torch

# Sanity check of the PyTorch installation and the selected runtime.
# The random tensor is created on the GPU when one is available and on the
# CPU otherwise, so this cell reports a missing GPU instead of crashing on it.
sanity_device = "cuda" if torch.cuda.is_available() else "cpu"

print("Version:", torch.__version__)
print("Has GPU:", torch.cuda.is_available())  # should be True once a GPU runtime is selected
print("Random tensor:", torch.rand(10, device=sanity_device))  # check that pytorch works

# Main.py Replacement

-> add your code here

- Replace *air_test* with your google drive location in the sys.path.append()

In [None]:
from allennlp.common import Params, Tqdm
from allennlp.common.util import prepare_environment
from allennlp.data.dataloader import PyTorchDataLoader
from anyio import current_default_worker_thread_limiter
prepare_environment(Params({})) # sets the seeds to be fixed

import torch
import pandas as pd


from allennlp.data.vocabulary import Vocabulary

from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

from data_loading import *
from model_knrm import *
from model_conv_knrm import *
from model_tk import *

# RR@k
def RR(docIds, docScores, docIdsRelevant, k):
    """Return the reciprocal rank of the first relevant document in the top ``k``.

    The documents are ranked by ``docScores`` in descending order and only the
    ``k`` best-scored entries are inspected.

    Args:
        docIds: Document ids, aligned element-wise with ``docScores``.
        docScores: One score per entry of ``docIds``; higher scores rank first.
        docIdsRelevant: Ids of the relevant documents. May be empty, in which
            case the result is 0.
        k: Cut-off rank; documents ranked below ``k`` are ignored.

    Returns:
        ``1 / rank`` (rank is 1-based) of the best-ranked relevant document
        within the top ``k``, or ``0`` if none of them is relevant.
    """
    # Rank by score (descending) and keep only the top k entries.
    ranked = (
        pd.DataFrame({"doc_ids": docIds, "docScores": docScores})
        .sort_values("docScores", ascending=False)
        .reset_index(drop=True)
        .iloc[0:k]
    )

    # Positions (0-based) of the relevant documents within the top k.
    # `isin` also copes with an empty relevance list, which the previous
    # column-stacking of per-id comparisons could not.
    hits = ranked["doc_ids"].isin(list(docIdsRelevant)).to_numpy().nonzero()[0]

    if len(hits) > 0:
        return 1 / (hits[0] + 1)
    return 0

# Run configuration: adjust the paths below to point at your data directory.
# "model" selects the architecture that is instantiated further down.
config = dict(
    vocab_directory="../data/Part-2/allen_vocab_lower_10",
    pre_trained_embedding="../data/Part-2/glove.42B.300d.txt",
    model="conv_knrm",
    train_data="../data/Part-2/triples.train.tsv",
    validation_data="../data/Part-2/msmarco_tuples.validation.tsv",
    test_data="../data/Part-2/msmarco_tuples.test.tsv",
)


#
# data loading
#

# Vocabulary and pre-trained word embeddings, both located via `config`.
vocab = Vocabulary.from_files(config["vocab_directory"])
tokens_embedder = Embedding(vocab=vocab,
                           pretrained_file= config["pre_trained_embedding"],
                           embedding_dim=300,
                           trainable=True,
                           padding_index=0)
word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})

# recommended default params for the models (but you may change them if you want)
if config["model"] == "knrm":
    model = KNRM(word_embedder, n_kernels=11)
elif config["model"] == "conv_knrm":
    model = Conv_KNRM(word_embedder, n_grams=3, n_kernels=11, conv_out_dim=128)
elif config["model"] == "tk":
    model = TK(word_embedder, n_kernels=11, n_layers = 2, n_tf_dim = 300, n_tf_heads = 10)
else:
    # Fail right here with a clear message instead of a NameError on `model` below.
    raise ValueError(
        f"Unknown model {config['model']!r}; expected one of 'knrm', 'conv_knrm', 'tk'"
    )


# Report the number of trainable parameters and the network structure.
print('Model',config["model"],'total parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)
#use plain margin ranking as loss function, set margin=1 to get formula as given on p. 15 of the slides "introduction to neural re-ranking"
loss_function = torch.nn.MarginRankingLoss(margin=1)

#use Adam  as optimiser
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))

#use GPU if available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
cpu_device = torch.device('cpu')
print("Using device", device)    
# NOTE(review): the model is left on the CPU. The loops further down feed the
# batches exactly as the data loader yields them, so enabling the next line
# presumably also requires moving every batch to `device` - confirm before enabling.
#model.to(device)

#
# train
#

# Training data: read lazily, indexed with the shared vocabulary, batches of 32.
_triple_reader = IrTripleDatasetReader(
    lazy=True, max_doc_length=180, max_query_length=30
).read(config["train_data"])
_triple_reader.index_with(vocab)
loader = PyTorchDataLoader(_triple_reader, batch_size=32)

# Validation data (used inside the training loop): same length limits, batches of 128.
_tuple_reader = IrLabeledTupleDatasetReader(
    lazy=True, max_doc_length=180, max_query_length=30
).read(config["validation_data"])
_tuple_reader.index_with(vocab)
validation_loader = PyTorchDataLoader(_tuple_reader, batch_size=128)

# Relevance judgements; both id columns are converted to strings.
qrels = pd.read_csv(
    "../data/Part-2/msmarco_qrels.txt",
    sep="\t",
    header=None,
    names=["query_id", "hardcoded-Q0", "doc_id", "relevance-grade"],
)
qrels = qrels.astype({"query_id": str, "doc_id": str})


def _mean_reciprocal_rank(model, data_loader, qrels, k=10):
    """Return MRR@k of ``model`` over the labeled tuples yielded by ``data_loader``.

    The model is switched to evaluation mode and scored without gradient
    tracking. Scores are collected per query over ALL batches before ranking,
    so a query whose candidate documents are split across two batches is
    ranked once on its complete candidate list and counted once.

    Args:
        model: Callable as ``model(query_tokens, doc_tokens)``, returning one
            score per pair.
        data_loader: Yields batches with the keys ``query_id``, ``doc_id``,
            ``query_tokens`` and ``doc_tokens``.
        qrels: DataFrame with ``query_id`` and ``doc_id`` columns holding the
            relevant documents per query.
        k: Rank cut-off passed on to ``RR``.

    Returns:
        The mean of the per-query reciprocal ranks, or ``0.0`` when the loader
        yields no queries. Queries without any qrels entry contribute 0.
    """
    # query_id -> relevant doc ids, built once instead of filtering qrels per query
    relevant_by_query = qrels.groupby("query_id")["doc_id"].agg(list).to_dict()

    # query_id -> (doc ids, scores), accumulated across batches
    ranking_by_query = {}

    model.eval()
    with torch.no_grad():
        for batch in Tqdm.tqdm(data_loader):
            scores = model(batch["query_tokens"], batch["doc_tokens"]).detach().cpu().numpy()
            for query_id, doc_id, score in zip(batch["query_id"], batch["doc_id"], scores):
                doc_ids, doc_scores = ranking_by_query.setdefault(query_id, ([], []))
                doc_ids.append(doc_id)
                doc_scores.append(score)

    if not ranking_by_query:
        return 0.0

    rr_sum = 0
    for query_id, (doc_ids, doc_scores) in ranking_by_query.items():
        relevant = relevant_by_query.get(query_id, [])
        if relevant:
            rr_sum += RR(doc_ids, doc_scores, relevant, k)
    return rr_sum / len(ranking_by_query)


# Loss of the last training batch of every epoch.
train_loss = []
# Validation MRR@10 after every epoch.
validation_metric = []

for epoch in range(2):
    # (Re-)enable train mode; the validation pass below switches to eval mode.
    model.train()
    for batch in Tqdm.tqdm(loader):
        #set all gradients to 0
        optimizer.zero_grad()

        query, documentPos, documentNeg = batch["query_tokens"], batch["doc_pos_tokens"], batch["doc_neg_tokens"]
        #score the relevant and the non-relevant document for the same query
        outputPos = model(query, documentPos)
        outputNeg = model(query, documentNeg)
        #y=1 states that outputPos should be greater than outputNeg;
        #ones_like keeps the target on the same device as the model output
        target = torch.ones_like(outputPos)
        loss = loss_function(outputPos, outputNeg, target)
        #calculate the gradients backward the loss function
        loss.backward()

        #change parameters according to optimiser
        optimizer.step()

    print(f"Epoch: {epoch} Loss: {loss}")
    train_loss.append(loss.detach().cpu().numpy() )

    # MRR@10 on the validation set
    epoch_mrr = _mean_reciprocal_rank(model, validation_loader, qrels, k=10)
    print(f"Epoch: {epoch} Validation MRR@10: {epoch_mrr}")
    validation_metric.append(epoch_mrr)

#
# eval on the test set; uses its own loader name so that the train loader
# instantiated above is not overwritten
#

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["test_data"])
_tuple_reader.index_with(vocab)
test_loader = PyTorchDataLoader(_tuple_reader, batch_size=128)

test_metric = []
test_mrr = _mean_reciprocal_rank(model, test_loader, qrels, k=10)
print(f"Test MRR@10: {test_mrr}")
test_metric.append(test_mrr)
