## News Article Similarity Modelling
- Cross encoding 
- Translated data 
- Using Title 

In [1]:
from tqdm.auto import tqdm
import torch 
import random
from torch import nn
from transformers import RobertaTokenizer, PreTrainedTokenizer, DistilBertTokenizer, DistilBertModel, RobertaModel
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses
from datasets import Dataset
import pandas as pd
from transformers.optimization import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt 
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from torch.nn import CosineEmbeddingLoss
import transformers
import seaborn as sns
#Build up to SBERT model 

In [2]:
deviceNum = 1
device = torch.device("cuda:" + str(deviceNum) if torch.cuda.is_available() else "cpu")


In [3]:
GRAD_ACC = 6
EPOCHS = 3
FOLDS = 5
SEED = 85
BATCH_SIZE = 5

In [4]:
def check_mem():
    torch.cuda.empty_cache()
    a = torch.cuda.memory_allocated(deviceNum)/1024/1024/1024
    r = torch.cuda.memory_reserved(deviceNum)/1024/1024/1024
    print("torch.cuda.memory_allocated: %fGB"%a)
    print("torch.cuda.memory_reserved: %fGB"%r)
    print("torch.cuda.memory_free: %fGB"%(r-a))
    print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(deviceNum)/1024/1024/1024))
check_mem()

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.memory_free: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [5]:
#set seeds 
torch.manual_seed(85)
random.seed(85)

In [6]:
df = pd.read_csv("/home/blitt/projects/localNews/data/processed/translated_200_56.tsv", sep="\t")
#df = df.loc[(df["url1_lang"] == "en") & (df["url2_lang"] == "en")]

groundTruths = ["Geography", "Entities","Time", "Narrative", "Overall"]
features = ['text1Merged', 'text2Merged','url1_lang', 'url2_lang']
toSelect = groundTruths + features 
 
#get only the columns we need 
#TODO: do we need "pair_id"? 
leanDf = df[toSelect].dropna()

#rescale data from (0, 4): (0, 1)
for colName in groundTruths: 
    leanDf[colName] = 1 - ((leanDf[colName] - 1) / 3)

#reset index so it is contiguous set of numbers 
leanDf = leanDf.reset_index(drop=True)


In [7]:
class Model(nn.Module): 
    def __init__(self):
        super(Model,self).__init__()
        self.model = RobertaModel.from_pretrained('roberta-base')
        self.l1 = nn.Linear(768, 5).to(device)
        self.loss_func = torch.nn.MSELoss(reduction="mean")
        
    def mean_pooling(self, token_embeddings, attention_mask): 
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def forward(self, input_ids, attention_mask): 
        
        #encode sentence and get mean pooled sentence representation 
        encoding = self.model(input_ids, attention_mask=attention_mask)[0]  #all token embeddings
        meanPooled = self.mean_pooling(encoding, attention_mask)
       
        pred = self.l1(meanPooled)
        
        return pred


In [8]:
"""
Get the loss across multiple different objectives. 
Since overall is most important it gets more weight. 
"""
def getWeightedLoss(predTens, gtTens):
    loss_func = torch.nn.MSELoss(reduction="mean")
    #try getting rid of Tone and Style 
    LOSS_WEIGHTS = [.1, .1, .1, .1, .6]
    loss = 0.0
    for i in range(len(LOSS_WEIGHTS)): 
        
        #get ground truth value associated with this column name 
        currGT = gtTens[:, :, i]
        
        #TODO: figure out how to index properly here 
        pred = predTens[:, :, i]
        
        #get loss 
        loss += (loss_func(pred, currGT) * LOSS_WEIGHTS[i])
    return loss 

In [9]:
def validation(model, validLoader, loss_func): 
    model.eval()
    lossList = []
    predList = []
    GT = []
    
    i = True 
    for batch in validLoader: 

        # prepare batches and more all to the active device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #label = batch['ground_truth'].to(device).unsqueeze(1)

        #send batch info through model 
        pred = model(input_ids, attention_mask).unsqueeze(0)
        
        #get ground truths 
        gts = torch.stack([batch[colName] for colName in groundTruths], 0).to(device).T.unsqueeze(0)
        
        #get weighted loss relating to label prediction 
        loss = getWeightedLoss(gts, pred)
    
        #get output metrics 
        lossList.append(loss.detach().cpu().item())
        predList.append([float(item) for item in list(pred.detach().cpu().squeeze())])
        GT.append([float(item) for item in list(gts.detach().cpu().squeeze())])
        
        del pred
        del gts
        del loss
    #print(vGT)
    return [lossList, predList, GT]
        

In [10]:
#set up relevant variables 
def train(trainDataset, validDataset): 
    torch.cuda.empty_cache()
    #get loaders 
    trainLoader = torch.utils.data.DataLoader(
        trainDataset, batch_size=BATCH_SIZE, shuffle=True
    )
    validLoader = torch.utils.data.DataLoader(
        validDataset, batch_size=1, shuffle=True
    )
    
    trainLen = len(trainDataset)

    #load the model 
    model = Model().to(device)

    #TODO: double check on if reduction="mean" is the right move here...
    #could cosine similarity also work..? I think that is between the two predicted vectors though.. 
    loss_func = torch.nn.MSELoss(reduction="mean")

    # we would initialize everything first
    optim = torch.optim.Adam(model.parameters(), lr=2e-5)

    #set up scheduler
    # and setup a warmup for the first ~10% steps
    total_steps = int((trainLen*EPOCHS) / BATCH_SIZE)
    warmup_steps = int(0.1 * total_steps)
    scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=warmup_steps, num_training_steps=total_steps - warmup_steps)
    
    
    #now run training loop 
    lossList = []
    validMetrics = []
    subLossList = []
    
    # increase from 1 epoch if need be 
    for epoch in range(EPOCHS):
        torch.cuda.empty_cache()
        model.train()  # make sure model is in training mode

        # initialize the dataloader loop with tqdm (tqdm == progress bar)
        loop = tqdm(trainLoader, leave=True)

        validMetrics.append(validation(model, validLoader, loss_func))
        model.train()

        for i, batch in enumerate(loop): 
            # zero all gradients on each new step
            optim.zero_grad()

            # prepare batches and more all to the active device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch["attention_mask"].to(device)
           
            #send batch info through model 
            pred = model(input_ids, attention_mask).unsqueeze(0)

            #get gts 
            gts = torch.stack([batch[colName] for colName in groundTruths], 0).T.to(device).unsqueeze(0)
        
            #get loss relating to label prediction 
            loss = getWeightedLoss(gts, pred) / GRAD_ACC
            
            # using loss, calculate gradients and then optimize
            loss.backward()
            optim.step()

            #get mean loss over last 20 batches 
            if i % 20 == 0: 
                lossList.append(np.mean(subLossList))
                subLossList = []
                pass

            subLossList.append(float(loss.detach().item()))
        
            # update learning rate scheduler
            scheduler.step()

            # update the TDQM progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())
            del loss 
        print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
        print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
        print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

    validMetrics.append(validation(model, validLoader, loss_func))
    del model
    del trainLoader
    del validLoader
    return validMetrics 
    

In [11]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=FOLDS, shuffle=True)

In [12]:
metrics = []
transformers.logging.set_verbosity_error()
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

#we only want to sample validation data from the pairs that are both english 
enDf = leanDf[(leanDf["url1_lang"] == "en") & (leanDf["url2_lang"] == "en")]

print("Total df len: " +  str(len(leanDf)))
print("English df len: " +  str(len(enDf)))
#we create splits based on the position (not the actual index) of rows in enDf
#the idea is to get a split of the english dataset to set aside and then 
#grab everything else in the en + translated dataset to train on 
for i, (train_index, valid_index) in enumerate(kf.split(enDf)): 
    
    #grab the rows in enDf corresponding to the positions of our split 
    validDf = enDf.iloc[valid_index]
    
    #now get the actual indicies that have been selected
    #and subtract the indices in trainDf away from those 
    remainingIndices = list(set(leanDf.index) - set(validDf.index))
    trainDf = leanDf.loc[remainingIndices]
    print("###### " + str(i).upper() + " ######")
    print("Train df len: " + str(len(trainDf)))
    print("Valid df len: " + str(len(validDf)))
    
    #get data loaded in properly 
    trainDataset = Dataset.from_pandas(trainDf)
    validDataset = Dataset.from_pandas(validDf)
    
    #NOTE: here we use the merged text
    trainDataset = trainDataset.map(lambda x: tokenizer(x["text1Merged"], x["text2Merged"], max_length=512, padding="max_length", truncation=True))
    validDataset = validDataset.map(lambda x: tokenizer(x["text1Merged"], x["text2Merged"], max_length=512, padding="max_length", truncation=True))
    

    #only need the input information 
    trainDataset = trainDataset.remove_columns(["text1Merged", "text2Merged", "__index_level_0__"])
    validDataset = validDataset.remove_columns(["text1Merged", "text2Merged", "__index_level_0__"])
    
    # convert dataset features to PyTorch tensors
    formatColumns = groundTruths + ["input_ids", "attention_mask"]
    validDataset.set_format(type='torch', columns=formatColumns)
    trainDataset.set_format(type='torch', columns=formatColumns)

    validMetrics = train(trainDataset, validDataset)
    metrics.append(validMetrics)
    print(len(metrics))
    del trainDataset
    del validDataset
    !nvidia-smi

Total df len: 4806
English df len: 1738
###### 0 ######
Train df len: 4458
Valid df len: 348


  0%|          | 0/4458 [00:00<?, ?ex/s]

  0%|          | 0/348 [00:00<?, ?ex/s]

  0%|          | 0/892 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB
1
Wed Jan 18 11:52:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:04:00.0 Off |                  Off |
| 30%   24C    P8     8W / 230W |  12365MiB / 24256MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:0

  0%|          | 0/4458 [00:00<?, ?ex/s]

  0%|          | 0/348 [00:00<?, ?ex/s]

  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB
2
Wed Jan 18 12:06:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:04:00.0 Off |                  Off |
| 30%   23C    P8     8W / 230W |  12365MiB / 24256MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:0

  0%|          | 0/4458 [00:00<?, ?ex/s]

  0%|          | 0/348 [00:00<?, ?ex/s]

  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB
3
Wed Jan 18 12:20:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:04:00.0 Off |                  Off |
| 30%   30C    P8    16W / 230W |  12497MiB / 24256MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:0

  0%|          | 0/4459 [00:00<?, ?ex/s]

  0%|          | 0/347 [00:00<?, ?ex/s]

  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB
4
Wed Jan 18 12:34:43 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:04:00.0 Off |                  Off |
| 37%   67C    P2   168W / 230W |  12507MiB / 24256MiB |     36%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:0

  0%|          | 0/4459 [00:00<?, ?ex/s]

  0%|          | 0/347 [00:00<?, ?ex/s]

  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


  0%|          | 0/892 [00:00<?, ?it/s]

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB
5
Wed Jan 18 12:48:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:04:00.0 Off |                  Off |
| 38%   62C    P2   132W / 230W |  12507MiB / 24256MiB |     40%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:0

In [13]:
#saving this cell, undo if we need old code 

In [14]:
epochList = []
foldList = []
corrList = []
for epoch in range(EPOCHS): 
    for fold in range(FOLDS): 
        df = pd.DataFrame(metrics[fold][epoch]).T
        df.columns = ["loss", "pred", "true"]
        #corr = np.corrcoef(df["pred"], df["true"])

        predCols = ["pred" + item for item in groundTruths]
        gtCols = ["gt" + item for item in groundTruths]

        df[predCols] = pd.DataFrame(df["pred"].tolist(), index=df.index)
        df[gtCols] = pd.DataFrame(df["true"].tolist(), index=df.index)

        corrScores = []
        for colName in groundTruths: 
            corr = np.corrcoef(df["pred" + colName], df["gt" + colName])[1, 0]
            corrScores.append(corr)
        epochList.append(epoch)
        foldList.append(fold)
        corrList.append(corrScores)
        

In [15]:
df["pred"][0]

[0.42599859833717346,
 0.19772666692733765,
 0.7713460326194763,
 0.14386795461177826,
 0.17335015535354614]

In [16]:
df = pd.DataFrame({"epoch":epochList, "fold":foldList, "corrList":corrList})
df[gtCols] = pd.DataFrame(df["corrList"].tolist(), index=df.index)
df = df.drop(columns=["corrList"])

In [17]:
df.groupby(by="epoch").agg(np.mean)

Unnamed: 0_level_0,fold,gtGeography,gtEntities,gtTime,gtNarrative,gtOverall
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2,-0.009619,-0.080576,0.108484,-0.025543,-0.019963
1,2,0.679692,0.829344,0.356246,0.814812,0.852245
2,2,0.738554,0.843574,0.373039,0.834543,0.874976


In [18]:
df

Unnamed: 0,epoch,fold,gtGeography,gtEntities,gtTime,gtNarrative,gtOverall
0,0,0,0.048094,-0.023136,0.007887,0.026791,-0.175735
1,0,1,-0.212379,-0.045275,0.177753,-0.125399,-0.008635
2,0,2,0.142628,-0.019238,0.057628,-0.035693,0.019296
3,0,3,0.006282,-0.170403,0.106212,0.105626,0.006546
4,0,4,-0.03272,-0.144826,0.192941,-0.099038,0.058713
5,1,0,0.72159,0.872805,0.378181,0.841913,0.857899
6,1,1,0.63238,0.822974,0.336394,0.814075,0.853672
7,1,2,0.713599,0.823634,0.353132,0.821183,0.852131
8,1,3,0.686716,0.783451,0.317251,0.775192,0.829074
9,1,4,0.644175,0.843857,0.396271,0.821697,0.868447
