## News Article Similarity Modelling
- Cross encoding 
- Translated data 
- Using Title 

In [1]:
from tqdm.auto import tqdm
import torch 
import random
from torch import nn
from transformers import LongformerConfig, LongformerModel, PreTrainedTokenizer, LongformerTokenizer
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses
from datasets import Dataset
import pandas as pd
from transformers.optimization import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt 
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from torch.nn import CosineEmbeddingLoss
import transformers
import seaborn as sns
import sys
#Build up to SBERT model 

In [2]:

"""
for actually running this code in a file 

jsonPath = sys.argv[1]
with open(jsonPath, "r") as f: 
    my_dict = json.load(f)
    BAT
EPOCHS = int(my_dict["EPOCHS"])
BATCH_SIZE = int(my_dict["BATCH_SIZE"])
DROPOUT = float(my_dict["DROPOUT"])
REG_ALPHA = float(my_dict["REG_ALPHA"])
"""
BATCH_SIZE = 2
EPOCHS = 4 
DROPOUT = .25
REG_ALPHA = .5

In [3]:
deviceNum = 5
device = torch.device("cuda:" + str(deviceNum) if torch.cuda.is_available() else "cpu")


In [4]:
def check_mem():
    torch.cuda.empty_cache()
    a = torch.cuda.memory_allocated(deviceNum)/1024/1024/1024
    r = torch.cuda.memory_reserved(deviceNum)/1024/1024/1024
    print("torch.cuda.memory_allocated: %fGB"%a)
    print("torch.cuda.memory_reserved: %fGB"%r)
    print("torch.cuda.memory_free: %fGB"%(r-a))
    print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(deviceNum)/1024/1024/1024))
check_mem()

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.memory_free: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [5]:
#set seeds 
torch.manual_seed(85)
random.seed(85)

In [6]:
df = pd.read_csv("/shared/3/projects/benlitterer/localNews/NetworkMVP/translatedCleaned.tsv", sep="\t")
#df = df.loc[(df["url1_lang"] == "en") & (df["url2_lang"] == "en")]

groundTruths = ["Overall"]
features = ['text1', 'text2', 'title1', 'title2', 'url1_lang', 'url2_lang']
toSelect = groundTruths + features 

#get only the columns we need 
#TODO: do we need "pair_id"? 
leanDf = df[toSelect].dropna()

#rescale data from (0, 4): (0, 1)
for colName in groundTruths: 
    leanDf[colName] = 1 - ((leanDf[colName] - 1) / 3)

#reset index so it is contiguous set of numbers 
leanDf = leanDf.reset_index(drop=True)

#now combine title and text together 
#first add ". " to title 
leanDf["title1"] = leanDf["title1"].apply(lambda x: x + ". ")
leanDf["title2"] = leanDf["title2"].apply(lambda x: x + ". ")

leanDf["text1"] = leanDf["title1"] + leanDf["text1"]
leanDf["text2"] = leanDf["title2"] + leanDf["text2"]


In [7]:
df.columns

Index(['Unnamed: 0', 'url1_lang', 'url2_lang', 'pair_id', 'link1', 'link2',
       'ia_link1', 'ia_link2', 'Geography', 'Entities', 'Time', 'Narrative',
       'Overall', 'Style', 'Tone', 'id1', 'id2', 'ogText1', 'ogTitle1',
       'ogText2', 'ogTitle2', 'text1', 'title1', 'text2', 'title2'],
      dtype='object')

In [8]:
leanDf["url2_lang"].value_counts()

en    2296
de     845
es     560
tr     455
pl     310
ar     268
fr      72
Name: url2_lang, dtype: int64

In [9]:
#NOTE: do a language cutoff 
#langList = ["en", "fr", "es"]
#leanDf = leanDf[(leanDf["url1_lang"].isin(langList)) & (leanDf["url2_lang"].isin(langList))]

In [10]:
#we only want to sample validation data from the pairs that are both english 
enDf = leanDf[(leanDf["url1_lang"] == "en") & (leanDf["url2_lang"] == "en")]
validProp = .1
validCount = int(validProp * len(enDf))
print(validCount)
validIndices = random.sample(list(enDf.index), validCount)

#get dataframe with indices of only the original english pairs 
validDf = enDf.loc[validIndices]

#train data should be all rows that aren't in the validation set 
#here we are taking a set difference and then indexing what remains 
trainDf = leanDf.loc[set(leanDf.index) - set(validIndices)]

173


  trainDf = leanDf.loc[set(leanDf.index) - set(validIndices)]


In [11]:
#get data loaded in properly 
trainDataset = Dataset.from_pandas(trainDf)
validDataset = Dataset.from_pandas(validDf)

In [12]:
#link: https://huggingface.co/sentence-transformers/all-mpnet-base-v2
#example of tokenizing 
#tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
#tokenizer = AutoTokenizer.from_pretrained('Giyaseddin/distilbert-base-cased-finetuned-fake-and-real-news-dataset')

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

In [13]:
from transformers.utils import logging
logging.set_verbosity_error()
trainDataset = trainDataset.map(lambda x: tokenizer(x["text1"], x["text2"], max_length=1024, padding="max_length", truncation=True))
validDataset = validDataset.map(lambda x: tokenizer(x["text1"], x["text2"], max_length=1024, padding="max_length", truncation=True))

  0%|          | 0/4633 [00:00<?, ?ex/s]

  0%|          | 0/173 [00:00<?, ?ex/s]

In [14]:
#only need the input information 
trainDataset = trainDataset.remove_columns(["text1", "text2", "__index_level_0__"])
validDataset = validDataset.remove_columns(["text1", "text2", "__index_level_0__"])

In [15]:
# convert dataset features to PyTorch tensors
formatColumns = groundTruths + ["input_ids", "attention_mask"]
validDataset.set_format(type='torch', columns=formatColumns)
trainDataset.set_format(type='torch', columns=formatColumns)

In [16]:
# initialize the dataloader
trainLoader = torch.utils.data.DataLoader(
    trainDataset, batch_size=BATCH_SIZE, shuffle=True
)
validLoader = torch.utils.data.DataLoader(
    validDataset, batch_size=1, shuffle=True
)

In [17]:
#TESTING
model = LongformerModel.from_pretrained('allenai/longformer-base-4096',output_hidden_states = True, output_attentions=True)

In [18]:
#TESTING
for i, batch in enumerate(validLoader): 
    if i < 3: 
        out = model(batch["input_ids"], attention_mask=batch["attention_mask"])

In [30]:
out["last_hidden_state"][:,1,:].size()

torch.Size([1, 768])

In [39]:
class RDropModel(nn.Module): 
    def __init__(self, device, DROPOUT):
        super(RDropModel,self).__init__()
        #test getting the longformer model going 
        self.model = LongformerModel.from_pretrained('allenai/longformer-base-4096',output_hidden_states = True)
        self.ReLU = nn.ReLU()
        self.GELU = nn.GELU
        self.dropout = nn.Dropout(DROPOUT)
        self.l1 = nn.Linear(768, 512).to(device)
        self.l2 = nn.Linear(512, 250).to(device)
        self.l3 = nn.Linear(250, 1).to(device)
        self.loss_func = torch.nn.MSELoss(reduction="mean")
        
    def mean_pooling(self, token_embeddings, attention_mask): 
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def forward(self, input_ids, attention_mask): 
        #encode sentence and get mean pooled sentence representation
        #when using Lonformer...
        #index 0 gives activations from final layers 
        #index 1 gives pooled output from CLS token 
        #index 2 gives all activations from all 12 layers + 1 layer of input 
        
        #referring here to: 
        #https://huggingface.co/docs/transformers/v4.24.0/en/model_doc/longformer#transformers.LongformerModel.forward
        encoding = self.model(input_ids, attention_mask=attention_mask)[1] 
        #Debugging: print(encoding.squeeze().shape)
        
        
        #squeeze to remove extra dimension. Gives us 500 x 750 
        #first one is cls token 
        #meanPooled = self.mean_pooling(encoding, attention_mask)
        #token_embeddings = torch.stack(hidden_states, dim=0)
        
        
        #NOTE: Since dropout is random we simply send data through twice 
        #to get two predictions that have some noise 
        out = self.l1(encoding)
        out = self.ReLU(out)
       
        out = self.l2(out)
        out = self.ReLU(out)
        out = self.dropout(out)
        
        pred1 = self.l3(out)
        
        #print("pred1 shape")
        #print(pred1.shape)
        
        encoding = self.model(input_ids, attention_mask=attention_mask)[1]  #all token embeddings
        
        #squeeze to remove extra dimension. Gives us 500 x 750
        #first one is cls token 
        #meanPooled = self.mean_pooling(, attention_mask)
        
        #NOTE: Since dropout is random we simply send data through twice 
        #to get two predictions that have some noise 
        out = self.l1(encoding)
        out = self.ReLU(out)
        
        out = self.l2(out)
        out = self.ReLU(out)
        out = self.dropout(out)
        
        pred2 = self.l3(out)
        return pred1, pred2

In [40]:
model = RDropModel(device, .25).to(device)

#TODO: double check on if reduction="mean" is the right move here...
#could cosine similarity also work..? I think that is between the two predicted vectors though.. 
loss_func = torch.nn.MSELoss(reduction="mean")

trainLen = len(trainDataset)

# we would initialize everything first
optim = torch.optim.Adam(model.parameters(), lr=5e-6)

# and setup a warmup for the first ~10% steps
total_steps = int((trainLen*EPOCHS) / BATCH_SIZE)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=warmup_steps, num_training_steps=total_steps - warmup_steps)

In [41]:
"""
Get the loss across multiple different objectives. 
Since overall is most important it gets more weight. 
"""
def getWeightedLoss(predTens, gtTens):
    #try getting rid of Tone and Style 
    LOSS_WEIGHTS = [1]
    loss = 0.0
    for i in range(len(LOSS_WEIGHTS)): 
        
        #get ground truth value associated with this column name 
        currGT = gtTens[:, :, i]
        
        #TODO: figure out how to index properly here 
        pred = predTens[:, :, i]
        
        """
        print("pred")
        print(pred)
        print(pred.shape)
        print("GT")
        print(currGT)
        print(currGT.shape)
        """
        #get loss 
        loss += (loss_func(pred, currGT) * LOSS_WEIGHTS[i])
    return loss 

In [42]:
def validation(): 
    model.eval()
    lossList = []
    pred = []
    GT = []

    i = True 
    for batch in validLoader: 
        
        # prepare batches and more all to the active device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #label = batch['ground_truth'].to(device).unsqueeze(1)

        #send batch info through model 
        pred1, pred2 = model(input_ids, attention_mask)
        pred1 = pred1.unsqueeze(0)
        pred2 = pred2.unsqueeze(0)
        #print(pred1)
        gts = torch.stack([batch[colName] for colName in groundTruths], 0).to(device).T.unsqueeze(0)
        #return gts, pred1
        
        #get wegihted loss relating to label prediction 
        loss1 = getWeightedLoss(gts, pred1)
        loss2 = getWeightedLoss(gts, pred2)
        loss_b = .5*(loss1 + loss2)
        
        #get loss relating to invariance to dropout 
        #NOTE:
        loss_r = getWeightedLoss(pred1, pred2)
        
        #combine losses with alpha hyperparam 
        loss = REG_ALPHA*loss_r + (1-REG_ALPHA)*loss_b
        lossList.append(loss.item())
        
        #careful about dimensions...
        #we will definitely have 3 dimensions here, if they sum to 3 then 
        #that means every dimension is one 
        if sum(pred1.size()) != 3: 
            pred.append([float(item) for item in list(pred1.squeeze())])
            GT.append([float(item) for item in list(gts.squeeze())])
        else: 
            pred.append([float(item) for item in [pred1.squeeze()]])
            GT.append([float(item) for item in [gts.squeeze()]])
            
        if not (len(lossList) == len(pred) == len(pred)):
            print("lens not equal")
    #print(vGT)
    return [lossList, pred, GT]


In [43]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
trainDict = {}
lossList = []
validMetrics = []
trainMetrics = []
subLossList = []
# increase from 1 epoch if need be 
for epoch in range(EPOCHS):
    
    model.train()  # make sure model is in training mode
    
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    loop = tqdm(trainLoader, leave=True)
    
    print("starting validation")
    validMetrics.append(validation())
    #validTester = validation()
    print("finishing validation")
    
   
    model.train()
    
    for i, batch in enumerate(loop): 
        # zero all gradients on each new step
        optim.zero_grad()

        # prepare batches and more all to the active device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch["attention_mask"].to(device)
        
    
        #send batch info through model 
        pred1, pred2 = model(input_ids, attention_mask)
        pred1 = pred1.unsqueeze(0)
        pred2 = pred2.unsqueeze(0)
        
        gts = torch.stack([batch[colName] for colName in groundTruths], 0).T.to(device).unsqueeze(0)
        
        
        #get loss relating to label prediction 
        loss1 = getWeightedLoss(gts, pred1)
        loss2 = getWeightedLoss(gts, pred2)
        loss_b = .5*(loss1 + loss2)
        
        #get loss relating to invariance to dropout 
        loss_r = getWeightedLoss(pred1, pred2)
        
        #combine losses with alpha hyperparam 
        loss = REG_ALPHA*loss_r + (1-REG_ALPHA)*loss_b
        
        # using loss, calculate gradients and then optimize
        loss.backward()
        optim.step()
        
        #get mean loss over last 20 batches 
        if i % 20 == 0 and i > 0: 
            #print(subLossList)
            lossList.append(np.mean(subLossList))
            subLossList = []
        
        subLossList.append(float(loss.item()))
        
        # update learning rate scheduler
        scheduler.step()

        # update the TDQM progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
validMetrics.append(validation())


  0%|          | 0/2317 [00:00<?, ?it/s]

starting validation
finishing validation


OutOfMemoryError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 5; 10.92 GiB total capacity; 9.94 GiB already allocated; 15.38 MiB free; 10.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
lossSmoothing = 20
lossIndex = [(i * lossSmoothing)/len(loop) for i in range(len(lossList))]
plt.plot(lossIndex, lossList)
plt.xlabel("Batch Num")
plt.ylabel("MSE Loss")
plt.title("Train Loss")

In [None]:
#validArr = np.array(validMetrics[4])
#print(validArr.shape)
validArr = np.array(validMetrics)
#validArr = validArr[5,:,:]
#np.corrcoef(test[1], test[2])

outDfList = []
iterList = []
corrList = []
#go through each validation step

for i in range(validArr.shape[0]): 
    print(i)
    subDf = pd.DataFrame(validArr[i].T)
    subDf.columns = ["loss", "pred", "true"]

    predCols = ["pred" + item for item in groundTruths]
    gtCols = ["gt" + item for item in groundTruths]
    
    
    subDf[predCols] = pd.DataFrame(subDf["pred"].tolist(), index=subDf.index)
    subDf[gtCols] = pd.DataFrame(subDf["true"].tolist(), index=subDf.index)
    
    corrScores = []
    for colName in groundTruths: 
        corr = np.corrcoef(subDf["pred" + colName], subDf["gt" + colName])[1, 0]
        corrScores.append(corr)
    corrList.append(corrScores)
    
corrDf = pd.DataFrame(corrList, columns=groundTruths)

#plt.plot(iterList, corrList)
#plt.xlabel("batch num")
#plt.ylabel("pearson correlation")
#plt.title("validation eval")


In [None]:
validArr[1:,:,:].shape

In [None]:
corrDf

In [None]:
sns.lineplot(data=corrDf)
plt.title("Training Over all Objectives")
plt.ylim(0, 1)
plt.xlim(0, EPOCHS)
#Grabbed this line from: https://www.statology.org/seaborn-legend-outside/
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

In [None]:
plt.scatter(subDf["gtOverall"], subDf["predOverall"], alpha = .2)
plt.title("'Overall' predicted vs. ground truth ")
plt.xlabel("ground truth")
plt.ylabel("prediction")

In [None]:
print(x)
print(label)
print()

In [None]:
loss_func(x, label)

In [None]:
plt.plot(trainDf["testLoss"].dropna())

In [None]:
valX

In [None]:
vLabel

In [None]:
x

In [None]:
label