In [1]:
from tqdm.auto import tqdm
import torch 
import transformers
from transformers import PreTrainedTokenizer
from transformers import RobertaTokenizer, PreTrainedTokenizer, DistilBertTokenizer, DistilBertModel, RobertaModel
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util
from datasets import Dataset
import pandas as pd
from transformers.optimization import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt 
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from torch.nn import CosineEmbeddingLoss
import random
import os
#Build up to SBERT model 

In [2]:
deviceNum = 4
device = torch.device("cuda:" + str(deviceNum) if torch.cuda.is_available() else "cpu")

GRAD_ACC = 6
EPOCHS = 2
FOLDS = 5
SEED = 85
BATCH_SIZE = 5

#set seeds 
torch.manual_seed(85)
random.seed(85)



In [3]:
#df = pd.read_csv("/shared/3/projects/benlitterer/localNews/NetworkMVP/translatedCleaned.tsv", sep="\t")
df = pd.read_csv("/home/blitt/projects/localNews/data/processed/translated_300_84.tsv", sep="\t")

#put ground truth values into a list 
df["ground_truth"] = df['Overall']

#get only the columns we need 
#TODO: do we need "pair_id"? 
#leanDf = df[["ground_truth",  'text1', 'text2', 'title1', 'title2', 'url1_lang', 'url2_lang']].dropna()
#for when using merged text
leanDf = df[["ground_truth",  'text1Merged', 'text2Merged', 'url1_lang', 'url2_lang']].dropna()

#rescale data from (0, 4): (0, 1)
leanDf["ground_truth"] = 1 - ((leanDf["ground_truth"] - 1) / 3)

#reset index so it is contiguous set of numbers 
leanDf = leanDf.reset_index(drop=True)



In [5]:
def trainBi(epoch, trainExamples, validDf): 
    train_dataloader = torch.utils.data.DataLoader(trainExamples, shuffle=True, batch_size=BATCH_SIZE)
    model = SentenceTransformer('all-mpnet-base-v2', device="cuda:" + str(deviceNum))
    train_loss = losses.CosineSimilarityLoss(model)
    
    output_path = "/home/blitt/projects/localNews/models/sentEmbeddings/2.0-biCrossModel/1.1-bestModelBi"
    checkpoint_path = "/home/blitt/projects/localNews/models/sentEmbeddings/2.0-biCrossModel/1.1-bestModelBi/checkpoints_" + str(epoch)
    
    #save model to checkpoint 
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=2, warmup_steps=75, output_path=output_path, checkpoint_path=checkpoint_path, save_best_model=True, checkpoint_save_total_limit=1)
    #model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=2, warmup_steps=75)
    
    #VALIDATION 
    #now get model and evaluate its predictions on validDataset
    modelDir = output_path + "/checkpoints_" + str(epoch)
    checkpointName = os.listdir(modelDir)[0]
    modelPath = modelDir + "/" + checkpointName
    model = SentenceTransformer(modelPath)

    
    text1 = list(validDf["text1Merged"])
    text2 = list(validDf["text2Merged"])
    
    embed1 = model.encode(text1)
    embed2 = model.encode(text2)

    #get our cosine sims 
    cos_scores = [] 
    for i in range(len(embed1)): 
        cos_scores.append(float(util.cos_sim(embed1[i], embed2[i])[0][0]))
    #ran this sanity check and it appears to check out...
    #print(cos_scores)
    #print(list(validDf["ground_truth"]))
    
    corr = np.corrcoef(list(validDf["ground_truth"]), cos_scores)[1, 0]
    print(corr)
    return corr 
    

In [6]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=FOLDS, shuffle=True)

In [7]:
metrics = []
transformers.logging.set_verbosity_error()
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

#NOTE: THIS LINE IS ONLY FOR QUICK TRAINING CHECK 
#leanDf = leanDf.iloc[:300, :]

#we only want to sample validation data from the pairs that are both english 
enDf = leanDf[(leanDf["url1_lang"] == "en") & (leanDf["url2_lang"] == "en")]

print("Total df len: " +  str(len(leanDf)))
print("English df len: " +  str(len(enDf)))

biCorrs = []
#we create splits based on the position (not the actual index) of rows in enDf
#the idea is to get a split of the english dataset to set aside and then 
#grab everything else in the en + translated dataset to train on 
for i, (train_index, valid_index) in enumerate(kf.split(enDf)): 
    
    #grab the rows in enDf corresponding to the positions of our split 
    validDf = enDf.iloc[valid_index]
    
    #now get the actual indicies that have been selected
    #and subtract the indices in trainDf away from those 
    remainingIndices = list(set(leanDf.index) - set(validDf.index))
    trainDf = leanDf.loc[remainingIndices]
    print("###### " + str(i).upper() + " ######")
    print("Train df len: " + str(len(trainDf)))
    print("Valid df len: " + str(len(validDf)))
    
    #TRAIN BI ENCODER 
    trainExamples = []
    for row in trainDf.itertuples(): 
        trainExamples.append(InputExample(texts =[row[2], row[3]], label=row[1]))
    validExamples = []
    for row in validDf.itertuples(): 
        validExamples.append(InputExample(texts =[row[2], row[3]], label=row[1]))
        
    biCorrs.append(trainBi(i, trainExamples, validDf))
    
    """
    #TRAIN CROSS ENCODER
    #get data loaded in properly 
    trainDataset = Dataset.from_pandas(trainDf)
    validDataset = Dataset.from_pandas(validDf)
    
    #NOTE: here we use the merged text
    trainDataset = trainDataset.map(lambda x: tokenizer(x["text1Merged"], x["text2Merged"], max_length=512, padding="max_length", truncation=True))
    validDataset = validDataset.map(lambda x: tokenizer(x["text1Merged"], x["text2Merged"], max_length=512, padding="max_length", truncation=True))
    

    #only need the input information 
    trainDataset = trainDataset.remove_columns(["text1Merged", "text2Merged", "__index_level_0__"])
    validDataset = validDataset.remove_columns(["text1Merged", "text2Merged", "__index_level_0__"])

    # convert dataset features to PyTorch tensors
    validDataset.set_format(type='torch', columns=["ground_truth", "input_ids", "attention_mask"])
    trainDataset.set_format(type='torch', columns=["ground_truth", "input_ids", "attention_mask"])
    
    
    validMetrics = train(trainDataset, validDataset)
    metrics.append(validMetrics)
    
    del trainDataset
    del validDataset
    
    """
    

Total df len: 4806
English df len: 1738
###### 0 ######
Train df len: 4458
Valid df len: 348


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

0.8928811638264366
###### 1 ######
Train df len: 4458
Valid df len: 348


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

0.8800843362632257
###### 2 ######
Train df len: 4458
Valid df len: 348


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

0.8992946988138368
###### 3 ######
Train df len: 4459
Valid df len: 347


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

0.9068255520949272
###### 4 ######
Train df len: 4459
Valid df len: 347


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

Iteration:   0%|          | 0/892 [00:00<?, ?it/s]

0.8674153396260754


In [8]:
#send corrList to pickled object for later analysis 
import pickle
out_path = "/home/blitt/projects/localNews/models/sentEmbeddings/2.0-biCrossModel/1.0-justCross/crossCorrList.pckl"
f = open(out_path, "wb")

pickle.dump(biCorrs, f)

FileNotFoundError: [Errno 2] No such file or directory: '/home/blitt/projects/localNews/models/sentEmbeddings/2.0-biCrossModel/1.0-justCross/crossCorrList.pckl'

In [9]:
np.mean(biCorrs)

0.8893002181249002

In [None]:
import os
modelPath = output_path 
os.listdir(modelPath + "/checkpoints")[0]

In [None]:

output_path = "/home/blitt/projects/localNews/models/sentEmbeddings/2.0-biCrossModel/justBiExperiments"
checkpoint_path = "/home/blitt/projects/localNews/models/sentEmbeddings/2.0-biCrossModel/justBiExperiments/checkpoints"

modelPath = output_path + "/checkpoints/500"
model = SentenceTransformer(modelPath)

In [None]:
list(validDf["text1Merged"])