In [1]:
from tqdm.auto import tqdm
import torch 
from transformers import DistilBertModel
from transformers import DistilBertTokenizer
from transformers import PreTrainedTokenizer
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses
from datasets import Dataset
import pandas as pd
from transformers.optimization import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt 
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from torch.nn import CosineEmbeddingLoss
import sklearn
import spacy
from torch import nn
#Build up to SBERT model 

In [2]:
# Index of the GPU this notebook targets; only used when CUDA is available.
deviceNum = 6

# Run on the selected GPU when CUDA is present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:" + str(deviceNum))
else:
    device = torch.device("cpu")

def check_mem():
    """Print CUDA memory statistics (in GB) for GPU index ``deviceNum``.

    The caching allocator is emptied first, then the allocated, reserved,
    reserved-minus-allocated and peak-reserved figures are printed.
    """
    torch.cuda.empty_cache()

    bytes_per_gb = 1024 ** 3
    allocated = torch.cuda.memory_allocated(deviceNum) / bytes_per_gb
    reserved = torch.cuda.memory_reserved(deviceNum) / bytes_per_gb

    print("torch.cuda.memory_allocated: %fGB" % allocated)
    print("torch.cuda.memory_reserved: %fGB" % reserved)
    # "free" here means reserved by the allocator but not currently allocated.
    print("torch.cuda.memory_free: %fGB" % (reserved - allocated))

    peak_reserved = torch.cuda.max_memory_reserved(deviceNum) / bytes_per_gb
    print("torch.cuda.max_memory_reserved: %fGB" % peak_reserved)
# Baseline GPU memory report before any model or data is loaded.
check_mem()


# Seed PyTorch's RNG (does not seed numpy / sklearn).
torch.manual_seed(0)

df = pd.read_csv("/shared/3/projects/benlitterer/localNews/NetworkMVP/enTrainData.csv", sep="\t")
# Keep only pairs where both articles are English. The .copy() makes the
# filtered frame independent so the column assignment below does not write
# into a slice of the original frame (chained-assignment warning).
df = df.loc[(df["url1_lang"] == "en") & (df["url2_lang"] == "en")].copy()

# expose the "Overall" score under the name used for the regression target
df["ground_truth"] = df['Overall']

#get only the columns we need
#TODO: do we need "pair_id"?
leanDf = df[["ground_truth",  'text1', 'text2']].dropna().copy()

# Rescale and invert the target: a raw score of 1 maps to 1.0 and a raw
# score of 4 maps to 0.0 (assumes "Overall" lies in [1, 4] -- TODO confirm).
leanDf["ground_truth"] = 1 - ((leanDf["ground_truth"] - 1) / 3)

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.memory_free: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [3]:
# Inspect the rescaled target column.
leanDf["ground_truth"]

0       0.000000
1       0.111111
2       0.555556
3       0.666667
4       0.916667
          ...   
2871    0.333333
2872    0.000000
2873    1.000000
2874    1.000000
2875    1.000000
Name: ground_truth, Length: 1676, dtype: float64

In [4]:
class CustomSentListDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that serves the rows of a DataFrame as lists.

    The base class is ``torch.utils.data.Dataset`` on purpose: the bare name
    ``Dataset`` in this notebook refers to HuggingFace ``datasets.Dataset``,
    an Arrow-backed class whose constructor is never called here and whose
    inherited methods therefore cannot work on instances of this class.

    Args:
        inDf: DataFrame whose rows are the dataset items.
    """

    def __init__(self, inDf):
        self.inDf = inDf

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.inDf)

    def __getitem__(self, idx):
        """Return row ``idx`` (positional) as a list of its column values, in column order."""
        return list(self.inDf.iloc[idx])

In [None]:
# Alternative kept for reference: load en_core_web_sm with everything disabled
# except the "senter" component.
#nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"])
#nlp.enable_pipe("senter")
# Blank English pipeline with only the "sentencizer" pipe added; it is used
# below purely to split documents into sentences.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

def extractSents(inList, n_process=8):
    """Split every document in ``inList`` into a list of sentence strings.

    Documents are streamed through the module-level spaCy pipeline ``nlp``.
    Sentences for which ``len(sent) <= 3`` (length as measured by spaCy on
    the sentence span) are dropped.

    Args:
        inList: iterable of raw document strings (list, pandas Series, ...).
        n_process: number of worker processes handed to ``nlp.pipe``.

    Returns:
        A list with one entry per input document, each entry being the list
        of retained sentences as ``str``.
    """
    # Give tqdm a total when the input is sized, so the bar shows real
    # progress instead of an open-ended "0it" counter.
    total = len(inList) if hasattr(inList, "__len__") else None
    docs = nlp.pipe(inList, n_process=n_process)
    return [
        [str(sent) for sent in doc.sents if len(sent) > 3]
        for doc in tqdm(docs, total=total)
    ]

#split data
# `import sklearn` alone does not guarantee the submodule is loaded.
import sklearn.model_selection

# Fixed random_state: torch.manual_seed does not seed sklearn, so without it
# the train/validation membership changes on every run.
trainDf, validDf = sklearn.model_selection.train_test_split(
    leanDf, train_size=.9, test_size=.1, random_state=0
)
trainDf = trainDf.reset_index(drop=True)
validDf = validDf.reset_index(drop=True)

# Replace each raw document string with its list of sentences.
trainDf["text1"] = extractSents(trainDf["text1"])
trainDf["text2"] = extractSents(trainDf["text2"])
validDf["text1"] = extractSents(validDf["text1"])
validDf["text2"] = extractSents(validDf["text2"])



0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Peek at the sentence-split text1 column.
trainDf["text1"]

In [None]:
# Maximum number of sentences kept per document.
sentCutoff = 50

# Keep only the first `sentCutoff` sentences of every document, in both
# splits and for both sides of each pair.
for frame in (trainDf, validDf):
    for col in ("text1", "text2"):
        frame[col] = frame[col].apply(lambda sents: sents[:sentCutoff])


In [None]:
# Histogram of the number of sentences per text1 entry in the training split.
plt.hist(trainDf["text1"].apply(len))

In [None]:
# Tokenizer for the 'sentence-transformers/all-mpnet-base-v2' checkpoint; used by getLoader below.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

def getLoader(inDf, batchSize, maxLength=70, shuffle=True):
    """Tokenize both text columns of ``inDf`` and wrap the result in a DataLoader.

    Each entry of ``inDf["text1"]`` / ``inDf["text2"]`` is expected to be a
    list of sentence strings. Every sentence is tokenized with the
    module-level ``tokenizer``, padded / truncated to ``maxLength`` tokens.

    Side effect: the columns ``text{1,2}_input_ids`` and
    ``text{1,2}_attention_mask`` are added to ``inDf``. The ``text1`` and
    ``text2`` columns themselves are left untouched, so calling this
    function again on the same frame works.

    Args:
        inDf: DataFrame with ``ground_truth``, ``text1`` and ``text2`` columns.
        batchSize: batch size handed to the DataLoader.
        maxLength: token length every sentence is padded / truncated to.
        shuffle: whether the DataLoader reshuffles the rows every epoch.

    Returns:
        ``[dataset, loader]``. Each dataset item is the list
        ``[ground_truth, text1_input_ids, text1_attention_mask,
        text2_input_ids, text2_attention_mask]``.
    """
    def encode(sentences):
        # NOTE(review): an empty sentence list is passed straight to the
        # tokenizer; confirm upstream filtering never produces one.
        return tokenizer.batch_encode_plus(sentences, max_length=maxLength, padding="max_length", truncation=True, return_tensors="pt")

    for col in ("text1", "text2"):
        encoded = inDf[col].apply(encode)
        inDf[col + "_input_ids"] = encoded.apply(lambda enc: enc["input_ids"])
        inDf[col + "_attention_mask"] = encoded.apply(lambda enc: enc["attention_mask"])

    dataset = CustomSentListDataset(inDf[["ground_truth", "text1_input_ids", "text1_attention_mask","text2_input_ids", "text2_attention_mask"]])

    loader = torch.utils.data.DataLoader(dataset, batch_size=batchSize, shuffle=shuffle)
    return [dataset, loader]

# Batch size 1: every batch is a single document pair. getLoader also adds the
# tokenized columns to trainDf / validDf, which later cells read.
trainData, trainLoader = getLoader(trainDf, 1)
validData, validLoader = getLoader(validDf, 1)


In [None]:
# Number of sentences per document on each side of the pair
# (len() of each stored input_ids entry).
len1 = [len(item) for item in trainDf["text1_input_ids"]]
len2 = [len(item) for item in trainDf["text2_input_ids"]]

allLens = len1 + len2
plt.hist(allLens, bins=100)
plt.xlim([0, 200])
print(np.median(allLens))
print(np.mean(allLens))

#look at the length of sentences multiplied together
# Outer product gives len1[i] * len2[j] for every (i, j) pair; flattened
# row-major, exactly as the explicit double loop would fill it.
multLen = np.outer(len1, len2).astype(float).flatten()

#90th quantile is 1148
# Printed explicitly: a bare expression that is not the last line of a cell
# is computed but never displayed.
print(np.quantile(multLen, .9))
np.quantile(multLen, .8)
#so go with 800?
#so go with 800? 

In [None]:

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked tokens.

    Args:
        token_embeddings: tensor of per-token embeddings; dim 1 is reduced.
        attention_mask: mask with one fewer dimension than
            ``token_embeddings``; zeros mark positions to ignore.

    Returns:
        ``token_embeddings`` with dim 1 averaged out. Rows whose mask is all
        zero yield zeros (the divisor is clamped to 1e-9).
    """
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_count

In [None]:
class MyModel(nn.Module):
    """Regress a similarity score for two documents from sentence-level cosine similarities.

    Each document arrives as a batch of tokenized sentences. Every sentence
    is embedded with the shared encoder and mean-pooled; the pairwise cosine
    similarities between the two documents' sentences are flattened,
    truncated / zero-padded to ``flatCutoff`` values and passed through a
    four-layer MLP that outputs a single score.

    The whole computation stays in torch, so gradients flow from the loss
    back into the encoder. Building the similarity matrix in a NumPy array
    would cut the autograd graph at that point.

    Args:
        flatCutoff: number of flattened sentence-pair similarities fed to the MLP.
    """

    def __init__(self, flatCutoff):
        super(MyModel,self).__init__()
        self.model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
        # dim=-1 is identical to dim=0 for two 1-D vectors and also supports
        # the broadcasted all-pairs call made in forward().
        self.cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)
        self.flatCutoff = flatCutoff
        self.ReLU = nn.ReLU()
        # Layers are created on the default device; move the whole module
        # with ``.to(device)`` after construction.
        self.l1 = nn.Linear(flatCutoff, 500)
        self.l2 = nn.Linear(500, 500)
        self.l3 = nn.Linear(500, 250)
        self.l4 = nn.Linear(250, 1)
        self.loss_func = torch.nn.MSELoss(reduction="mean")

    def mean_pooling(self, token_embeddings, attention_mask):
        """Average token embeddings along dim 1, counting only unmasked tokens."""
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def forward(self, inputs_ids_a, attention_a, inputs_ids_b, attention_b, gt):
        """Score one document pair and compute the MSE loss against ``gt``.

        Args:
            inputs_ids_a, attention_a: token ids / attention mask of document
                A, one row per sentence.
            inputs_ids_b, attention_b: same for document B.
            gt: ground-truth score tensor, compared against the prediction.

        Returns:
            ``[loss, pred, gt]`` where ``pred`` has shape ``(1,)``.
        """
        # Element 0 of the encoder output holds the per-token embeddings.
        u = self.model(inputs_ids_a, attention_mask=attention_a)[0]
        sents_u = self.mean_pooling(u, attention_a)

        v = self.model(inputs_ids_b, attention_mask=attention_b)[0]
        sents_v = self.mean_pooling(v, attention_b)

        # All-pairs cosine similarity via broadcasting:
        # (n_a, 1, d) vs (1, n_b, d) -> (n_a, n_b), flattened row-major.
        sims = self.cos(sents_u.unsqueeze(1), sents_v.unsqueeze(0)).flatten()

        # Keep at most flatCutoff similarities and zero-pad on the right so
        # the MLP always receives a fixed-size input.
        sims_cut = sims[:self.flatCutoff]
        sims_padded = torch.nn.functional.pad(sims_cut, [0, self.flatCutoff - sims_cut.shape[0]])

        # MLP head: flatCutoff > 500 > 500 > 250 > 1
        out = self.ReLU(self.l1(sims_padded))
        out = self.ReLU(self.l2(out))
        out = self.ReLU(self.l3(out))
        pred = self.l4(out)

        loss = self.loss_func(pred, gt)

        return [loss, pred, gt]


In [None]:
# 500 = number of flattened sentence-pair similarities fed to the MLP head.
model = MyModel(500).to(device)

trainBatch = 1

# we would initialize everything first
optim = torch.optim.Adam(model.parameters(), lr=2e-5)

EPOCHS=4

# and setup a warmup for the first ~10% steps
total_steps = int(len(trainData)*EPOCHS / trainBatch)
warmup_steps = int(0.1 * total_steps)
# num_training_steps is the TOTAL number of scheduler steps, warmup included.
# Passing total_steps - warmup_steps would make the learning rate reach zero
# early and stay there for the final ~10% of training.
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [None]:
def validation():
    """Run the module-level ``model`` over ``validLoader`` and collect metrics.

    The model is switched to eval mode and is left in eval mode on return;
    callers that continue training must call ``model.train()`` again.
    Gradient tracking is disabled, so no autograd graph is built or kept
    during evaluation.

    Returns:
        ``[lossList, preds, gts]``: three equally long lists of floats with
        one entry per validation batch.
    """
    model.eval()
    lossList = []
    preds = []
    gts = []

    with torch.no_grad():
        for vBatch in validLoader:
            # prepare batches and move all to the active device;
            # [0] strips the DataLoader batch dimension (batch size 1)
            inputs_ids_a = vBatch[1][0].to(device)
            attention_a = vBatch[2][0].to(device)
            inputs_ids_b = vBatch[3][0].to(device)
            attention_b = vBatch[4][0].to(device)
            gt = vBatch[0].float().to(device)

            #get outputs from model
            loss, pred, gt = model(inputs_ids_a, attention_a, inputs_ids_b, attention_b, gt)

            #get predictions and ground truth to compute validation metrics with
            lossList.append(float(loss))
            preds.append(float(pred))
            gts.append(float(gt))

    return [lossList, preds, gts]
        

In [None]:
# Training loss is logged as the median over windows of this many steps.
lossSmoothing = 20
# NOTE: currently unused -- validation runs once per epoch, not every N steps.
validationFreq = 200

#TODO: implement a cutoff on how many sentences of the article we can consider
#TODO: put model in seperate class? How to make sure the params are updating
lossList = []
subLossList = []
validMetrics = []
# increase from 1 epoch if need be
for epoch in range(EPOCHS):

    # Evaluate before training this epoch (the first round measures the
    # model before any update).
    validMetrics.append(validation())
    model.train()  # validation() leaves the model in eval mode

    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    loop = tqdm(trainLoader, leave=True)

    for i, batch in enumerate(loop):
        # zero all gradients on each new step
        optim.zero_grad()

        #in practice this doesn't seem to usually actually help but worth adding?
        torch.cuda.empty_cache()

        #get model inputs from batch ([0] strips the batch dimension, batch size 1)
        inputs_ids_a = batch[1][0].to(device)
        attention_a = batch[2][0].to(device)
        inputs_ids_b = batch[3][0].to(device)
        attention_b = batch[4][0].to(device)
        gt = batch[0].float().to(device)

        #get outputs from model
        loss, pred, gt = model(inputs_ids_a, attention_a, inputs_ids_b, attention_b, gt)

        subLossList.append(loss.item())
        if i % lossSmoothing == 0:
            lossList.append(np.median(subLossList))
            subLossList = []

        #update weights
        loss.backward()
        optim.step()
        scheduler.step()

# Evaluate once more after the last epoch; otherwise the fully trained model
# is never measured.
validMetrics.append(validation())

In [None]:
# Quick look at the smoothed training loss (one point per lossSmoothing-step window).
plt.plot(lossList)


In [None]:
# Each logged point covers lossSmoothing steps; dividing by the number of
# training examples expresses the x position in epochs, not batches.
lossIndex = [(i * lossSmoothing)/len(trainData) for i in range(len(lossList))]
plt.plot(lossIndex, lossList)
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.title("Train Loss")

In [None]:
# Shape check: one entry per validation round, each holding the three lists
# returned by validation() (loss, pred, true).
np.array(validMetrics).shape

In [None]:
# NOTE: this rebinds validDf, which earlier held the validation split, to a
# long-format frame of validation metrics (one row per example per round).
validArr = np.array(validMetrics)

iterList = []
corrList = []
roundFrames = []
for iterNum in range(validArr.shape[0]):
    # validArr[iterNum] has rows (loss, pred, true); transpose to one row per example
    subDf = pd.DataFrame(validArr[iterNum].T, columns=["loss", "pred", "true"])
    subDf["iter"] = iterNum
    iterList.append(iterNum)
    roundFrames.append(subDf)
    # Pearson correlation between predictions and ground truth for this round
    corr = np.corrcoef(subDf["pred"], subDf["true"])
    corrList.append(corr[0, 1])

# Concatenate once instead of growing the frame inside the loop.
if roundFrames:
    validDf = pd.concat(roundFrames)
else:
    validDf = pd.DataFrame({"loss":[], "pred":[], "true":[], "iter":[]})

In [None]:
# iterList indexes validation rounds, not training batches.
plt.plot(iterList, corrList)
plt.xlabel("validation round")
plt.ylabel("pearson correlation")
plt.title("validation eval")

In [None]:

metricCols = ["loss", "pred", "true"]

#go through each validation step and print its pred/true correlation matrix
for i in range(validArr.shape[0]):
    subDf = pd.DataFrame(validArr[i].T)
    subDf.columns = metricCols
    corr = np.corrcoef(subDf["pred"], subDf["true"])
    print(corr)

# Scatter for the most recent validation round, selected explicitly rather
# than relying on whatever the loop variable was left holding.
lastDf = pd.DataFrame(validArr[-1].T, columns=metricCols)
plt.scatter(lastDf["true"], lastDf["pred"], alpha=.2)
plt.title("predicted vs. ground truth ")
plt.xlabel("ground truth")
plt.ylabel("prediction")

In [None]:
# Raw metrics of the first validation round (rows: loss, pred, true).
validArr[0]

In [None]:
# NOTE(review): "iter" is filled with consecutive round indices starting at 0
# when validDf is built, so 1400 likely matches no rows -- confirm the intended round.
validDf.loc[validDf["iter"] == 1400, "true"]

In [None]:
#check memory
if torch.cuda.is_available():
    # Query the GPU the notebook actually runs on, not the default device
    # or a hard-coded index.
    t = torch.cuda.mem_get_info(device)  # (free bytes, total bytes)

    # bytes currently allocated by this process's tensors
    used = torch.cuda.memory_allocated(device=device)

    #proportion of total memory allocated by this process
    print("used: " + str(used / t[1]))
    #proportion of free memory on the device
    print("free: " + str(t[0]/t[1]))
else:
    print("CUDA not available; no GPU memory to report")