## Bi Encoder Preprocessing
- Grab sentence and article embeddings using mpnet 

In [1]:
import pandas as pd
import spacy 
from tqdm.auto import tqdm 
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from datasets import Dataset

In [2]:
deviceNum = 6
device = torch.device("cuda:" + str(deviceNum) if torch.cuda.is_available() else "cpu")


In [3]:
df = pd.read_csv("/shared/3/projects/benlitterer/localNews/NetworkMVP/translatedCleaned.tsv", sep="\t")
df = df.head(20)

In [4]:
df.columns

Index(['Unnamed: 0', 'url1_lang', 'url2_lang', 'pair_id', 'link1', 'link2',
       'ia_link1', 'ia_link2', 'Geography', 'Entities', 'Time', 'Narrative',
       'Overall', 'Style', 'Tone', 'id1', 'id2', 'ogText1', 'ogTitle1',
       'ogText2', 'ogTitle2', 'text1', 'title1', 'text2', 'title2'],
      dtype='object')

In [5]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')


In [6]:
#nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"])
#nlp.enable_pipe("senter")
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

def extractSents(inList): 
    separator = nlp.pipe(inList, n_process=8)
    textList = []
    for text in tqdm(separator): 
        sentList = [str(sent) for sent in text.sents if len(sent) > 3]
        textList.append(sentList)
    return textList

df["sentences1"] = extractSents(df["text1"])
df["sentences2"] = extractSents(df["text2"])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [7]:
class CustomSentListDataset(Dataset): 
    def __init__(self, inDf): 
        self.inDf = inDf 
        
    def __len__(self): 
        return len(self.inDf)
    
    def __getitem__(self, idx):
        return list(self.inDf.iloc[idx])

In [8]:
df1 = df[["title1", "sentences1", "url1_lang"]]
df1 = df1.explode("sentences1").reset_index().rename(columns={"index":"article_id"})

df2 = df[["title2", "sentences2", "url2_lang"]]
df2 = df2.explode("sentences2").reset_index().rename(columns={"index":"article_id"})

In [9]:
df1Tokenized = tokenizer.batch_encode_plus(list(df1["sentences1"]), max_length=300, padding="max_length", truncation=True, return_tensors="pt")
df2Tokenized = tokenizer.batch_encode_plus(list(df2["sentences2"]), max_length=300, padding="max_length", truncation=True, return_tensors="pt")

In [10]:
df1["input_ids"] = list(df1Tokenized["input_ids"])
df1["attention_mask"] = list(df1Tokenized["attention_mask"])

df2["input_ids"] = list(df2Tokenized["input_ids"])
df2["attention_mask"] = list(df2Tokenized["attention_mask"])


In [11]:
dataset1 = CustomSentListDataset(df1[["input_ids", "attention_mask"]])
loader1 = torch.utils.data.DataLoader(dataset1, batch_size=5, shuffle=False)

dataset2 = CustomSentListDataset(df1[["input_ids", "attention_mask"]])
loader2 = torch.utils.data.DataLoader(dataset2, batch_size=5, shuffle=False)


In [12]:
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2').to(device)

NVIDIA RTX A5000 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA RTX A5000 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [23]:
encodings1 = []
for batch in tqdm(loader1): 
    torch.cuda.empty_cache()
    inputs_ids = batch[0].to(device)
    attention = batch[1].to(device)
    encoded = model(inputs_ids, attention)
    encodings1.append(encoded[0].detach().to("cpu"))
    del encoded
    del inputs_ids
    del attention
    

encodings2 = []
for batch in tqdm(loader1): 
    torch.cuda.empty_cache()
    inputs_ids = batch[0].to(device)
    attention = batch[1].to(device)
    encoded = model(inputs_ids, attention)
    encodings2.append(encoded[0].detach().to("cpu"))
    del encoded
    del inputs_ids
    del attention


  0%|          | 0/62 [00:00<?, ?it/s]

[tensor([[    0, 19957,  4649,  ...,     1,     1,     1],
        [    0,  1041,  6882,  ...,     1,     1,     1],
        [    0,  2000,  4930,  ...,     1,     1,     1],
        [    0, 13330, 10657,  ...,     1,     1,     1],
        [    0, 16159,  2060,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])]


  0%|          | 0/62 [00:00<?, ?it/s]

In [37]:
#torch.flatten(encodings1[0])
#try to get all embeddings in a list one after another, then we add that to dataframe as a column, and do 
#a groupby on the article id 
outlist = []
for item in encodings1: 
    outlist.append(np.array(item.cpu()))
outlist = np.array(outlist)
outlist

  outlist = np.array(outlist)


array([array([[[-0.0977255 , -0.00337414,  0.07573681, ...,  0.01839743,
                -0.02405375, -0.14556141],
               [-0.11246805,  0.01075973,  0.07276851, ...,  0.02959378,
                -0.07353707, -0.1471804 ],
               [-0.06045718,  0.28489646,  0.04751272, ..., -0.02900803,
                 0.01768202, -0.20163122],
               ...,
               [-0.01612242,  0.1881083 ,  0.06369126, ...,  0.03368504,
                -0.03716145, -0.09546674],
               [-0.01612242,  0.1881083 ,  0.06369126, ...,  0.03368504,
                -0.03716145, -0.09546674],
               [-0.01612242,  0.1881083 ,  0.06369126, ...,  0.03368504,
                -0.03716145, -0.09546674]],

              [[-0.02051227,  0.02639319, -0.01802666, ..., -0.08595241,
                 0.08057595, -0.00860115],
               [-0.04071065, -0.05566223,  0.00116065, ..., -0.16155025,
                 0.03689248, -0.04614903],
               [ 0.06684776, -0.19714452, -0.04160

In [None]:
#NOTE: reference this later 
testEncoding = model(moddedDf.loc[1, "text1_input_ids"], max_length = 200, attention_mask=moddedDf.loc[1, "text1_attention_mask"])[0]

In [None]:
for i, row in tqdm(moddedDf.iterrows()): 
    sentEncodings1 = model(row["text1_input_ids"], max_length = 200, attention_mask=row["text1_attention_mask"])[0]

In [None]:
def embedSents(): 
    encodedList = [model() for item in inList]
    return encodedList

In [None]:
moddedDf.head(3)
leanDf = moddedDf

In [None]:
#TODO: pool all of the encodings to get one per sentence. Put this into an extra column in df
#TODO: do the same for the title and the first part of each article 

In [None]:
testEncoding.shape

In [None]:
def encode(inList): 
    return []

In [None]:
extracted1[0]