In [179]:
import pandas as pd
import numpy as np
import torch
from transformers import ElectraModel, ElectraTokenizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
database = pd.read_csv('./Sum_Database/summary_gpu.csv').loc[:,'kobart_sum']
news = list(pd.read_csv('./Sum_Database/news.txt'))[0]

In [9]:
corpus=[x for x in database]

In [10]:
corpus.append(news)

In [47]:
len(corpus)

9728

In [61]:
model = ElectraModel.from_pretrained("monologg/koelectra-small-v3-discriminator")
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
        

In [92]:
tokens = {'input_ids': [], 'attention_mask': []}

In [95]:
for sentence in corpus[:3]:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    print(type(new_tokens['input_ids'][0]))
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
#tokens['input_ids'] = torch.stack(tokens['input_ids'])
#tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [198]:
tokens = {'input_ids': [], 'attention_mask': []}

In [199]:
MAX_LENGTH = 256
for sentence in corpus:
    # encode each sentence and append to dictionary
    ts = tokenizer.tokenize(sentence)
    input_ids = tokenizer.convert_tokens_to_ids(ts)

    if len(input_ids) >= MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = [1] * MAX_LENGTH
    else:
        n_to_pad = MAX_LENGTH - len(input_ids)
        attention_mask = ([1] * len(input_ids)) + ([0]* n_to_pad)
        input_ids = input_ids + ([0] * n_to_pad)
 
    input_ids= torch.as_tensor(input_ids)
    attention_mask = torch.as_tensor(attention_mask)

    tokens['input_ids'].append(input_ids)
    tokens['attention_mask'].append(attention_mask)

In [200]:
print(len(tokens['attention_mask'][1]))

512


In [201]:
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [202]:
tokens['input_ids']

tensor([[ 2128,  4031,  4472,  ...,     0,     0,     0],
        [ 7353,  4031,  4302,  ...,     0,     0,     0],
        [ 7353,  4031,  4302,  ...,     0,     0,     0],
        ...,
        [ 6325,  4275,  4225,  ...,     0,     0,     0],
        [11634,  4110,  2254,  ...,     0,     0,     0],
        [   11,  7943, 11171,  ...,     0,     0,     0]])

In [203]:
tokens['attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [204]:
outputs = model(**tokens)

In [None]:
embeddings = outputs[0]

In [None]:
attention_mask = tokens['attention_mask']
attention_mask.shape

In [None]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

In [None]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

In [None]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

In [None]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

In [None]:
mean_pooled = summed / summed_mask

In [None]:
mean_pooled

In [None]:

# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
cosine_similarity(
    [mean_pooled[-1]],
    mean_pooled[:-1]
)