In [12]:
import pandas as pd
import json
from pathlib import Path

In [13]:
DATA = Path.cwd().parent / 'data'
SUMM_FOLDER = DATA / 'summaries_finetune'
TEXT_FILES = SUMM_FOLDER / 'text_files_copy'
SOURCE_TEXTS = SUMM_FOLDER / 'source_texts_clean'

In [14]:
data = open(DATA / 'source_dict.txt', 'r')
source_dict = json.loads(data.read())
source_texts = list(source_dict.values())

In [77]:
summaries_df = pd.read_csv(SUMM_FOLDER / 'final_summaries_ai_aloe_fixed.csv')[['text','source', 'paraphrase_pca', 'content_pca', 'source_text_filename_clean']]

## Getting final hidden embeddings for each summary

In [68]:
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
from transformers import LongformerTokenizer

import torch

seed = 42
model_name = "allenai/longformer-base-4096" #"google/bigbird-roberta-base"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModel.from_pretrained(model_name, output_hidden_states=True).to(DEVICE)
tokenizer = LongformerTokenizer.from_pretrained(model_name, use_fast=True)

def getLastState(text):
    tokenized_text = tokenizer(text, return_tensors='pt').to(DEVICE)
    outputs = model(**tokenized_text)
    return outputs.last_hidden_state[0][0]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
summaries_df['source_embedding'] = summaries_df['source'].apply(lambda x: getLastState(x))

### Normalize Dependent Variables

In [78]:
from sklearn.preprocessing import StandardScaler
import numpy as np
  
# copy the data
  
# apply normalization techniques
summaries_df['content_pca'] = StandardScaler().fit_transform(np.array(summaries_df['content_pca']).reshape(-1,1))
summaries_df['paraphrase_pca'] = StandardScaler().fit_transform(np.array(summaries_df['paraphrase_pca']).reshape(-1,1)) 

### Remove Test Set

In [79]:
source_texts = summaries_df['source_text_filename_clean'].value_counts().to_frame().reset_index()
texts_to_remove = list(source_texts.iloc[15:31]['index'])

In [81]:
test_df = summaries_df[summaries_df['source_text_filename_clean'].isin(texts_to_remove)]
train_df = summaries_df[summaries_df['source_text_filename_clean'].isin(texts_to_remove) == False]
print('test n:', len(test_df))
print('train n:', len(train_df))

test n: 703
train n: 3987


In [None]:
def buildDataset(df):
    full_dataset = Dataset.from_pandas(df, preserve_index=False)
    # 70% train, 30% test
    train_valid = full_dataset.train_test_split(test_size=0.176, seed=seed)
    # gather everyone if you want to have a single DatasetDict
    final_dataset = DatasetDict({
        'train': train_valid['train'],
        'valid': train_valid['test']})
    return final_dataset

In [None]:
content_df = train_df[['text', 'source_embedding', 'content_pca']]
content_df.columns = ['text', 'labels']
content_ds = buildDataset(content_df)

paraphrase_df = train_df[['text', 'source_embedding', 'paraphrase_pca']]
paraphrase_df.columns = ['text', 'source_embedding', 'labels']
paraphrase_ds = buildDataset(paraphrase_df)

In [None]:
content_ds['test'] = Dataset.from_pandas(test_df[['text', 'source_embedding', 'content_pca']].rename(columns={'content_pca':'labels'}), preserve_index=False)
paraphrase_ds['test'] = Dataset.from_pandas(test_df[['text', 'source_embedding', 'paraphrase_pca']].rename(columns={'paraphrase_pca':'labels'}), preserve_index=False)

In [None]:
def tokenize_inputs(example):
    return tokenizer(example['text'], truncation = True)

In [106]:
b = getLastState('here is the text that should be used to color the next model')

tokens = tokenizer.encode('</s> this is a test of the emergency broadcast system', return_tensors='pt').to('cuda')

In [105]:
a = model.embeddings.word_embeddings(tokens)

In [100]:
tokens.shape

torch.Size([1, 12])

In [103]:
context.shape

torch.Size([768])

In [109]:
c = torch.cat([a[0], b, a[1:]], 0)


RuntimeError: Tensors must have same number of dimensions: got 2 and 1

In [119]:
a.shape

torch.Size([1, 12, 768])

In [118]:
b.shape

torch.Size([768])

In [120]:
torch.cat([a,b])

RuntimeError: Tensors must have same number of dimensions: got 3 and 1

In [None]:
lambdacloud for cloud inferencing 