# Evaluation on truncated tokens t5-pretrained

<hr>

In [23]:
import GPUtil
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  0% |  9% |
|  1 |  0% | 77% |
|  2 |  0% | 11% |
|  3 |  0% | 98% |


## 1. Load Dataset

In [3]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import os
import matplotlib.pyplot as plt
import json

from datasets import load_dataset
from datasets import load_metric



from bert_score import score
from bert_score import plot_example


from rouge_score import rouge_scorer
from transformers import T5Tokenizer, T5ForConditionalGeneration, tokenization_utils_base, AutoTokenizer, AutoModelForSeq2SeqLM


device = 'cuda:3'



In [4]:
with open('../../datasets/test_set.txt') as json_file:
    test_set = json.load(json_file)
print("Data loaded")

Data loaded


In [4]:
# P, R, F1 = score([test_set['document'][0], test_set['document'][1]], [test_set['summary'][0], test_set['summary'][1]], lang='en', rescale_with_baseline=True, device=device)
# # P, R, F1 = scorer.score([test_set['document'][0]], [test_set['summary'][0]])
# # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
# print(P)

## 2. Scorers

In [5]:
def compute_metrics(predictions, actuals, tokenizer):
    
    # <your code here>
    metric = load_metric("rouge")
    result = metric.compute(predictions=predictions, references=actuals, use_stemmer=True)
    
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    rouge = {k: round(v, 4) for k, v in result.items()}
    
    P, R, F1 = score(predictions, actuals, lang='en', rescale_with_baseline=True, device=device)
    
    bert = {
        'precision': P.mean().item(),
        'recall': R.mean().item(),
        'F1': F1.mean().item(),
    }

    return rouge, bert

## 3. Model

In [6]:
# let's define model parameters specific to BART
model_params = {
    "MODEL": "gniemiec/t5-small-finetuned-xsum",
    "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 36,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
}

In [7]:
torch.manual_seed(model_params["SEED"])  # pytorch random seed
np.random.seed(model_params["SEED"])  # numpy random seed

tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])

model = AutoModelForSeq2SeqLM.from_pretrained(model_params["MODEL"])

print(f"Loaded {model_params['MODEL']}")

Loaded gniemiec/t5-small-finetuned-xsum


## 4. Test 1 sample

In [8]:
model.eval()

with torch.no_grad():
    
    # tokens = tokenizer(test_set['document'][1], return_tensors="pt")
    tokens = tokenizer.batch_encode_plus([test_set['document'][0]], 
                                         max_length=model_params["MAX_SOURCE_TEXT_LENGTH"], 
                                         truncation=True, 
                                         padding="max_length", return_tensors="pt").to(device)
    # outputs = model.generate(tokens)
    outputs = model.to(device).generate(
                      input_ids = tokens.input_ids,
                      attention_mask = tokens.attention_mask, 
                      max_length=150, 
                      num_beams=2,
                      repetition_penalty=2.5, 
                      length_penalty=1.0, 
                      early_stopping=True
                      )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(decoded)

Police have recovered three firearms, ammunition and a five-figure sum of money.


In [9]:
rouge, bert = compute_metrics([decoded], [test_set['summary'][0]], tokenizer)   
rouge_df = pd.DataFrame.from_dict(rouge, orient='index')
rouge_df

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,0
rouge1,33.3333
rouge2,14.2857
rougeL,20.0
rougeLsum,20.0
gen_len,1.0


In [10]:
bert_df = pd.DataFrame.from_dict(bert, orient='index')
bert_df

Unnamed: 0,0
precision,0.499494
recall,0.473107
F1,0.487103


## 5. Evaluate on 1000 samples

In [11]:
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix} | {iteration}/{total}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [12]:
# model.eval()
# model = model.to(device)
# predictions = []

# l = len(test_set['document'])
# printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
# with torch.no_grad():
#     for idx, doc in enumerate(test_set['document']):
#         printProgressBar(idx + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
#         tokens = tokenizer.batch_encode_plus([doc], 
#                                              max_length=model_params["MAX_SOURCE_TEXT_LENGTH"], 
#                                              truncation=True, 
#                                              padding="max_length", 
#                                              return_tensors="pt").to(device)
       
#         outputs = model.generate(
#                       input_ids = tokens.input_ids,
#                       attention_mask = tokens.attention_mask, 
#                       max_length=150, 
#                       num_beams=2,
#                       repetition_penalty=2.5, 
#                       length_penalty=1.0, 
#                       early_stopping=True
#                       )
#         decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         predictions.append(decoded)


### 5.1 Saving results

In [13]:
def save_results(predictions, actuals, output):
    df = pd.DataFrame({'predictions': predictions, 'actuals': actuals})
    df.to_csv(output)
    print("PREDICTIONS RESULTS SAVED.")

In [14]:
# save_results(predictions, test_set['summary'], 'outputs/predictions_pretrained_t5xsum.csv')

## 6. Create truncated tokens list

In [15]:

def Eval(output_dir, strategy="none", no_tokens=0, from_left = 0.2):
    '''
    source: array of documents
    strategy:
        - head: truncate head
        - tail: truncate tail 
        - both: truncate head and tail
        - middle: truncate middle words
    words_no: number of tokens to remove
    '''
    print(f"Strategy: Remove {strategy} tokens")
    l = len(test_set['document'])
    
    
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seedtokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])
    
    print("Loading model..")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_params["MODEL"])
    tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])
    
    model.eval()
    model = model.to(device)
    
    predictions = []
    
    printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
    
    with torch.no_grad():
        for idx, doc in enumerate(test_set['document']):
            printProgressBar(idx + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
            tokens = tokenizer.batch_encode_plus([doc], 
                                                 max_length=model_params["MAX_SOURCE_TEXT_LENGTH"], 
                                                 truncation=True, 
                                                 padding="max_length", 
                                                 return_tensors="pt").to(device)
            
            source = tokens.input_ids[0]
            attention = tokens.attention_mask[0]
            
            if strategy == "middle":
                
                left_remove_percent = from_left
                
                no_real_tokens = sum(x != 0 for x in source)
                left = int(left_remove_percent * no_real_tokens)
                
                # 3. Truncate left and right, concat both (input ids and attention mask)
                left_selection_ids = source[:left]
                right_selection_ids = source[left+no_tokens:len(source)]

                left_attention_mask = attention[:left]
                right_attention_mask = attention[left+no_tokens:len(source)]

                new_ids = torch.concat([left_selection_ids, right_selection_ids], 0)        
                new_masks = torch.concat([left_attention_mask,right_attention_mask], 0)

                new_ids = new_ids.tolist()
                new_masks = new_masks.tolist()

                # 4. Create a new source text
                new_encoding = {
                    'input_ids': torch.IntTensor([new_ids]).to(dtype=torch.long),
                    'attention_mask': torch.IntTensor([new_masks]).to(dtype=torch.long)
                }
                

                # 5. Set new token encoding
                new_tokens = tokenization_utils_base.BatchEncoding(new_encoding).to(device)
                
                
            elif strategy == "head":
                new_ids = source[no_tokens:].tolist()
                new_masks = attention[no_tokens:].tolist()

                # 4. Create a new source text
                new_encoding = {
                    'input_ids': torch.IntTensor([new_ids]).to(dtype=torch.long),
                    'attention_mask': torch.IntTensor([new_masks]).to(dtype=torch.long)
                }
                

                # 5. Set new token encoding
                new_tokens = tokenization_utils_base.BatchEncoding(new_encoding).to(device)
                
            elif strategy == "tail":
                no_real_tokens = sum(x != 0 for x in source)
                zeros = source[no_real_tokens:]
                
                new_ids = torch.concat([source[:no_real_tokens-no_tokens],zeros], 0).tolist()
                new_masks = torch.concat([attention[:no_real_tokens-no_tokens],zeros], 0).tolist()
                
                
                new_encoding = {
                    'input_ids': torch.IntTensor([new_ids]).to(dtype=torch.long),
                    'attention_mask': torch.IntTensor([new_masks]).to(dtype=torch.long)
                }
                new_tokens = tokenization_utils_base.BatchEncoding(new_encoding).to(device)
                
            elif strategy == "both":
                pass
            else:
                new_tokens = tokens
            
            outputs = model.generate(
                          input_ids = new_tokens.input_ids,
                          attention_mask = new_tokens.attention_mask, 
                          max_length=150, 
                          num_beams=2,
                          repetition_penalty=2.5, 
                          length_penalty=1.0, 
                          early_stopping=True
                          )
            
            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
            predictions.append(decoded)
            
    save_results(predictions, test_set['summary'], os.path.join(output_dir,f'predictions_{strategy}_{no_tokens}.csv'))
    
    rouge, bert = compute_metrics(predictions, test_set['summary'], tokenizer)
    
    rouge_df = pd.DataFrame.from_dict(rouge, orient='index')
    rouge_df.to_csv(os.path.join(output_dir, f'rouge_score_{strategy}_{no_tokens}.csv'))
    
    bert_df = pd.DataFrame.from_dict(bert, orient='index')
    bert_df.to_csv(os.path.join(output_dir, f'bert_score_{strategy}_{no_tokens}.csv'))
    
    print(f"SAVE ROUGE TO CSV FINISHED @ {os.path.join(output_dir, f'rouge_score_{strategy}_{no_tokens}.csv')}")
    print(f"SAVE BERT-SCORE TO CSV FINISHED @ {os.path.join(output_dir, f'bert_score_{strategy}_{no_tokens}.csv')}")

## 7. Testing

In [16]:
tokens_to_remove = 20

In [17]:
# #middle words
Eval(output_dir='outputs/', strategy="middle", no_tokens=tokens_to_remove)

Strategy: Remove middle tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_middle_20.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_middle_20.csv


In [18]:
rouge_df_middle = pd.read_csv(f'outputs/rouge_score_middle_{tokens_to_remove}.csv')
rouge_df_middle


Unnamed: 0.1,Unnamed: 0,0
0,rouge1,21.7266
1,rouge2,4.171
2,rougeL,15.9697
3,rougeLsum,15.9454
4,gen_len,1.0


In [19]:
# baseline
Eval(output_dir='outputs/', strategy="none")

Strategy: Remove none tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_none_0.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_none_0.csv


In [20]:
rouge_df = pd.read_csv('outputs/rouge_score_none_0.csv')
rouge_df

Unnamed: 0.1,Unnamed: 0,0
0,rouge1,22.2732
1,rouge2,4.4585
2,rougeL,16.3347
3,rougeLsum,16.3285
4,gen_len,1.0


In [21]:
#head words
Eval(output_dir='outputs/', strategy="head", no_tokens=tokens_to_remove)

Strategy: Remove head tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_head_20.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_head_20.csv




In [22]:
rouge_df_head = pd.read_csv(f'outputs/rouge_score_head_{tokens_to_remove}.csv')
rouge_df_head

Unnamed: 0.1,Unnamed: 0,0
0,rouge1,21.8455
1,rouge2,4.3569
2,rougeL,16.1102
3,rougeLsum,16.1011
4,gen_len,1.0


In [23]:
#tail words
Eval(output_dir='outputs/', strategy="tail", no_tokens=tokens_to_remove)

Strategy: Remove tail tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_tail_20.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_tail_20.csv


In [24]:
rouge_df_tail = pd.read_csv(f'outputs/rouge_score_tail_{tokens_to_remove}.csv')
rouge_df_tail

Unnamed: 0.1,Unnamed: 0,0
0,rouge1,20.2834
1,rouge2,3.2438
2,rougeL,14.1731
3,rougeLsum,14.1632
4,gen_len,1.0


## Trying with 50 tokens

In [27]:
tokens_to_remove = 50
Eval(output_dir='outputs/', strategy="middle", no_tokens=tokens_to_remove)
Eval(output_dir='outputs/', strategy="head", no_tokens=tokens_to_remove)
Eval(output_dir='outputs/', strategy="tail", no_tokens=tokens_to_remove)

Strategy: Remove middle tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_middle_50.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_middle_50.csv
Strategy: Remove head tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_head_50.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_head_50.csv
Strategy: Remove tail tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_tail_50.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_tail_50.csv


In [28]:
comparison_df_50 = pd.read_csv('outputs/rouge_score_none_0.csv').rename(columns={'0': 'none'})
comparison_df_50['head'] = pd.read_csv(f'outputs/rouge_score_head_{tokens_to_remove}.csv')['0']
comparison_df_50['tail']= pd.read_csv(f'outputs/rouge_score_tail_{tokens_to_remove}.csv')['0']
comparison_df_50['middle'] = pd.read_csv(f'outputs/rouge_score_middle_{tokens_to_remove}.csv')['0']
comparison_df_50

Unnamed: 0.1,Unnamed: 0,none,head,tail,middle
0,rouge1,22.2732,20.4727,19.8081,21.032
1,rouge2,4.4585,3.773,2.9725,3.8092
2,rougeL,16.3347,15.0436,13.8637,15.5697
3,rougeLsum,16.3285,15.036,13.8608,15.5187
4,gen_len,1.0,1.0,1.0,1.0


## Trying with 10 Tokens

In [30]:
tokens_to_remove = 10
Eval(output_dir='outputs/', strategy="middle", no_tokens=tokens_to_remove)
Eval(output_dir='outputs/', strategy="head", no_tokens=tokens_to_remove)
Eval(output_dir='outputs/', strategy="tail", no_tokens=tokens_to_remove)

Strategy: Remove middle tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_middle_10.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_middle_10.csv
Strategy: Remove head tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_head_10.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_head_10.csv
Strategy: Remove tail tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SAVE ROUGE TO CSV FINISHED @ outputs/rouge_score_tail_10.csv
SAVE BERT-SCORE TO CSV FINISHED @ outputs/bert_score_tail_10.csv


## ALL ROUGE SCORES

In [16]:
tokens_to_remove = 10
comparison_df_10 = pd.read_csv('outputs/rouge_score_none_0.csv').rename(columns={'0': 'baseline', 
                                                                                'Unnamed: 0': f'-{tokens_to_remove} tokens'})
comparison_df_10['head_only']= pd.read_csv(f'outputs/rouge_score_tail_{tokens_to_remove}.csv')['0']
comparison_df_10['tail_only'] = pd.read_csv(f'outputs/rouge_score_head_{tokens_to_remove}.csv')['0']
comparison_df_10['head+tail'] = pd.read_csv(f'outputs/rouge_score_middle_{tokens_to_remove}.csv')['0']
comparison_df_10

Unnamed: 0,-10 tokens,baseline,head_only,tail_only,head+tail
0,rouge1,22.2732,20.3532,21.8271,21.9557
1,rouge2,4.4585,3.3742,4.3329,4.2895
2,rougeL,16.3347,14.2553,16.0445,16.1649
3,rougeLsum,16.3285,14.2423,16.0301,16.1509
4,gen_len,1.0,1.0,1.0,1.0


In [15]:
tokens_to_remove = 20
comparison_df_20 = pd.read_csv('outputs/rouge_score_none_0.csv').rename(columns={'0': 'baseline', 
                                                                                'Unnamed: 0': f'-{tokens_to_remove} tokens'})
comparison_df_20['head_only']= pd.read_csv(f'outputs/rouge_score_tail_{tokens_to_remove}.csv')['0']
comparison_df_20['tail_only'] = pd.read_csv(f'outputs/rouge_score_head_{tokens_to_remove}.csv')['0']
comparison_df_20['head+tail'] = pd.read_csv(f'outputs/rouge_score_middle_{tokens_to_remove}.csv')['0']
comparison_df_20

Unnamed: 0,-20 tokens,baseline,head_only,tail_only,head+tail
0,rouge1,22.2732,20.2834,21.8455,21.7266
1,rouge2,4.4585,3.2438,4.3569,4.171
2,rougeL,16.3347,14.1731,16.1102,15.9697
3,rougeLsum,16.3285,14.1632,16.1011,15.9454
4,gen_len,1.0,1.0,1.0,1.0


In [17]:
tokens_to_remove = 50
comparison_df_50 = pd.read_csv('outputs/rouge_score_none_0.csv').rename(columns={'0': 'baseline', 
                                                                                'Unnamed: 0': f'-{tokens_to_remove} tokens'})
comparison_df_50['head_only']= pd.read_csv(f'outputs/rouge_score_tail_{tokens_to_remove}.csv')['0']
comparison_df_50['tail_only'] = pd.read_csv(f'outputs/rouge_score_head_{tokens_to_remove}.csv')['0']
comparison_df_50['head+tail'] = pd.read_csv(f'outputs/rouge_score_middle_{tokens_to_remove}.csv')['0']
comparison_df_50

Unnamed: 0,-50 tokens,baseline,head_only,tail_only,head+tail
0,rouge1,22.2732,19.8081,20.4727,21.032
1,rouge2,4.4585,2.9725,3.773,3.8092
2,rougeL,16.3347,13.8637,15.0436,15.5697
3,rougeLsum,16.3285,13.8608,15.036,15.5187
4,gen_len,1.0,1.0,1.0,1.0


## BERT SCORES

In [20]:
tokens_to_remove = 10
bert_df = pd.read_csv(f'outputs/bert_score_none_0.csv').rename(columns={'0': 'none',

                                                                        'Unnamed: 0': f'-{tokens_to_remove} tokens'})
bert_df['head_only'] = pd.read_csv(f'outputs/bert_score_tail_{tokens_to_remove}.csv')['0']
bert_df['tail_only'] = pd.read_csv(f'outputs/bert_score_head_{tokens_to_remove}.csv')['0']
bert_df['head+tail'] = pd.read_csv(f'outputs/bert_score_middle_{tokens_to_remove}.csv')['0']
bert_df

Unnamed: 0,-10 tokens,none,head_only,tail_only,head+tail
0,precision,0.235331,0.157909,0.233151,0.230292
1,recall,0.216859,0.212301,0.207328,0.212871
2,F1,0.226175,0.185327,0.220371,0.22164


In [21]:
tokens_to_remove = 20
bert_df20 = pd.read_csv(f'outputs/bert_score_none_0.csv').rename(columns={'0': 'none',

                                                                        'Unnamed: 0': f'-{tokens_to_remove} tokens'})
bert_df20['head_only'] = pd.read_csv(f'outputs/bert_score_tail_{tokens_to_remove}.csv')['0']
bert_df20['tail_only'] = pd.read_csv(f'outputs/bert_score_head_{tokens_to_remove}.csv')['0']
bert_df20['head+tail'] = pd.read_csv(f'outputs/bert_score_middle_{tokens_to_remove}.csv')['0']
bert_df20

Unnamed: 0,-20 tokens,none,head_only,tail_only,head+tail
0,precision,0.235331,0.159912,0.233948,0.223344
1,recall,0.216859,0.213422,0.201985,0.210321
2,F1,0.226175,0.186854,0.218017,0.216909


In [22]:
tokens_to_remove = 50
bert_df50 = pd.read_csv(f'outputs/bert_score_none_0.csv').rename(columns={'0': 'none',

                                                                        'Unnamed: 0': f'-{tokens_to_remove} tokens'})
bert_df50['head_only'] = pd.read_csv(f'outputs/bert_score_tail_{tokens_to_remove}.csv')['0']
bert_df50['tail_only'] = pd.read_csv(f'outputs/bert_score_head_{tokens_to_remove}.csv')['0']
bert_df50['head+tail'] = pd.read_csv(f'outputs/bert_score_middle_{tokens_to_remove}.csv')['0']
bert_df50

Unnamed: 0,-50 tokens,none,head_only,tail_only,head+tail
0,precision,0.235331,0.135012,0.213281,0.210393
1,recall,0.216859,0.203137,0.170915,0.204667
2,F1,0.226175,0.168881,0.191823,0.207497
