# 4. Truncation without training on 1000 samples t5-small

<hr>

## 1. Load Dataset

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import os
import matplotlib.pyplot as plt
import json

from datasets import load_dataset
from datasets import load_metric

from rouge_score import rouge_scorer
from transformers import T5Tokenizer, T5ForConditionalGeneration, tokenization_utils_base, AutoTokenizer, AutoModelForSeq2SeqLM


device = 'cuda:3'



In [2]:
with open('../datasets/test_set.txt') as json_file:
    test_set = json.load(json_file)

## 2. Scorers

In [3]:
def compute_metrics(predictions, actuals, tokenizer):
    
    # <your code here>
    metric = load_metric("rouge")
    result = metric.compute(predictions=predictions, references=actuals, use_stemmer=True)
    
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    rouge = {k: round(v, 4) for k, v in result.items()}

    return rouge

## 3. Model

In [4]:
# let's define model parameters specific to BART
model_params = {
    "MODEL": "gniemiec/t5-small-finetuned-xsum",
    "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 36,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
}

In [5]:
torch.manual_seed(model_params["SEED"])  # pytorch random seed
np.random.seed(model_params["SEED"])  # numpy random seed

tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])

model = AutoModelForSeq2SeqLM.from_pretrained(model_params["MODEL"])

print(f"Loaded {model_params['MODEL']}")

Loaded gniemiec/t5-small-finetuned-xsum


## 4. Test 1 sample

In [6]:
model.eval()

with torch.no_grad():
    
    # tokens = tokenizer(test_set['document'][1], return_tensors="pt")
    tokens = tokenizer.batch_encode_plus([test_set['document'][0]], 
                                         max_length=model_params["MAX_SOURCE_TEXT_LENGTH"], 
                                         truncation=True, 
                                         padding="max_length", return_tensors="pt").to(device)
    # outputs = model.generate(tokens)
    outputs = model.to(device).generate(
                      input_ids = tokens.input_ids,
                      attention_mask = tokens.attention_mask, 
                      max_length=150, 
                      num_beams=2,
                      repetition_penalty=2.5, 
                      length_penalty=1.0, 
                      early_stopping=True
                      )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(decoded)

Police have recovered three firearms, ammunition and a five-figure sum of money.


In [7]:
rouge = compute_metrics([decoded], [test_set['summary'][0]], tokenizer)   
rouge_df = pd.DataFrame.from_dict(rouge, orient='index')
rouge_df

Unnamed: 0,0
rouge1,33.3333
rouge2,14.2857
rougeL,20.0
rougeLsum,20.0
gen_len,1.0


## 5. Evaluate on 1000 samples

In [5]:
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix} | {iteration}/{total}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [6]:
model.eval()
model = model.to(device)
predictions = []

l = len(test_set['document'])
printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
with torch.no_grad():
    for idx, doc in enumerate(test_set['document']):
        printProgressBar(idx + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        tokens = tokenizer.batch_encode_plus([doc], 
                                             max_length=model_params["MAX_SOURCE_TEXT_LENGTH"], 
                                             truncation=True, 
                                             padding="max_length", 
                                             return_tensors="pt").to(device)
       
        outputs = model.generate(
                      input_ids = tokens.input_ids,
                      attention_mask = tokens.attention_mask, 
                      max_length=150, 
                      num_beams=2,
                      repetition_penalty=2.5, 
                      length_penalty=1.0, 
                      early_stopping=True
                      )
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(decoded)


### 5.1 Saving results

In [6]:
def save_results(predictions, actuals, output):
    df = pd.DataFrame({'predictions': predictions, 'actuals': actuals})
    df.to_csv(output)
    print("PREDICTIONS RESULTS SAVED.")

In [7]:
save_results(predictions, test_set['summary'], 'outputs/predictions_pretrained_t5xsum.csv')

## 6. Create truncated tokens list

In [7]:

def Trainer(output_dir, strategy="none", no_tokens=70):
    '''
    source: array of documents
    strategy:
        - head: truncate head
        - tail: truncate tail 
        - both: truncate head and tail
        - middle: truncate middle words
    words_no: number of tokens to remove
    '''
    print(f"Strategy: Remove {strategy} tokens")
    l = len(test_set['document'])
    
    
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seedtokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])
    
    print("Loading model..")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_params["MODEL"])
    tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])
    
    model.eval()
    model = model.to(device)
    
    predictions = []
    
    printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
    
    with torch.no_grad():
        for idx, doc in enumerate(test_set['document']):
            printProgressBar(idx + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
            tokens = tokenizer.batch_encode_plus([doc], 
                                                 max_length=model_params["MAX_SOURCE_TEXT_LENGTH"], 
                                                 truncation=True, 
                                                 padding="max_length", 
                                                 return_tensors="pt").to(device)
            
            source = tokens.input_ids[0]
            attention = tokens.attention_mask[0]
            
            if strategy == "middle":
                
                left_remove_percent = .20
                
                no_real_tokens = sum(x != 0 for x in source)
                left = int(left_remove_percent * no_real_tokens)
                
                # 3. Truncate left and right, concat both (input ids and attention mask)
                left_selection_ids = source[:left]
                right_selection_ids = source[left+no_tokens:len(source)]

                left_attention_mask = attention[:left]
                right_attention_mask = attention[left+no_tokens:len(source)]

                new_ids = torch.concat([left_selection_ids, right_selection_ids], 0)        
                new_masks = torch.concat([left_attention_mask,right_attention_mask], 0)

                new_ids = new_ids.tolist()
                new_masks = new_masks.tolist()

                # 4. Create a new source text
                new_encoding = {
                    'input_ids': torch.IntTensor([new_ids]).to(dtype=torch.long),
                    'attention_mask': torch.IntTensor([new_masks]).to(dtype=torch.long)
                }
                

                # 5. Set new token encoding
                new_tokens = tokenization_utils_base.BatchEncoding(new_encoding).to(device)
                
                
            elif strategy == "head":
                new_ids = source[no_tokens:].tolist()
                new_masks = attention[no_tokens:].tolist()

                # 4. Create a new source text
                new_encoding = {
                    'input_ids': torch.IntTensor([new_ids]).to(dtype=torch.long),
                    'attention_mask': torch.IntTensor([new_masks]).to(dtype=torch.long)
                }
                

                # 5. Set new token encoding
                new_tokens = tokenization_utils_base.BatchEncoding(new_encoding).to(device)
                
            elif strategy == "tail":
                no_real_tokens = sum(x != 0 for x in source)
                zeros = source[no_real_tokens:]
                
                new_ids = torch.concat([source[:no_real_tokens-no_tokens],zeros], 0).tolist()
                new_masks = torch.concat([attention[:no_real_tokens-no_tokens],zeros], 0).tolist()
                
                
                new_encoding = {
                    'input_ids': torch.IntTensor([new_ids]).to(dtype=torch.long),
                    'attention_mask': torch.IntTensor([new_masks]).to(dtype=torch.long)
                }
                new_tokens = tokenization_utils_base.BatchEncoding(new_encoding).to(device)
                
            elif strategy == "both":
                pass
            else:
                new_tokens = tokens
            
            outputs = model.generate(
                          input_ids = new_tokens.input_ids,
                          attention_mask = new_tokens.attention_mask, 
                          max_length=150, 
                          num_beams=2,
                          repetition_penalty=2.5, 
                          length_penalty=1.0, 
                          early_stopping=True
                          )
            
            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
            predictions.append(decoded)
            
    save_results(predictions, test_set['summary'], os.path.join(output_dir,f'predictions_{strategy}_{no_tokens}.csv'))
    
    rouge = compute_metrics(predictions, test_set['summary'], tokenizer)
    
    rouge_df = pd.DataFrame.from_dict(rouge, orient='index')
    rouge_df.to_csv(os.path.join(output_dir, f'rouge_score_{strategy}_{no_tokens}.csv'))
    print(f"SAVE ROUGE TO CSV FINISHED @ {os.path.join(output_dir, f'rouge_score_{strategy}_{no_tokens}.csv')}")

## 7. Testing

In [13]:
# #middle words
# Trainer(output_dir='outputs_with_truncation/', strategy="middle", no_tokens=70)

In [18]:
rouge_df_middle = pd.read_csv('outputs_with_truncation/rouge_score_middle.csv')
rouge_df_middle


Unnamed: 0.1,Unnamed: 0,0
0,rouge1,20.2283
1,rouge2,3.5699
2,rougeL,14.8051
3,rougeLsum,14.7833
4,gen_len,1.0


In [15]:
# baseline
Trainer(output_dir='outputs_with_truncation/', strategy="none")

Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.
SAVE ROUGE TO CSV FINISHED @ outputs_with_truncation/rouge_score_none.csv


Unnamed: 0.1,Unnamed: 0,0
0,rouge1,22.2732
1,rouge2,4.4585
2,rougeL,16.3347
3,rougeLsum,16.3285
4,gen_len,1.0


In [9]:
rouge_df = pd.read_csv('outputs_with_truncation/rouge_score_none.csv')
rouge_df

Unnamed: 0.1,Unnamed: 0,0
0,rouge1,22.2732
1,rouge2,4.4585
2,rougeL,16.3347
3,rougeLsum,16.3285
4,gen_len,1.0


In [9]:
#head words
Trainer(output_dir='outputs_with_truncation/', strategy="head", no_tokens=70)

Strategy: Remove head tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.
SAVE ROUGE TO CSV FINISHED @ outputs_with_truncation/rouge_score_head.csv


Unnamed: 0.1,Unnamed: 0,0
0,rouge1,19.0926
1,rouge2,3.5937
2,rougeL,14.0856
3,rougeLsum,14.0834
4,gen_len,1.0


In [12]:
rouge_df_head = pd.read_csv('outputs_with_truncation/rouge_score_head.csv')
rouge_df_head

In [8]:
#tail words
Trainer(output_dir='outputs_with_truncation/', strategy="tail", no_tokens=70)

Strategy: Remove tail tokens
Loading model..
Progress: |██████████████████████████████████████████████████| 100.0% Complete | 1000/1000
PREDICTIONS RESULTS SAVED.
SAVE ROUGE TO CSV FINISHED @ outputs_with_truncation/rouge_score_tail_70.csv


Unnamed: 0.1,Unnamed: 0,0
0,rouge1,19.4498
1,rouge2,3.0035
2,rougeL,13.7329
3,rougeLsum,13.7329
4,gen_len,1.0


In [13]:
rouge_df_tail = pd.read_csv('outputs_with_truncation/rouge_score_tail_70.csv')
rouge_df_tail

## 8. RESULTS

In [35]:
comparison_df = rouge_df.rename(columns={'0': 'none'})
comparison_df['head'] =rouge_df_head['0']
comparison_df['tail']= rouge_df_tail['0']
comparison_df['middle'] = rouge_df_middle['0']
comparison_df

Unnamed: 0.1,Unnamed: 0,none,head,tail,middle
0,rouge1,22.2732,19.0926,19.4498,20.2283
1,rouge2,4.4585,3.5937,3.0035,3.5699
2,rougeL,16.3347,14.0856,13.7329,14.8051
3,rougeLsum,16.3285,14.0834,13.7329,14.7833
4,gen_len,1.0,1.0,1.0,1.0
