__Project:__ Text Summarization <br> 
__Sub-prj:__ Parametric Evaluation <br>
__Experm:__ t5 (seq-seq) <br>
__Status:__ Version 0.0

Amir Hossini

#### Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import transformers
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import Callback
import sklearn
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score 
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

print(f"Tensorflow version: {tf.__version__}")
print(f"Sklearn version: {sklearn.__version__}")
print(f"Transformers version: {transformers.__version__}")

physical_devices = tf.config.list_physical_devices('GPU') 
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)
    
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Tensorflow version: 2.5.0
Sklearn version: 1.0
Transformers version: 4.11.3
Num GPUs Available:  1


#### I/O Files & Params

In [104]:
fl_train        = '../datasets/banking77/train.csv'
fl_test         = '../datasets/banking77/test.csv'

checkpoint      ='t5-base'

experiment_grid = {
    'n_labels'  : [4],
    'n_train_inst': [4],
    'n_test_inst': [4],
}

seed = 0
BIG_int = 10**21

#### Functions

In [108]:
def select_data_instances2(train, test, col_label, n_labels=None, n_train_inst=None, n_test_inst=None):
    if n_labels is None:
        n_labels = len(train.loc[:,col_label].unique())
    if n_train_inst is None:
        n_train_inst = BIG_int 
    if n_test_inst is None:
        n_test_inst = BIG_int 
        
    select_labels = np.array(train.loc[:,col_label].value_counts()[0:n_labels].index)
    select_train  = train.loc[train[col_label].isin(select_labels),:].reset_index(drop=True)
    select_test   = test.loc[test[col_label].isin(select_labels),:].reset_index(drop=True)
    subset_train  = pd.DataFrame(columns=select_train.columns)
    subset_test   = pd.DataFrame(columns=select_test.columns)
    for label in select_labels:
        temp_train   = select_train.loc[select_train[col_label]==label,:].reset_index(drop=True)
        select_indx  = np.random.choice(range(len(temp_train)),min(n_train_inst,len(temp_train)))
        temp_train   = temp_train.iloc[select_indx,:]
        subset_train = subset_train.append(temp_train)
        
        temp_test    = select_test.loc[select_test[col_label]==label,:].reset_index(drop=True)
        select_indx  = np.random.choice(range(len(temp_test)),min(n_test_inst,len(temp_test)))
        temp_test    = temp_test.iloc[select_indx,:]
        subset_test  = subset_test.append(temp_test)
    subset_train = subset_train.sample(frac=1).reset_index(drop=True)
    subset_test  = subset_test.sample(frac=1).reset_index(drop=True)
    
    return subset_train, subset_test 

def seqseq_model_compile(checkpoint):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model     = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return tokenizer, model

def seqseq_model_fit(text,tokenizer,model,in_max_len=512,
                     out_min_len=4,out_max_len=7,framework='tf',
                     len_penalty=2, num_b=4, early_stop=False                   
                    ):
    
    inputs  = tokenizer(
        'summarize: ' + text, return_tensors=framework, max_length=in_max_len
                       )
    outputs = model.generate(
        inputs['input_ids'], max_length=out_max_len, min_length=out_min_len, 
        length_penalty=len_penalty, num_beams=num_b, early_stopping=early_stop
    )
    
    return tokenizer.decode(outputs[0]) 

In [109]:
np.random.seed(seed)
train_orig = pd.read_csv(fl_train)
test_orig  = pd.read_csv(fl_test)

tokenizer, compiled_model = seqseq_model_compile(checkpoint)

Big_Matrix = pd.DataFrame(columns=['expr_id','train/test_set','text','category','sum_text'])
iexp=0
for ilab in experiment_grid['n_labels']:
    for i_tninst in experiment_grid['n_train_inst']:
        for i_tsinst in experiment_grid['n_test_inst']:
            iexp+=1
            train, test = select_data_instances2(train_orig, test_orig, 
                                                   'category', n_labels=ilab, 
                                                    n_train_inst=i_tninst, 
                                                    n_test_inst=i_tsinst)
            tem_mat_train                   = pd.DataFrame(columns=Big_Matrix.columns)
            tem_mat_test                    = pd.DataFrame(columns=Big_Matrix.columns)
            tem_mat_train['text']           = train['text']
            tem_mat_train['category']       = train['category']
            tem_mat_train['expr_id']        = iexp
            tem_mat_train['train/test_set'] = 'train'
            tem_mat_train['sum_text']       = tem_mat_train['text'].apply(lambda x: seqseq_model_fit(x,tokenizer,compiled_model))
            tem_mat_test['text']            = test['text']
            tem_mat_test['category']        = test['category']
            tem_mat_test['expr_id']         = iexp
            tem_mat_test['train/test_set']  = 'test'
            tem_mat_test['sum_text']        = tem_mat_test['text'].apply(lambda x: seqseq_model_fit(x,tokenizer,compiled_model))
            
            Big_Matrix                      = Big_Matrix.append(tem_mat_train,ignore_index=True)
            Big_Matrix                      = Big_Matrix.append(tem_mat_test,ignore_index=True)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [110]:
Big_Matrix.iloc[:,2:]

Unnamed: 0,text,category,sum_text
0,You shorted me money when I tried to make a wi...,wrong_amount_of_cash_received,<pad> you shorted me money
1,Is there a strict timeline regarding disputing...,direct_debit_payment_not_recognised,<pad> i noticed a fraudulent
2,"Hi, I used an ATM today to take money out of m...",wrong_amount_of_cash_received,<pad> the amount taken out of my
3,I haven't seen the cash yet from the cheque I ...,balance_not_updated_after_cheque_or_cash_deposit,<pad> the cash from the cheque I
4,I'm seeing a direct debit payment that is not me.,direct_debit_payment_not_recognised,<pad> i'm seeing
5,There is a debit paymet that I don't recognize.,direct_debit_payment_not_recognised,<pad> there is a debit pay
6,Why can't I take $100 out,wrong_amount_of_cash_received,<pad> Warum can't I
7,Hello! I randomly got charged an extra fee tod...,card_payment_fee_charged,<pad> i was charged an extra
8,i do not know what this charge is for,direct_debit_payment_not_recognised,<pad> i do not know what
9,What happened to my cash deposit?,balance_not_updated_after_cheque_or_cash_deposit,<pad> what happened to my cash deposit
