In [1]:
import os
import torch
import re
import pandas as pd

from typing import List,Dict,Literal
from pathlib import Path
from pydantic import BaseModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm
from transformers import Trainer, TrainingArguments

In [2]:
class ConfigDataSet(BaseModel):
        split: Literal['train','dev','test']
        model_name: Literal['gpt2-large','gpt2-medium','gpt2','gpt2-xl'] = 'gpt2'
        trun_limit: int = 500    
        BASEPATH : Path = Path("../data/")
        debug: bool = True

class EnronEmailDataset(Dataset):
    # Read About MRO(Method Resolution Order)    
    
    def __init__(
        self,
        config: ConfigDataSet
    ):
        # As Config is at as just data we can us it with pydatic
        self.config = config
        self.tokenizer  = GPT2Tokenizer.from_pretrained(self.config.model_name)
        # Setting pad tokenizer as end of sent token
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # self.file_paths: List[str] = [ self.config.BASEPATH/self.config.split/name 
        #                               for name in 
        #                               os.listdir(self.config.BASEPATH/self.config.split)]
        # self.emails: List[str] = [ open(self.file_paths[idx],'r').read().strip()\
        #                             for idx in tqdm(range(len(self.file_paths)),
        #                                             desc="Loading Email") ]
        data = pd.read_csv(f"../data/{self.config.split}.csv")
        self.email = data['body'].tolist()
        self.subject = data['subject'].tolist()
        if self.config.split != 'train':
            self.ann0 = data['ann0'].tolist()
            self.ann1 = data['ann1'].tolist()
            self.ann2 = data['ann2'].tolist()
            
        # if self.config.debug:
        #     print(f"Possible Max lenght for the model is\
        #             {self.tokenizer.model_max_length}")
        #     print(f"First Data point of {self.config.split} tokenized as :\n",self[0])
        #     for idx in tqdm(range(len(self)),desc="Length Test:"):
        #         if len(self[idx]['input_ids'])>=self.tokenizer.model_max_length:
        #             raise(f"ERROR: The length of {idx} data point \
        #             in {self.split} split is more the {self.tokenizer.model_max_length}")
        #     print("Passed all CHECKS")
        
    def clean_text(
        self,
        text:str
        ):
        # Updated so that it comes from config
        # ipdb.set_trace()
        text = re.sub(' +',' ',text)
        text = re.sub('\n+','\n',text)
        # If it is non-numeric char also char like /.- are not removed
        text = re.sub('[^A-Za-z0-9\n\s\\/.-]+','',text)
        return text
        
        
    def __getitem__(
        self,
        idx:int
    ):
        
        """ 
        returns the input_ids and attention_maks also tuncates if
        the email is longer that what is specified in config
        """
        
        # with open(self.file_paths[idx],'r') as f:
        #     email_with_subject = f.read().strip()
        

        email,subject = self.email[idx],self.subject[idx]
        
        email = self.clean_text(email)

        # email = ''.join(email.split()[:self.config.trun_limit])
        tok_email = self.tokenizer(email,truncation=True,max_length=self.config.trun_limit)
        tok_subject = self.tokenizer( "\n\n@subject\n"+ subject + "\n[ENDOFEMAIL]\n"+" <|endoftext|>",
                                     truncation=True,max_length=self.config.trun_limit)
        
        tok_email['input_ids'].extend(tok_subject['input_ids'])
        tok_email['attention_mask'].extend(tok_subject['attention_mask'])

        return tok_email


        # 
        # tok_subject = self.tokenizer( "@subject\n"+ subject + " <|endoftext|>",
        #                              truncation=True,max_length=self.config.trun_limit)
        
        # Token from which CLM will start Finetuning
        # st_gen_token = len(tok_email['input_ids'])
        
        # tok_email['input_ids'].extend(tok_subject['input_ids'])
        # tok_email['attention_mask'].extend([0]*len(tok_subject['attention_mask']))
        
        
         # return ({'input_ids':torch.tensor(tok_email['input_ids']),
         #         "attention_mask":torch.tensor(tok_email['attention_mask'])},st_gen_token)

        
    def __len__(
        self
    ):
        return len(self.email)

In [8]:
train_dataconfig = ConfigDataSet( split='train',
                            trun_limit=300)
train_dataset = EnronEmailDataset(train_dataconfig)

In [4]:
val_dataconfig = ConfigDataSet( split='dev',
                            trun_limit=300)
val_dataset = EnronEmailDataset(val_dataconfig)

In [5]:
data_collator = DataCollatorForLanguageModeling(tokenizer=train_dataset.tokenizer, mlm=False)

In [21]:
x = data_collator([train_dataset[0],train_dataset[1]])

In [22]:
x.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [29]:
x

{'input_ids': tensor([[  464,  1321, 18307,   416,   262,  1708,   304,    12,  4529,   318,
          5292,   691,   329,   262,   751,   411,  3826,   290,   743,  3994,
         15279,   290,    14,   273, 21929,  2587,    13,   198,  7149, 28759,
          2423,  1005, 26084,  3411, 44832,   393,   584,   779,   286,   393,
          2263,   286,   597,  2223,  2402,   428,  1321,   416,  6506,   393,
         12066,   584,   621,   262,  5292, 17800,   318, 12244,   416,  1099,
           290,   743,  2426,   606,   284,  4301,   393,  3026, 12247,    13,
           198,  1532,   345,  2722,   428,  6946,   287,  4049,  3387,  2800,
           514,  3393,   379, 22131,   767,  5332,    12,    24,  3064,   290,
         12233,   262,  6946,   422,   597,  3644,   393,  3127,  1080,    13,
           198,  7003,   428,  3053,   290,   597, 32161,   389,  4762,   284,
           307,  1479,   286,   597,  9471,   393,   584, 11855,   326,  1244,
         22533,  2689,   597,  3644,  

In [6]:
out_dir = "../model_weights/GPT2_FT_Model"
training_args = TrainingArguments(
    output_dir=out_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=4, # try with 2
    per_device_eval_batch_size=4,  #  try with 2
    num_train_epochs=20,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir='./logs',
    evaluation_strategy = 'epoch',
    load_best_model_at_end=True
    )

In [7]:
gpt2 = GPT2LMHeadModel.from_pretrained(train_dataconfig.model_name)

In [8]:
# Train the model
trainer = Trainer(
    model=gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbss[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,3.1127,3.070092
2,2.8323,3.011306
3,2.6265,3.036852
4,2.5285,3.025615
5,2.3413,3.049123
6,2.254,3.092557
7,2.133,3.089626
8,2.0425,3.175704
9,1.9801,3.187019
10,1.9016,3.25263


TrainOutput(global_step=72180, training_loss=1.9949246016259536, metrics={'train_runtime': 14732.7529, 'train_samples_per_second': 19.597, 'train_steps_per_second': 4.899, 'total_flos': 3.5340600609792e+16, 'train_loss': 1.9949246016259536, 'epoch': 20.0})

In [43]:
model_output_path = '../model_weights'
trainer.save_model(model_output_path)
train_dataset.tokenizer.save_pretrained(model_output_path)

NameError: name 'tokenizer' is not defined

In [3]:
gpt2 = GPT2LMHeadModel.from_pretrained("../model_weights/GPT2_FT_Model/checkpoint-7218").to('cuda')

In [66]:
mail = train_dataset.clean_text(""""I just got off the phone with Darren Vanek.
We have been talking  about what is needed for about a month now.
Almost two weeks ago he said  he wanted a contract that allowed him to call for a letter of credit if one were  needed in the future and that would suffice.
Apparently, a paralegal, is involved in generating such a contract and she  was out all of last week.
Darren said that he has no control over when the  contract actually gets sent to me.
Please do what you can to expedite the  emailing of that contract to me so that I can order gas beyond Dec. 2001.
Thanks and regards,""") + "\n\n@subject\n"

In [67]:
mail

'I just got off the phone with Darren Vanek.\nWe have been talking about what is needed for about a month now.\nAlmost two weeks ago he said he wanted a contract that allowed him to call for a letter of credit if one were needed in the future and that would suffice.\nApparently a paralegal is involved in generating such a contract and she was out all of last week.\nDarren said that he has no control over when the contract actually gets sent to me.\nPlease do what you can to expedite the emailing of that contract to me so that I can order gas beyond Dec. 2001.\nThanks and regards \n\n@subject\n'

In [68]:
out = train_dataset.tokenizer(mail, return_tensors='pt').to('cuda')

In [69]:
# context = tokenizer('I want to fly a', return_tensors='pt')

prediction = gpt2.generate(**out,max_new_tokens= 100)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [71]:
output = train_dataset.tokenizer.decode(prediction[0])

In [80]:
output

'I just got off the phone with Darren Vanek.\nWe have been talking about what is needed for about a month now.\nAlmost two weeks ago he said he wanted a contract that allowed him to call for a letter of credit if one were needed in the future and that would suffice.\nApparently a paralegal is involved in generating such a contract and she was out all of last week.\nDarren said that he has no control over when the contract actually gets sent to me.\nPlease do what you can to expedite the emailing of that contract to me so that I can order gas beyond Dec. 2001.\nThanks and regards \n\n@subject\nGas Contract\n[ENDOFEMAIL]\n                                                                                          '

In [85]:
x = output.split("@subject\n")[-1].split("\n[ENDOFEMAIL]\n")[0]

## Testing on Dev and Test

In [4]:
dev = pd.read_csv("../data/dev.csv")
test = pd.read_csv("../data/test.csv")

In [5]:
bodys = dev['body'].tolist()

In [None]:
gpt2.

In [12]:
def emailSubjectGen(emails):
    subjects = []
    for email in tqdm(emails,desc="Generated:"):
        tok = train_dataset.tokenizer(train_dataset.clean_text(email)+"\n\n@subject\n"\
                                      , return_tensors='pt',
                                      max_length=train_dataset.config.trun_limit,
                                      truncation=True
                                     ).to('cuda')
        prediction = gpt2.generate(**tok,max_new_tokens=100,
                                   pad_token_id=train_dataset.tokenizer.eos_token_id)
        output = train_dataset.tokenizer.decode(prediction[0])
        x = output.split("@subject\n")[-1].split("\n[ENDOFEMAIL]\n")[0]
        subjects.append(x)
    return subjects

In [15]:
dev_gen_sub = emailSubjectGen(dev['body'].tolist())

Generated:: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1960/1960 [33:02<00:00,  1.01s/it]


In [16]:
test_gen_sub = emailSubjectGen(test['body'].tolist())

Generated:: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1906/1906 [32:08<00:00,  1.01s/it]


In [17]:
dev['gen_subject'] = dev_gen_sub
test['gen_subject']= test_gen_sub

In [22]:
dev.to_csv("../data/output/output_dev.csv")
test.to_csv("../data/output/output_test.csv")

In [25]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
def rouge_cal(data,an1,an2):
    r1,r2,rl = 0,0,0
    counter = 0
    for a0,a1 in zip(data[an1].tolist(),data[an2].tolist()):
        scores = scorer.score(a0,a1)
        # print(scores)
        r1 += scores['rouge1'].fmeasure
        r2 += scores['rouge2'].fmeasure
        rl += scores['rougeL'].fmeasure
        counter+=1
    print(f"Between {an1} and {an2} the scores are :\n \
    Rouge-1 {r1/counter} Rouge-2 {r2/counter} Rouge-L {rl/counter}")

In [26]:
rouge_cal(dev,'gen_subject','subject')

Between gen_subject and subject the scores are :
     Rouge-1 0.2732518349073312 Rouge-2 0.11974588874868865 Rouge-L 0.2668310967098438


In [27]:
rouge_cal(test,'gen_subject','subject')

Between gen_subject and subject the scores are :
     Rouge-1 0.2594523875188608 Rouge-2 0.11903202393279694 Rouge-L 0.2541247011201646


In [28]:
rouge_cal(dev,'gen_subject','ann0')

Between gen_subject and ann0 the scores are :
     Rouge-1 0.28433004494761366 Rouge-2 0.13417300498284032 Rouge-L 0.27501508418350806


In [29]:
rouge_cal(dev,'gen_subject','ann1')

Between gen_subject and ann1 the scores are :
     Rouge-1 0.27471732736901805 Rouge-2 0.1363361165898205 Rouge-L 0.2660431652985551


In [30]:
rouge_cal(dev,'gen_subject','ann2')

Between gen_subject and ann2 the scores are :
     Rouge-1 0.28043781097455694 Rouge-2 0.1309949964477969 Rouge-L 0.26979476279555104


In [31]:
rouge_cal(test,'gen_subject','ann0')

Between gen_subject and ann0 the scores are :
     Rouge-1 0.29036829341388176 Rouge-2 0.14427877439013237 Rouge-L 0.2808761198145867


In [32]:
rouge_cal(test,'gen_subject','ann1')

Between gen_subject and ann1 the scores are :
     Rouge-1 0.28732260159837997 Rouge-2 0.1381158409512439 Rouge-L 0.2779335316707248


In [33]:
rouge_cal(test,'gen_subject','ann2')

Between gen_subject and ann2 the scores are :
     Rouge-1 0.29493567042928837 Rouge-2 0.14614512483560554 Rouge-L 0.2846064299225586
