In [1]:
pip  install transformers dataset torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments




In [3]:
#load the processed data 
train_df = pd.read_csv('C:/Users/Adnan/cleaned_train_dataset.csv')
test_df = pd.read_csv('C:/Users/Adnan/cleaned_test_dataset.csv')


In [4]:
train_df['input_text'] = 'question' + train_df['processed_question'].astype(str)
train_df['target_text'] = train_df['processed_answer'].astype(str)

test_df['input_text']  = 'question' + test_df['processed_question'].astype(str)
test_df['target_text'] = test_df['processed_answer'].astype(str)

train_dataset = Dataset.from_pandas(train_df[['input_text','target_text']])
test_dataset = Dataset.from_pandas(test_df[['input_text','target_text']])

In [5]:
tokenizer = T5Tokenizer.from_pretrained('T5-base')

def tokenize_func(temp):
    model_inputs = tokenizer(temp['input_text'], max_length = 512, truncation =True, padding = 'max_length')
    labels = tokenizer(temp['target_text'], max_length = 512, truncation = True, padding = 'max_length')
    
    model_inputs['labels'] = labels['input_ids']
    
  
    
    return model_inputs

train_token = train_dataset.map(tokenize_func, batched= True)
test_token = test_dataset.map(tokenize_func, batched= True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/45121 [00:00<?, ? examples/s]

Map:   0%|          | 0/11281 [00:00<?, ? examples/s]

In [6]:
print(train_token[:3])

{'input_text': ['questionhowshouldireachouttosekhmet', 'questionasaproudtexanibelievethatforprofithealthcareisthebestsystemitallowsmaximumprofitforshareholdersandcompanieswhydofolksineuropebritaincanadaaustraliaandnewzealandthinkthattheyareentitledtofreehealthcare', 'questionwhatisthebestwaytostartlearningphp'], 'target_text': ['telephoneletteremail', 'beingproudofthattellsusalotabouttexanslikeyouthefactthatyouarecallinguniversalhealthcarefreeisatestamenttoyouroutstandingignoranceandinabilitytocompletebasicresearchintosubjectsyouliketopokeyournoseintoyouhavejustmadeahugefoolofyourselfyoumustbesoproud', 'learnstepbystepphpexampleslinkedtextphptutorialyoutubeurlhttpswwwyoutubecomplaylistlistplkxfhtjmizbmdyuiczhkvfvdhnxklearnphpmysqlilinkedtextphpmysqlitutorialyoutubeurlhttpswwwyoutubecomplaylistlistplkxfhtjmiaauomgqdtukgcwgwzp'], 'input_ids': [[822, 4067, 25351, 3477, 23135, 17, 235, 7, 15, 157, 107, 3493, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [7]:
print(test_token[:3])

{'input_text': ['questionifyouwalkintoabuffetrestauranttechnicallyduringtheirlunchtimesbutleaveduringdinnertimesassumingtheyaretwodifferentpriceswhichpricedoyougetcharged', 'questionwhatisabsolutelythebestindianrestaurantintheworldandwhy', 'questionwhatisitlikeraisingaveryhighiqchild'], 'target_text': ['inconnecticutatmohegansuncasinotheofficialdinnerpricebeganatpmhoweveryoupadiuponarrivalsoonesaturdaywewentforalatelunchafterwepaidthelunchpricebutwhileeatingtheybroughtoutallthedinneritemsincludingcrablegs', 'manyfantasticindianrestaurantsintheworldforcompletedetailsyoumayvisitlinkedtexthowtofindandenjoythebestindianfoodintheworldacompleteguideurlhttpwwwrealtimenewsanalysiscomindianfoodguidehtml', 'allofourkidswereconsideredhighlyintelligentunliketheirquiteaverageparentsthedealisintelligenceisfineeducationmaturitypersonalityandothertalentsplayintothingsalsowebelievedourkidswouldbestthriveinahomewheretheyfeltsafelovedandfreetoexploretheworldwithsomelimitsandwithourapprovalandguidancewith

In [8]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')

training_arg = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 3,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps=2,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = './logs',
    logging_steps = 100,
)

trainer = Trainer(
    model = model,
    args= training_arg,
    train_dataset = train_token,
    eval_dataset = test_token,
)

trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

model.save_pretrained('./t5-chatbot-model')
tokenizer.save_pretrained('./t5-chatbot-model')


Step,Training Loss
100,9.9702


KeyboardInterrupt: 

In [None]:
def generate_response(question):
    input_text = 'question: ' + question
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage
print(generate_response("What is the capital of France?"))
