## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import nltk

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, GPT2Config, TextDataset
from tqdm.auto import tqdm
import random
import datetime
import time
import statistics
from nltk.translate.bleu_score import sentence_bleu
from transformers import TrainingArguments, Trainer, set_seed
from datasets import load_dataset
from datasets import Dataset

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium']

In [4]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\gpt2'

In [5]:
model = GPT2LMHeadModel.from_pretrained(model_path)

In [6]:
torch.cuda.empty_cache()

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

## Import Dataset

In [8]:
filenames = os.listdir(DATASET_PATH)
filenames

['cached_lm_GPT2Tokenizer_128_Shakespeare_Dataset.txt',
 'Customer.csv',
 'Html.csv',
 'Recipes.csv',
 'Recipes_1000.csv',
 'Shakespeare_Dataset.txt',
 'Taylor_Swift_Lyrics.csv']

In [9]:
file_path = DATASET_PATH + '\\' + filenames[1]
file_path

'D:\\Python\\LLM_Environment\\datasets\\Customer.csv'

In [10]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Message,Category,Intent,Response
0,0,problem witth cancelling order,order,cancel_order,i fathom that you're experiencing a problem wi...
1,1,i need assistance with canceling order,order,cancel_order,i've ascertained that you need assistance with...
2,2,assistance cancelling order,order,cancel_order,i realized you're seeking assistance in cancel...
3,3,problem with cancelling purchase,order,cancel_order,"i've understood, you're experiencing difficult..."
4,4,i have a problem with cancelling order,order,cancel_order,i've got that you're experiencing difficulties...


In [11]:
# Combine the columns into a single text field for training
df['input_text'] = df.apply(lambda row: f"Query: {row['Message']} Response: {row['Response']}", axis=1)

In [12]:

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df[['input_text']])

In [13]:
dataset

Dataset({
    features: ['input_text'],
    num_rows: 8100
})

In [14]:
# Set padding token
tokenizer.pad_token = tokenizer.eos_token

In [15]:
def tokenize_function(examples):
    return tokenizer(examples['input_text'], truncation=True,padding='max_length', max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 8100/8100 [00:05<00:00, 1506.64 examples/s]


In [16]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # No masked language modeling for GPT-2
)

In [17]:
save_path = './model'
# Define training arguments
training_args = TrainingArguments(
    output_dir=save_path,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    save_steps=10_000,
    save_total_limit=2,
)


In [18]:
# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset
)

In [19]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 33%|███▎      | 500/1518 [09:46<19:50,  1.17s/it]

{'loss': 1.3953, 'grad_norm': 3.3885138034820557, 'learning_rate': 3.3530961791831364e-05, 'epoch': 0.99}


 66%|██████▌   | 1000/1518 [19:48<10:22,  1.20s/it]

{'loss': 1.0604, 'grad_norm': 3.146991014480591, 'learning_rate': 1.7061923583662716e-05, 'epoch': 1.98}


 99%|█████████▉| 1500/1518 [29:49<00:21,  1.21s/it]

{'loss': 0.9858, 'grad_norm': 3.295767307281494, 'learning_rate': 5.928853754940711e-07, 'epoch': 2.96}


100%|██████████| 1518/1518 [30:13<00:00,  1.19s/it]

{'train_runtime': 1813.888, 'train_samples_per_second': 13.397, 'train_steps_per_second': 0.837, 'train_loss': 1.1452121056115674, 'epoch': 3.0}





TrainOutput(global_step=1518, training_loss=1.1452121056115674, metrics={'train_runtime': 1813.888, 'train_samples_per_second': 13.397, 'train_steps_per_second': 0.837, 'total_flos': 6346260873216000.0, 'train_loss': 1.1452121056115674, 'epoch': 2.9985185185185186})

In [24]:
model = GPT2LMHeadModel.from_pretrained(save_path + '/checkpoint-1518')
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Ensure pad_token is set to eos_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [25]:
def generate_response(input_text):
    # Tokenize input with padding and attention mask
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    # Generate response with a high max_length
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],  # Include attention mask
        max_length=1024,  # Set to a high value to ensure complete responses
        num_return_sequences=1,
        temperature=0.5,  # Adjust temperature as needed
        top_k=50,  # Adjust top_k as needed
        top_p=0.95,  # Adjust top_p as needed
        no_repeat_ngram_size=2,  # Prevent repeating phrases
        early_stopping=True  # Stop early if end of sentence token is generated
    )

    # Decode and print the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(response)

# Example usage
generate_response("How can i know the cancellation charges")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


How can i know the cancellation charges? Response: i'll get right on it! i can assist you in checking the charges associated with canceling your subscription to our newsletter. to provide you with accurate information, could you please provide me with your email address or any other relevant details related to your account? once i have this information and the necessary details, i will be able to assist your request promptly. thank you for your cooperation, and i look forward to helping you navigate this process. let me know if there's anything else i need to do for you. your satisfaction is our top priority, so please don't hesitate to let us know. we're here to help you every step of the way! happy cancelling! is there anything specific you'd like to know about the cancellation charges or if you have any further questions? feel free to ask. i'm here every day to ensure you receive the information you need. happy cancellation!"

please note that the specific cancellation fees may vary