In [2]:
import json

# Load your chat4.json file
with open(r'C:\Users\ayush\Downloads\chat4.json') as f:
    intents = json.load(f)

# Prepare training data as dialogue pairs (user input and chatbot response)
training_data = []
for intent in intents['intents']:
    for pattern in intent['patterns']:
        response = intent['responses'][0]  # Use the first response for each pattern
        training_data.append(f"User: {pattern}\nChatbot: {response}\n")


In [3]:
from transformers import GPT2Tokenizer
from datasets import Dataset

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set eos_token as padding token

# Prepare the dataset for training
dataset = Dataset.from_dict({"text": training_data})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)




Map:   0%|          | 0/135 [00:00<?, ? examples/s]

In [4]:
from transformers import TFGPT2LMHeadModel
import tensorflow as tf

# Load the GPT-2 model for TensorFlow
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), loss=model.compute_loss)

# Prepare inputs for TensorFlow training
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, return_tensors="tf")

# Convert dataset to TensorFlow format
tokenized_inputs = tokenized_dataset.map(preprocess_function, batched=True)






All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Map:   0%|          | 0/135 [00:00<?, ? examples/s]

In [5]:
pip install --upgrade accelerate


Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",            # Directory to save the model and other outputs
    evaluation_strategy="epoch",       # When to evaluate the model
    learning_rate=2e-5,                # Learning rate
    per_device_train_batch_size=4,     # Batch size per device during training
    per_device_eval_batch_size=4,      # Batch size per device during evaluation
    num_train_epochs=3,                # Total number of training epochs
    weight_decay=0.01,                 # Weight decay to apply (if any)
    logging_dir="./logs",              # Directory for storing logs
    logging_steps=10,                  # How often to log
    save_steps=10_000,                 # How often to save the model
    save_total_limit=2,                # Limit on the number of saved models
)

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # GPT-2 is a causal model, so we do not use masked language modeling
)

# Define your dataset (make sure to tokenize your dataset)
# tokenized_dataset = ...

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Make sure this is defined
    data_collator=data_collator,
)

# Start training
trainer.train()




ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
# Prepare data for TensorFlow training
train_dataset = tokenized_inputs.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["input_ids"],  # GPT-2 generates text, so input and output are the same
    shuffle=True,
    batch_size=2,  # Adjust batch size based on memory
)

# Fine-tune GPT-2
model.fit(train_dataset, epochs=10)  # You can increase the number of epochs for better results


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [None]:
from datasets import load_dataset
with open(r'C:\Users\ayush\Downloads\chat4.json') as f:
    intents = json.load(f)
dataset = load_dataset(r'C:\Users\ayush\Downloads')
tokenized_dataset = dataset.map(lambda examples: tokenizer(examples['intents'], truncation=True, padding='max_length'), batched=True)


In [None]:
import json
import pandas as pd

# Load JSON data
with open(r'C:\Users\ayush\Downloads\chat4.json', 'r') as file:
    data = json.load(file)

# Convert JSON data to a DataFrame
df = pd.DataFrame(data)


In [None]:
from datasets import Dataset

# Convert DataFrame to Hugging Face Dataset
df['intents'] = df['intents'].astype(str)
dataset = Dataset.from_pandas(df)


In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.pad_token=tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['intents'], truncation=True, padding='max_length')

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Define training arguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Use the new parameter
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=10_000,
    save_total_limit=2,
)


# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()


In [None]:
import accelerate
import transformers

print(f"Accelerate version: {accelerate.__version__}")
print(f"Transformers version: {transformers.__version__}")


In [None]:
import torch

print(f"PyTorch version: {torch.__version__}")


In [None]:
pip cache purge

In [None]:
pip uninstall accelerate 

In [None]:
pip install accelerate