In [None]:
from datasets import load_dataset
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Download the customer-support dataset from the Hugging Face Hub
DATASET_ID = "Kaludi/Customer-Support-Responses"
dataset = load_dataset(DATASET_ID)

# Preprocessing function (replace with your desired approach)
def preprocess_text(text):
  """Normalize one text field: lowercase, strip punctuation, collapse whitespace.

  Args:
    text: The raw string to clean. ``None`` is treated as an empty string.

  Returns:
    The cleaned string.
  """
  import string  # local import keeps this helper self-contained

  if text is None:
    return ""
  # The previous body returned an undefined name, raising NameError on the
  # first call; the steps below implement what the placeholder described.
  lowered = str(text).lower()
  without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
  # split()/join collapses whitespace runs left behind by removed punctuation
  processed_text = " ".join(without_punctuation.split())
  return processed_text

# Apply preprocessing (optional)
# Columns of a `datasets.Dataset` offer no pandas-style `.apply` and cannot be
# assigned to in place, so the cleaned text is written back through `.map`,
# which returns a new Dataset that replaces the "train" split.
# NOTE(review): this only runs when the split has "question"/"answer" columns;
# tokenization reads "query"/"response", so confirm the intended column names.
if "question" in dataset["train"].features:
  dataset["train"] = dataset["train"].map(
      lambda example: {
          "question": preprocess_text(example["question"]),
          "answer": preprocess_text(example["answer"]),
      }
  )

# Tokenization: load the tokenizer that belongs to the T5 checkpoint
TOKENIZER_CHECKPOINT = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_function(examples):
    """Tokenize a batch of query/response pairs for seq2seq fine-tuning.

    The query becomes the encoder input and the response becomes the target
    (`labels`). Previously both were passed as one text pair, which put the
    response into the encoder input and produced no `labels`, so the Trainer
    had nothing to compute a loss against.

    Args:
        examples: A batch mapping with "query" and "response" lists of strings.

    Returns:
        The tokenizer output for the queries, extended with a "labels" entry
        holding the response token ids; padding positions are set to -100 so
        the loss ignores them.
    """
    max_length = 512
    model_inputs = tokenizer(
        examples["query"], truncation=True, padding="max_length", max_length=max_length
    )
    targets = tokenizer(
        examples["response"], truncation=True, padding="max_length", max_length=max_length
    )
    pad_id = tokenizer.pad_token_id
    # -100 is the ignore index of the cross-entropy loss used by the model
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return model_inputs

# Run tokenize_function over the dataset, one batch of examples per call
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Create model and training arguments
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,  # Adjust based on GPU memory
    # NOTE: with 74 examples, batch size 8 and 3 epochs there are only ~30
    # optimizer steps, so this step-based checkpoint interval never triggers.
    save_steps=10_000,
    num_train_epochs=3,
    # A 500-step logging interval would never fire on a run this short; log the
    # training loss once per epoch so it lines up with the per-epoch evaluation.
    logging_strategy="epoch",
    # NOTE(review): newer transformers releases call this `eval_strategy` — confirm
    # against the installed version.
    evaluation_strategy="epoch"
)

# Create training and validation datasets
if "test" in tokenized_datasets:
    train_dataset = tokenized_datasets["train"]
    val_dataset = tokenized_datasets["test"]  # Assuming "test" split represents validation
else:
    # The dataset appears to ship only a "train" split, so indexing "test"
    # would raise KeyError. Hold out 10% of "train" for validation instead;
    # the fixed seed keeps the split reproducible across runs.
    held_out = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset = held_out["train"]
    val_dataset = held_out["test"]

# Wire the model, its training arguments and both dataset splits into a Trainer
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=val_dataset)

# Run fine-tuning
trainer.train()

# Function to generate response
def generate_response(query):
  """Generate a reply to `query` with the fine-tuned model.

  Args:
    query: The customer message as a plain string.

  Returns:
    The decoded text of the first generated sequence, special tokens removed.
  """
  # Training may have moved the model to an accelerator; the inputs must be on
  # the same device or generate() fails with a device-mismatch error.
  encoded = tokenizer(
      query, return_tensors="pt", truncation=True, max_length=512
  ).to(model.device)
  model.eval()  # make sure dropout is disabled for inference
  output = model.generate(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      # The default generation length is short and cuts replies off early.
      max_new_tokens=128,
  )
  return tokenizer.decode(output[0], skip_special_tokens=True)

# Optional demo: expose generate_response through a simple Gradio text UI
import gradio as gr

iface = gr.Interface(fn=generate_response, inputs="text", outputs="text",
                     title="Customer Support Chatbot")

# Start the web interface
iface.launch()

# Remember to install required libraries: transformers, datasets, gradio (optional)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/74 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[