<a href="https://colab.research.google.com/github/YazCodes/y2k_data_model/blob/main/y2k_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ask the Colab runtime for a file upload and keep whatever `files.upload()` returns.
# The dataset-loading cell reads 'chat_data.txt' from the working directory, so that is
# presumably the file meant to be uploaded here.
from google.colab import files
uploaded = files.upload()  # NOTE(review): `uploaded` is not read by any other visible cell


In [None]:
# List the working directory -- presumably to confirm the uploaded file is present.
!ls


In [None]:
!pip install transformers datasets peft accelerate bitsandbytes


In [None]:
from datasets import load_dataset

# Build the dataset from the local text file, exposed under a single "train" split.
train_files = {"train": "chat_data.txt"}
dataset = load_dataset("text", data_files=train_files)

# Leave the dataset as the last expression so the cell displays its summary.
dataset

In [None]:
# AutoTokenizer loads the tokenizer that converts raw text into token ids.
from transformers import AutoTokenizer

# Checkpoint name; the model-loading cell reuses `model_name` as well.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token. The tokenizer call in
# `tokenize_function` pads every example to a fixed length, which needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
  """Tokenize raw text and build causal-LM labels with padding masked out.

  Args:
    examples: Mapping with a "text" entry that is either a single string or a
      list of strings (the batched form passed by `dataset.map(..., batched=True)`).

  Returns:
    The tokenizer output, truncated/padded to 128 tokens, with an added
    "labels" entry: a copy of the input ids in which every padded position
    (attention mask == 0) is replaced by -100.
  """
  # -100 is the label value the loss skips. Without this mask, every padded
  # position would count as a training target; because the pad token is the
  # EOS token and examples are padded to a fixed length, short lines would be
  # dominated by "predict EOS" targets.
  ignore_index = -100

  tokenized_inputs = tokenizer(
      examples["text"],
      truncation=True,
      padding="max_length",
      max_length=128,
  )

  def mask_padding(token_ids, attention_mask):
    # Keep real tokens as targets; blank out positions the attention mask marks as padding.
    return [tok if keep else ignore_index for tok, keep in zip(token_ids, attention_mask)]

  input_ids = tokenized_inputs["input_ids"]
  attention_mask = tokenized_inputs["attention_mask"]
  if isinstance(examples["text"], str):
    # Single example: the tokenizer returns flat lists.
    labels = mask_padding(input_ids, attention_mask)
  else:
    # Batched call: one list of ids / mask values per example.
    labels = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]

  tokenized_inputs["labels"] = labels
  return tokenized_inputs

# Drop blank lines before tokenizing: they consist of padding only, so with the
# padding mask above they carry no training targets, and a batch made up solely
# of such rows would have an undefined loss.
non_empty_dataset = dataset.filter(lambda example: example["text"].strip() != "")

# Apply tokenize_function to every remaining row of the dataset.
tokenized_dataset = non_empty_dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Load the pretrained base model named by `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name)

# LoRA adapter settings: rank-8 updates scaled by lora_alpha, with light dropout
# and no bias terms trained.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    # Names are matched by module-name suffix, so "c_proj" selects the output
    # projection of the attention block AND of the MLP block, not attention only.
    target_modules=["c_attn", "c_proj"],
    # GPT-2 implements these projections as Conv1D layers, which store weights
    # as (fan_in, fan_out); state that explicitly instead of relying on PEFT to
    # detect it and warn.
    fan_in_fan_out=True,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, config)


In [None]:
from transformers import Trainer, TrainingArguments

import torch  # imported here so this cell can detect whether a CUDA device is present

# fp16 mixed precision is only usable on a GPU; requesting it on a CPU-only
# runtime makes TrainingArguments raise, so enable it only when CUDA is available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./y2k_model",
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    fp16=use_fp16,
    # Disable logging integrations so a fresh "Run All" does not stop at a
    # login prompt (e.g. for an experiment tracker that happens to be installed).
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Last expression of the cell, so the training summary is displayed as its output.
trainer.train()

In [None]:
# Write the trained model and the tokenizer into the same directory so they can
# be loaded together later.
# NOTE(review): `model` was wrapped by get_peft_model, so this presumably stores
# only the LoRA adapter -- confirm that the loading code in the test cell handles that.
export_dir = "y2k_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
# testing the model

In [None]:
from transformers import pipeline, set_seed

# Sampling is random; seed the RNGs so the printed reply can be reproduced.
set_seed(42)

# Load the saved model directory into a text-generation pipeline, reusing the
# tokenizer already in memory.
generator = pipeline("text-generation", model="y2k_model", tokenizer=tokenizer)

prompt = "User: What accessories for a Y2K outfit?\nBot:"

# `max_new_tokens` bounds only the generated continuation. `max_length` also
# counts the prompt tokens and can conflict with the pipeline's own default
# generation settings.
outputs = generator(prompt, max_new_tokens=64, do_sample=True, temperature=0.8)
print(outputs[0]["generated_text"])


In [None]:
# Debug aid: print the installed transformers version.
# NOTE(review): presumably added while tracking down the "unsupported argument" errors
# mentioned in the training cells; can be dropped once versions are pinned.
import transformers
print(transformers.__version__)


In [None]:
# Debug aid: print the filesystem path the transformers package was imported from.
import transformers
print(transformers.__file__)


In [None]:
!pip install trl

In [None]:
from trl import SFTTrainer

# Second training pass using TRL's SFTTrainer with the same model object, the same
# tokenized training split and the same TrainingArguments as the earlier Trainer run.
# No sequence-length argument is passed here: the examples were already padded or
# truncated to 128 tokens when they were tokenized.
# NOTE(review): this rebinds `trainer` and keeps training the already fine-tuned
# `model`; nothing after this cell saves the result -- confirm that is intended.
sft_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)
trainer = SFTTrainer(**sft_kwargs)

# Last expression of the cell, so the training summary is displayed as its output.
trainer.train()