In [60]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, DataCollatorWithPadding, Trainer


In [69]:
data_file = "./resume_chatbot_training_data_full.json"
dataset = load_dataset("json", data_files={"train": data_file})

def preprocess_function(example):
    return {
        "input": example["question"],
        "output": example["answer"],
    }

processed_dataset = dataset.map(preprocess_function)
for example in processed_dataset["train"]:
    print(f"Input length: {len(example['input'])}, Output length: {len(example['output'])}")
    print(processed_dataset["train"][0])

   


Input length: 40, Output length: 195
{'question': 'What degrees has Aneesh Patne completed?', 'answer': 'Aneesh has completed an M.Tech in Electronics and Telecommunication from Veermata Jijabai Technological Institute and a B.Tech in the same field from Thakur College of Engineering and Technology.', 'input': 'What degrees has Aneesh Patne completed?', 'output': 'Aneesh has completed an M.Tech in Electronics and Telecommunication from Veermata Jijabai Technological Institute and a B.Tech in the same field from Thakur College of Engineering and Technology.'}
Input length: 35, Output length: 119
{'question': 'What degrees has Aneesh Patne completed?', 'answer': 'Aneesh has completed an M.Tech in Electronics and Telecommunication from Veermata Jijabai Technological Institute and a B.Tech in the same field from Thakur College of Engineering and Technology.', 'input': 'What degrees has Aneesh Patne completed?', 'output': 'Aneesh has completed an M.Tech in Electronics and Telecommunication 

In [62]:
model_path = "D:/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to("cuda")
tokenizer.pad_token = tokenizer.eos_token

In [63]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Language modeling task
    r=16,                         # Rank of update matrices
    lora_alpha=32,                # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target specific layers
    lora_dropout=0.1              # Dropout rate
)
model = get_peft_model(model, lora_config)

In [64]:
def tokenize_function(example):
    input_text = f"Q: {example['input']} A: {example['output']}"
    return tokenizer(input_text, truncation=True, padding="max_length", max_length=100)

tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["input", "output"])
tokenized_dataset.set_format("torch")
for batch in tokenized_dataset["train"]:
    print(batch["input_ids"].size())

torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size([])
torch.Size

In [65]:
training_args = TrainingArguments(
    output_dir="./lora-resume-chatbot",  # Output directory
    per_device_train_batch_size=4,      # Adjust batch size based on GPU memory
    num_train_epochs=3,                 # Number of training epochs
    learning_rate=5e-4,                 # Learning rate
    logging_dir="./logs",               # Log directory
    logging_steps=10,                   # Log frequency
    save_strategy="epoch",              # Save model at the end of each epoch
    evaluation_strategy="no",        # Evaluate at the end of each epoch
    save_total_limit=2,                 # Limit saved checkpoints
    fp16=True,                           # Enable mixed precision for faster training
    
)



In [66]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [67]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [68]:
trainer.train()



ValueError: not enough values to unpack (expected 3, got 2)