In [None]:
!git clone https://alihuss1017:@github.com/alihuss1017/hgss-llm.git

In [None]:
!pip install transformers sentence-transformers faiss-cpu

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Steps 1 & 2: read the QA pairs from the JSONL file, then hold out 10% of the
# rows for evaluation. The fixed seed makes the split repeatable.
dataset = load_dataset(
    "json",
    data_files="data/qa_dataset.jsonl",
    split="train",
).train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

# Step 3: Format as Prompt Text
def format_qa(example):
    """Render one QA record as a single prompt string.

    Args:
        example: mapping with "question" and "answer" entries.

    Returns:
        A dict with one key, "text", holding "Q: <question>", a newline,
        "A: <answer>" and a trailing newline.
    """
    question, answer = example["question"], example["answer"]
    prompt = f"Q: {question}\nA: {answer}\n"
    return {"text": prompt}

# Apply format_qa to every row of both splits (train first, then eval).
train_dataset, eval_dataset = (
    split.map(format_qa) for split in (train_dataset, eval_dataset)
)

# Step 4: Load Tokenizer
# Uses the same checkpoint name ("EleutherAI/pythia-70m") that the training cell
# loads as the model.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
# Only when the tokenizer defines no pad token of its own, reuse its EOS token so
# that padded batches can be built.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Required for padding

# Step 5: Tokenize
def tokenize(example):
    """Tokenize a batch of formatted prompts, truncating each to 512 tokens.

    No padding is applied here. The training cell's
    DataCollatorForLanguageModeling pads each batch to its longest member, so
    padding every row to 512 tokens up front only wastes memory and compute on
    short QA pairs.

    Args:
        example: batch mapping whose "text" entry holds the prompt strings.

    Returns:
        The tokenizer's encoding for the batch (variable-length sequences).
    """
    return tokenizer(example["text"], truncation=True, max_length=512)

def _tokenize_split(split):
    """Tokenize one split in batches, dropping all of its pre-existing columns."""
    return split.map(tokenize, batched=True, remove_columns=split.column_names)


train_dataset = _tokenize_split(train_dataset)
eval_dataset = _tokenize_split(eval_dataset)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

import inspect

import torch

# Load the base model to fine-tune
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")

# Use EOS as the pad token on both the tokenizer and the model config so the two agree
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# The collator pads each batch to a common length (a no-op when rows are already
# padded) and copies input_ids into labels with pad positions masked out; the
# next-token shift itself happens inside the model's loss computation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal language modeling
)

# Newer transformers releases renamed TrainingArguments' `evaluation_strategy` to
# `eval_strategy`. Pick whichever name the installed version accepts so the cell
# runs on both old and new releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="checkpoints/pythia-70m-finetuned",
    save_strategy="epoch",
    logging_steps=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    gradient_accumulation_steps=4,  # effective batch of 16 sequences per device
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA GPU; fall back to fp32 otherwise
    report_to="tensorboard",  # or "wandb" if preferred
    run_name="pythia-70m-qa-finetune",
    **{_eval_strategy_key: "epoch"},
)

# Likewise, Trainer's `tokenizer` argument was renamed to `processing_class`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# Set up trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

# Start training
trainer.train()

# Save final model
trainer.save_model("checkpoints/pythia-70m-final")

In [15]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Embed every document once with the sentence-transformer model.
# NOTE(review): `documents` is not defined in any cell shown here — it must be
# created before this cell runs; confirm where it comes from.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = embedder.encode(documents, convert_to_numpy = True)

# Build an IndexFlatL2 sized to the embedding width and load all vectors into it.
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)

In [16]:
def retrieve_relevant_context(query, top_k = 1):
    """Return the documents whose embeddings are closest to the query.

    Args:
        query: question text to embed with `embedder`.
        top_k: maximum number of documents to return.

    Returns:
        A list of up to `top_k` entries from `documents`, nearest first. The
        list is shorter when the index holds fewer than `top_k` vectors.
    """
    query_embedding = embedder.encode([query], convert_to_numpy = True)
    _distances, neighbor_ids = index.search(query_embedding, top_k)
    # FAISS fills missing neighbours with the label -1 when the index contains
    # fewer than top_k vectors. Without this filter, documents[-1] would silently
    # return the last document as if it were a match.
    return [documents[i] for i in neighbor_ids[0] if i >= 0]

In [21]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer from the same checkpoint, revision and cache directory.
# NOTE: this rebinds `model` and `tokenizer`, replacing the pythia-70m objects
# created in the earlier cells.
_checkpoint_name = "EleutherAI/pythia-160m-deduped"
_checkpoint_kwargs = {
    "revision": "step3000",
    "cache_dir": "./pythia-160m-deduped/step3000",
}

model = GPTNeoXForCausalLM.from_pretrained(_checkpoint_name, **_checkpoint_kwargs)
tokenizer = AutoTokenizer.from_pretrained(_checkpoint_name, **_checkpoint_kwargs)

def generate_answer(query, max_new_tokens = 100, return_full_text = True):
    """Answer a question with the model, grounded in retrieved context.

    Builds a "Context / Question / Answer:" prompt from the documents returned
    by retrieve_relevant_context and lets the model continue it.

    Args:
        query: the question to answer.
        max_new_tokens: upper bound on the number of generated tokens.
        return_full_text: when True (the default, matching the original
            behaviour) the returned string contains the prompt followed by the
            continuation; when False only the continuation is returned.

    Returns:
        The decoded text with special tokens removed.
    """
    context = retrieve_relevant_context(query)
    context_text = "\n".join(context)
    prompt = (
        f'Context:\n{context_text}\n\n'
        f'Question: {query}\nAnswer:'
    )

    # Keep the input tensors on the same device as the model weights.
    inputs = tokenizer(prompt, return_tensors = 'pt', truncation = True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens = max_new_tokens,
        # Passing the pad id explicitly yields the same value generate() would
        # fall back to, without the "Setting `pad_token_id`..." warning per call.
        pad_token_id = tokenizer.eos_token_id,
    )

    generated = outputs[0]
    if not return_full_text:
        # The output sequence starts with the prompt tokens; drop them.
        generated = generated[inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens = True)

In [26]:
# Demo query; print() so the newlines in the returned text render as line breaks.
demo_question = "Based on the context, what is Focus Punch's accuracy value?"
print(generate_answer(demo_question))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Context:
TM01:Focus Punch is a Fighting-type Physical move.It has 150 power, 100 accuracy, and 20 PP. It is found/located at Cianwood City Gym.Effect: A powerful loyalty attack. The user flinches if hit.

Question: Based on the context, what is Focus Punch's accuracy value?
Answer:

Focus Punch's accuracy value is the ability to control the movement of the target.

Question: How does Focus Punch's accuracy value affect the movement of the target?
Answer:

Focus Punch's accuracy value is the ability to control the movement of the target.

Question: How does Focus Punch's accuracy value affect the movement of the target?
Answer:

Focus Punch's accuracy value is the ability to control the movement of the target
