In [1]:
!pip install -q pandas torch transformers datasets peft accelerate bitsandbytes trl huggingface_hub

In [2]:
import ast
import json
import os
import re
import zipfile
from collections import Counter

import pandas as pd
import torch
from datasets import Dataset
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments,
    BitsAndBytesConfig
)
from trl import SFTTrainer

2026-01-27 12:22:50.569714: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769516570.591543     140 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769516570.597959     140 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769516570.615497     140 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769516570.615515     140 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769516570.615517     140 computation_placer.cc:177] computation placer alr

In [None]:
# Login to Hugging Face.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, `token` is
# None and login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

# Base checkpoint that gets fine-tuned.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
# Directory the fine-tuned model and the tokenizer are saved to.
OUTPUT_DIR = "./cultural-qa-finetuned"

In [4]:
# Read the four CSV splits (train/test x MCQ/SAQ) from the Kaggle input mount.
print("Loading datasets...")
train_mcq = pd.read_csv("/kaggle/input/bsllm-project-data/train_dataset_mcq.csv")
train_saq = pd.read_csv("/kaggle/input/bsllm-project-data/train_dataset_saq.csv")
test_mcq = pd.read_csv("/kaggle/input/bsllm-project-data/test_dataset_mcq.csv")
test_saq = pd.read_csv("/kaggle/input/bsllm-project-data/test_dataset_saq.csv")

# Report the number of rows in every split.
print(f"Train MCQ: {len(train_mcq)} samples")
print(f"Train SAQ: {len(train_saq)} samples")
print(f"Test MCQ: {len(test_mcq)} samples")
print(f"Test SAQ: {len(test_saq)} samples")

# Rows per country in each training file, as plain dicts.
mcq_country_counts = train_mcq['country'].value_counts().to_dict()
saq_country_counts = train_saq['country'].value_counts().to_dict()

print("\nCountry distribution in training data:")
print("MCQ:", mcq_country_counts)
print("SAQ:", saq_country_counts)

Loading datasets...
Train MCQ: 836 samples
Train SAQ: 1333 samples
Test MCQ: 419 samples
Test SAQ: 667 samples

Country distribution in training data:
MCQ: {'China': 219, 'US': 214, 'Iran': 206, 'UK': 197}
SAQ: {'US': 340, 'CN': 339, 'GB': 331, 'IR': 323}


In [5]:
def create_mcq_training_example(row):
    """Build one chat-formatted training example from an MCQ row.

    Args:
        row: Mapping (for example a pandas Series) providing 'prompt',
            'answer_idx' and 'country'.

    Returns:
        dict with two keys: 'text', the full Llama-3 style conversation
        (system turn, user turn, assistant turn), and 'country', copied
        from the row.
    """
    question_block = row['prompt']
    # The assistant turn is a small JSON object carrying the answer index.
    target = json.dumps({"answer_choice": row['answer_idx']})
    country = row['country']

    user_turn = f"""You are an expert in {country} culture and customs. Answer the following multiple choice question accurately.

{question_block}"""

    # Llama-3 chat format
    chat_text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a cultural expert specializing in global customs and practices.<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_turn}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{target}<|eot_id|>"""

    return dict(text=chat_text, country=country)

def create_saq_training_example(row):
    """Convert one SAQ row into a Llama-3 chat-formatted training example.

    Args:
        row: Mapping (for example a pandas Series) providing 'en_question',
            'country' and 'annotations'. 'annotations' is expected to be the
            string form of a Python literal: a list of dicts, each with an
            'en_answers' list of strings.

    Returns:
        dict with 'text' (the full conversation) and 'country', or None when
        the annotations are missing, empty or malformed.
    """
    question = row['en_question']
    country = row['country']

    # The annotations column comes from a data file, so it is parsed with
    # ast.literal_eval (literals only) instead of eval, which would execute
    # arbitrary expressions embedded in the data.
    try:
        annotations = ast.literal_eval(row['annotations'])
        if not annotations:
            return None
        # Target = first English answer of the first annotation, normalised.
        # NOTE(review): nothing here sorts by frequency, so this is the "most
        # common" answer only if the file already lists annotations in that
        # order - confirm against the dataset.
        answer = annotations[0]['en_answers'][0].lower().strip()
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError, AttributeError):
        # Unparseable text, a non-string cell (e.g. NaN), or an unexpected
        # structure: skip the row instead of failing the whole run.
        return None

    instruction = f"""You are an expert in {country} culture and customs. Answer the following question with a concise, specific answer.

Question: {question}"""

    # Llama-3 chat format
    text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a cultural expert specializing in global customs and practices.<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{answer}<|eot_id|>"""

    return {"text": text, "country": country}

# Create training examples
print("\nPreparing training examples...")
mcq_examples = [create_mcq_training_example(row) for _, row in train_mcq.iterrows()]
saq_examples = [create_saq_training_example(row) for _, row in train_saq.iterrows()]
saq_examples = [ex for ex in saq_examples if ex is not None]

# Oversample China and Iran to balance the dataset
china_iran_mcq = [ex for ex in mcq_examples if ex['country'] in ['China', 'IR']]
china_iran_saq = [ex for ex in saq_examples if ex['country'] in ['CN', 'IR']]

# Add oversampled examples (2x for underperforming countries)
all_examples = mcq_examples + saq_examples + china_iran_mcq + china_iran_saq

print(f"Total training examples: {len(all_examples)}")
print(f"  - MCQ: {len(mcq_examples)}")
print(f"  - SAQ: {len(saq_examples)}")
print(f"  - Oversampled China/Iran: {len(china_iran_mcq) + len(china_iran_saq)}")

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(pd.DataFrame(all_examples))


Preparing training examples...
Total training examples: 3004
  - MCQ: 836
  - SAQ: 1304
  - Oversampled China/Iran: 864


In [6]:
!pip install -U bitsandbytes



In [7]:
# ============================================================================
# Configure QLoRA (4-bit quantization + LoRA)
# ============================================================================

print("\nConfiguring QLoRA...")

# 4-bit quantization config - NO bfloat16
# NF4 weights with double quantization; computation runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16  # Use float16 only
)

# Load model with quantization.
# - `dtype` is used instead of `torch_dtype`, which the installed transformers
#   release reports as deprecated.
# - `trust_remote_code` is not passed: the Llama architecture ships with
#   transformers, so no code from the model repository has to be executed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16,  # Force float16
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration: rank-16 adapters on every attention and MLP projection.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load tokenizer; the EOS token doubles as the padding token and padding is
# added on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Training args - NO mixed precision
# Effective batch size per device = 2 x 8 accumulation steps = 16 sequences.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduced batch size
    gradient_accumulation_steps=8,  # Increased accumulation
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",  # Changed optimizer
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    # NO fp16 or bf16
)

`torch_dtype` is deprecated! Use `dtype` instead!



Configuring QLoRA...


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [8]:
print("\nStarting fine-tuning...")

# Supervised fine-tuning on the prepared chat-formatted examples; each
# example's "text" field is used as the training sequence.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    formatting_func=lambda x: x["text"],
)

# Train
trainer.train()

# Save the fine-tuned model together with the tokenizer so that OUTPUT_DIR can
# be loaded on its own later.
print("\nSaving fine-tuned model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# The status message previously contained a mis-encoded check mark.
print(f"✓ Model saved to {OUTPUT_DIR}")


Starting fine-tuning...




Applying formatting function to train dataset:   0%|          | 0/3004 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/3004 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3004 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3004 [00:00<?, ? examples/s]

Step,Training Loss
10,3.9736
20,2.1706
30,1.0193
40,0.7005
50,0.548
60,0.4569
70,0.4369
80,0.368
90,0.3795
100,0.3889



Saving fine-tuned model...
✓ Model saved to ./cultural-qa-finetuned


In [9]:
print("\nLoading fine-tuned model for inference...")

# Reload base model with the same 4-bit quantization config used for training.
# `trust_remote_code` is not passed: the Llama architecture ships with
# transformers, so no code from the model repository has to be executed.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# Load fine-tuned LoRA weights from OUTPUT_DIR on top of the base model.
# PeftModel is imported in the notebook's import cell.
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
model.eval()  # inference mode: disables dropout

print("✓ Fine-tuned model loaded!")


Loading fine-tuned model for inference...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Fine-tuned model loaded!


In [10]:
def generate_answer(prompt, max_new_tokens=150, temperature=0.1):
    """Generate a completion for `prompt` with the fine-tuned model.

    Uses the module-level `tokenizer` and `model`.

    Args:
        prompt: Fully formatted prompt string; it is truncated to 1024 tokens.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value <= 0 selects greedy
            decoding, because sampling requires a strictly positive
            temperature.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off), with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature <= 0:
        # Deterministic decoding; temperature and top_p do not apply here.
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Keep only the tokens produced after the prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return generated

def _extract_mcq_choice(generated):
    """Return the answer letter ('A'-'D') found in one generation, or None."""
    # Preferred form: the JSON object the model was trained to emit.
    json_match = re.search(r'\{[^}]*"answer_choice"[^}]*:[^}]*"([ABCD])"[^}]*\}', generated, re.IGNORECASE)
    if json_match:
        return json_match.group(1).upper()

    # Fallback: a standalone letter. Look for a capital letter in the original
    # text first, so that the article "a" in free text is not read as choice A;
    # only then accept a lower-case letter via the upper-cased text.
    letter_match = re.search(r'\b([ABCD])\b', generated) or re.search(r'\b([ABCD])\b', generated.upper())
    if letter_match:
        return letter_match.group(1)
    return None


def answer_mcq_finetuned(row, n_samples=5):
    """Answer an MCQ row by majority vote over several generations.

    Args:
        row: Mapping (for example a pandas Series) providing 'country' and
            'prompt'.
        n_samples: Number of generations to draw. The first uses temperature
            0.1, the remaining ones 0.7.

    Returns:
        The most frequent extracted letter. On a tie the letter that was seen
        first wins (Counter.most_common keeps first-encountered order), which
        favours the low-temperature sample. Returns 'A' when no generation
        yields a letter.
    """
    country = row['country']
    prompt_text = row['prompt']

    instruction = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a cultural expert specializing in global customs and practices.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an expert in {country} culture and customs. Answer the following multiple choice question accurately.

{prompt_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    answers = []
    for i in range(n_samples):
        temp = 0.7 if i > 0 else 0.1
        generated = generate_answer(instruction, max_new_tokens=50, temperature=temp)

        choice = _extract_mcq_choice(generated)
        if choice is not None:
            answers.append(choice)

    if not answers:
        # Nothing parseable was generated; fall back to a fixed default.
        return 'A'

    # Majority vote
    answer_counts = Counter(answers)
    return answer_counts.most_common(1)[0][0]

def _clean_saq_answer(generated):
    """Reduce raw generated text to a short, lower-cased answer string."""
    # Keep the first line only.
    answer = generated.strip().split('\n')[0].strip()
    # Drop a leading "Answer:", "A:" or "The answer is" prefix.
    answer = re.sub(r'^(Answer:|A:|The answer is:?)\s*', '', answer, flags=re.IGNORECASE)
    # Drop one leading and/or one trailing quote character.
    answer = re.sub(r'^["\']|["\']$', '', answer)

    # Cut at the first sentence-ending period, i.e. one followed by whitespace
    # or the end of the text. Splitting on every '.' truncated answers such as
    # "3.5 million" to "3".
    answer = re.split(r'\.(?=\s|$)', answer, maxsplit=1)[0]

    return answer.lower().strip()


def answer_saq_finetuned(row):
    """Answer a short-answer question row with the fine-tuned model.

    Args:
        row: Mapping (for example a pandas Series) providing 'country' and
            'en_question'.

    Returns:
        The cleaned, lower-cased answer: first line of the generation, without
        an answer prefix or surrounding quote, cut at the first sentence end.
    """
    country = row['country']
    question = row['en_question']

    instruction = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a cultural expert specializing in global customs and practices.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an expert in {country} culture and customs. Answer the following question with a concise, specific answer.

Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    generated = generate_answer(instruction, max_new_tokens=30, temperature=0.1)

    return _clean_saq_answer(generated)

In [11]:
print("\n" + "="*80)
print("Processing MCQ Test Data with Fine-tuned Model")
print("="*80)

# Predict one letter per test row. Progress is counted with enumerate, so it
# does not depend on the DataFrame's index labels being 0..n-1.
mcq_predictions = []
for n_done, (_, row) in enumerate(test_mcq.iterrows(), start=1):
    mcq_predictions.append(answer_mcq_finetuned(row, n_samples=5))

    if n_done % 10 == 0:
        print(f"Processed {n_done}/{len(test_mcq)} MCQ questions")

# Create submission: one boolean column per choice letter.
test_mcq['choice'] = mcq_predictions
mcq_submission = pd.get_dummies(test_mcq['choice'], dtype=bool)

# A letter that was never predicted has no dummy column; add it as all-False.
for col in ['A', 'B', 'C', 'D']:
    if col not in mcq_submission.columns:
        mcq_submission[col] = False

mcq_submission = pd.concat([test_mcq[['MCQID']], mcq_submission[['A', 'B', 'C', 'D']]], axis=1)
mcq_submission.to_csv('mcq_prediction.tsv', sep='\t', index=False)
# The status message previously contained a mis-encoded check mark.
print("✓ MCQ predictions saved!")


Processing MCQ Test Data with Fine-tuned Model
Processed 10/419 MCQ questions
Processed 20/419 MCQ questions
Processed 30/419 MCQ questions
Processed 40/419 MCQ questions
Processed 50/419 MCQ questions
Processed 60/419 MCQ questions
Processed 70/419 MCQ questions
Processed 80/419 MCQ questions
Processed 90/419 MCQ questions
Processed 100/419 MCQ questions
Processed 110/419 MCQ questions
Processed 120/419 MCQ questions
Processed 130/419 MCQ questions
Processed 140/419 MCQ questions
Processed 150/419 MCQ questions
Processed 160/419 MCQ questions
Processed 170/419 MCQ questions
Processed 180/419 MCQ questions
Processed 190/419 MCQ questions
Processed 200/419 MCQ questions
Processed 210/419 MCQ questions
Processed 220/419 MCQ questions
Processed 230/419 MCQ questions
Processed 240/419 MCQ questions
Processed 250/419 MCQ questions
Processed 260/419 MCQ questions
Processed 270/419 MCQ questions
Processed 280/419 MCQ questions
Processed 290/419 MCQ questions
Processed 300/419 MCQ questions
P

In [12]:
print("\n" + "="*80)
print("Processing SAQ Test Data with Fine-tuned Model")
print("="*80)

# Predict one answer per test row. Progress and the preview of the first five
# answers are counted with enumerate, so they do not depend on the DataFrame's
# index labels being 0..n-1.
saq_predictions = []
for n_done, (_, row) in enumerate(test_saq.iterrows(), start=1):
    answer = answer_saq_finetuned(row)
    saq_predictions.append(answer)

    if n_done % 10 == 0:
        print(f"Processed {n_done}/{len(test_saq)} SAQ questions")

    # Show the first five question/answer pairs as a sanity check.
    if n_done <= 5:
        print(f"\nQ: {row['en_question']}")
        print(f"A: {answer}")

# Create submission
test_saq['answer'] = saq_predictions
saq_submission = test_saq[['ID', 'answer']]
saq_submission.to_csv('saq_prediction.tsv', sep='\t', index=False)
# The status message previously contained a mis-encoded check mark.
print("✓ SAQ predictions saved!")


Processing SAQ Test Data with Fine-tuned Model

Q: What is the most popular children's animation that is commonly watched by kids in Iran?
A: mia and dadi

Q: What type of food from UK is typically served in restaurants overseas?
A: fish & chips

Q: What is the most popular food in China among young people?
A: shaxian delicacies

Q: What are the common activities that seniors usually do in parks in US?
A: walk

Q: Which major is considered most conducive to employment in China?
A: computer science
Processed 10/667 SAQ questions
Processed 20/667 SAQ questions
Processed 30/667 SAQ questions
Processed 40/667 SAQ questions
Processed 50/667 SAQ questions
Processed 60/667 SAQ questions
Processed 70/667 SAQ questions
Processed 80/667 SAQ questions
Processed 90/667 SAQ questions
Processed 100/667 SAQ questions
Processed 110/667 SAQ questions
Processed 120/667 SAQ questions
Processed 130/667 SAQ questions
Processed 140/667 SAQ questions
Processed 150/667 SAQ questions
Processed 160/667 SAQ que

In [14]:
print("\n" + "="*80)
print("Creating Submission File")
print("="*80)

# Bundle both prediction files into one archive.
with zipfile.ZipFile('submission_finetuned.zip', 'w') as zipf:
    zipf.write('saq_prediction.tsv')
    zipf.write('mcq_prediction.tsv')

print("✓ Submission file created: submission_finetuned.zip")

# Download files (best effort: only possible when running inside Colab).
# `except Exception` keeps the best-effort behaviour without also swallowing
# KeyboardInterrupt / SystemExit the way a bare `except:` does.
try:
    from google.colab import files
    files.download('submission_finetuned.zip')
    print("\n✓ Submission downloaded!")
except Exception:
    print("\n✓ Submission ready for download!")

print("\n" + "="*80)
print("FINE-TUNING COMPLETE!")
print("="*80)
# NOTE(review): the figures below are hard-coded expectations, not scores
# measured by this notebook.
print("\nExpected improvements:")
print("  MCQ: 0.71 → 0.78-0.82")
print("  SAQ: 0.58 → 0.68-0.73")
print("  China/Iran performance should improve significantly")
print("="*80)


Creating Submission File
✓ Submission file created: submission_finetuned.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✓ Submission downloaded!

FINE-TUNING COMPLETE!

Expected improvements:
  MCQ: 0.71 → 0.78-0.82
  SAQ: 0.58 → 0.68-0.73
  China/Iran performance should improve significantly
