In [None]:
# Install required packages
# These are IPython shell magics ("!" prefix): they only run inside a notebook
# environment (this script also imports google.colab below), not under plain Python.
# fsspec is pinned to an exact version and installed first.
# NOTE(review): presumably pinned for compatibility with `datasets` - confirm.
!pip install -q fsspec==2025.3.0
# Everything else is unpinned, so whatever release is current gets installed.
!pip install -q transformers datasets peft trl accelerate bitsandbytes sentencepiece scikit-learn huggingface_hub

# Login to Hugging Face Hub
import os

from huggingface_hub import login

# Do not paste an access token into the notebook source: it leaks as soon as
# the notebook is shared or committed. Use the HF_TOKEN environment variable
# when it is set; otherwise token=None makes login() ask for it interactively.
login(token=os.environ.get("HF_TOKEN") or None)

# Upload your dataset
import json
from google.colab import files

# files.upload() opens the Colab file picker; the result is keyed by file name.
uploaded = files.upload()
if not uploaded:
    # A cancelled upload would otherwise surface as a bare StopIteration below.
    raise RuntimeError(
        "No file was uploaded; re-run this cell and select the JSONL dataset."
    )

# Only the first uploaded file is used; the cleaning step opens it by this name.
DATA_PATH = next(iter(uploaded))
RAW_PATH = DATA_PATH
CLEANED_PATH = "cleaned_dataset.jsonl"

# Simplified output schema (no cell_area_cm2).
# KEY_MAP translates each human-readable label found in the raw records'
# "output" object into the snake_case field name used in the cleaned dataset.
# NOTE(review): the two "... Polarity Contact Type" labels map to the
# absorber_dopant_* fields - confirm this pairing is intended.
KEY_MAP = dict([
    ("Main Focus of the Research", "research_focus"),
    ("Key Findings", "key_findings"),
    ("Photovoltaic Cell Technology", "device_type"),
    ("Absorber Material", "absorber_material"),
    ("Negative Polarity Contact Type", "absorber_dopant_polarity"),
    ("Positive Polarity Contact Type", "absorber_dopant_material"),
    ("Front Surface Passivation", "front_surface_passivation_material"),
    ("Rear Surface Passivation", "rear_surface_passivation_material"),
    ("Negative Polarity Metallization", "negative_metallization_material"),
    ("Positive Polarity Metallization", "positive_metallization_material"),
    ("Efficiency (%)", "efficiency_percent"),
    ("Short-Circuit Current (A)", "short_circuit_current_a"),
    ("Short-Circuit Current Density (mA/cm²)", "short_circuit_current_density_ma_cm2"),
    ("Open-Circuit Voltage (V)", "open_circuit_voltage_v"),
    ("Fill Factor (%)", "fill_factor_percent"),
])

# Schema field names, in the same order as KEY_MAP's entries.
SCHEMA_KEYS = [*KEY_MAP.values()]

# Clean and normalize your dataset
# Reads RAW_PATH as JSON Lines (one JSON object per line) and writes
# CLEANED_PATH in the same format, where every record has:
#   - "input":  the original value coerced to a string ("N/A" if the key is absent)
#   - "output": exactly the schema fields, each a string ("N/A" if missing or null)
# Both files are opened as UTF-8 explicitly: the labels contain non-ASCII
# characters and the output is written with ensure_ascii=False, so relying on
# the platform default encoding could corrupt or reject the data.
with open(RAW_PATH, "r", encoding="utf-8") as infile, \
        open(CLEANED_PATH, "w", encoding="utf-8") as outfile:
    for line in infile:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        rec = json.loads(line)
        rec["input"] = str(rec.get("input", "N/A"))
        # A missing or null "output" is treated as an empty mapping.
        out_raw = rec.get("output") or {}
        out = {}
        # Walk the label -> field mapping directly instead of reverse-searching
        # KEY_MAP for every schema key.
        for label, schema_key in KEY_MAP.items():
            val = out_raw.get(label)
            # JSON null would otherwise be stringified to the literal "None".
            out[schema_key] = "N/A" if val is None else str(val)
        rec["output"] = out
        outfile.write(json.dumps(rec, ensure_ascii=False) + "\n")

# Prompt template used to build every training example.
# It carries two str.format placeholders: {text} (the article) and
# {format_instructions} (the JSON skeleton defined right after the template).
# NOTE(review): the field list inside the template is hard-coded and lists
# absorber_dopant_material before absorber_dopant_polarity, unlike KEY_MAP's
# order - keep it in sync by hand if the schema changes.
PROMPT_TEMPLATE = """
You are extracting structured data from academic articles on photovoltaic cells.

Instructions:
- Only extract data for the **highest efficiency cell** reported.
- Match the schema exactly as specified (see below).
- For any field that is not available, return the value as "N/A".
- Return a single valid **JSON object** (no markdown or code block formatting).
- Only one entry per article is required.

Schema fields:
- research_focus
- key_findings
- device_type
- absorber_material
- absorber_dopant_material
- absorber_dopant_polarity
- front_surface_passivation_material
- rear_surface_passivation_material
- negative_metallization_material
- positive_metallization_material
- efficiency_percent
- short_circuit_current_a
- short_circuit_current_density_ma_cm2
- open_circuit_voltage_v
- fill_factor_percent

Article:
{text}

Format like:
{format_instructions}
"""

# JSON object with every schema field set to the placeholder "<string>",
# serialized once and reused for all examples.
FORMAT_INSTRUCTIONS = json.dumps(
    dict.fromkeys(SCHEMA_KEYS, "<string>"),
    ensure_ascii=False,
)

# Load the cleaned JSONL file as a Hugging Face dataset
from datasets import load_dataset, Features, Value

# Explicit schema: "input" is a string and "output" is a struct holding one
# string per schema field, so no column types are inferred from the data.
features = Features(
    {
        "input": Value("string"),
        "output": {field: Value("string") for field in SCHEMA_KEYS},
    }
)
raw_ds = load_dataset(
    "json",
    data_files=CLEANED_PATH,
    features=features,
    split="train",
)

def build_prompt(example):
    """Render one dataset record as a single training string.

    Fills PROMPT_TEMPLATE with the record's "input" text and the shared
    FORMAT_INSTRUCTIONS, strips surrounding whitespace, then appends a
    "### Response:" marker followed by the record's "output" serialized
    as JSON (non-ASCII characters kept as-is).

    Returns:
        A dict with a single key, "text", holding the combined string.
    """
    prompt = PROMPT_TEMPLATE.format(
        text=example["input"],
        format_instructions=FORMAT_INSTRUCTIONS,
    ).strip()
    answer = json.dumps(example["output"], ensure_ascii=False)
    return {"text": f"{prompt}\n\n### Response:\n{answer}"}

# Turn every record into a prompt+response string; the original columns are
# dropped so the resulting dataset only has the "text" column.
ds = raw_ds.map(
    build_prompt,
    remove_columns=raw_ds.column_names,
)

# Load Gemma 2B model and apply LoRA configuration
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType

BASE_MODEL = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Fall back to EOS only when the tokenizer has no pad token of its own.
# Overwriting an existing pad token with EOS makes padding-aware collators
# treat real EOS tokens as padding and mask them out of the loss, so the model
# is never trained to stop generating.
# NOTE(review): Gemma's tokenizer is expected to ship a dedicated pad token - confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization is requested through an explicit BitsAndBytesConfig;
# passing load_in_4bit=True straight to from_pretrained() is deprecated.
# All other quantization settings are left at the library defaults.
quant_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Prepare the quantized base model for k-bit (here 4-bit) training.
model = prepare_model_for_kbit_training(model)

# LoRA adapters (rank 8, alpha 16, dropout 0.05) on the attention query and
# value projections only; bias terms are not trained.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)

# Configure and run supervised fine-tuning with TRL's SFTTrainer
from transformers import TrainingArguments
from trl import SFTTrainer

training_args = TrainingArguments(
    # Checkpointing: save every 50 steps, keep at most the 2 newest checkpoints.
    output_dir="./gemma_finetuned_pv",
    save_steps=50,
    save_total_limit=2,
    # Batching: 1 sample per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimisation schedule: 3 epochs, cosine decay after 10 warmup steps.
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    # Mixed precision and logging (no external experiment tracker).
    fp16=True,
    logging_steps=10,
    report_to="none",
)

# NOTE(review): `tokenizer=` and `max_seq_length=` are accepted by older TRL
# releases; trl is installed unpinned, so confirm the installed version still
# takes these keyword arguments.
# NOTE(review): each example is prompt + article + response in one string; with
# a 512-token cap, long articles presumably lose the trailing response to
# truncation - verify against the actual token lengths.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    tokenizer=tokenizer,
    max_seq_length=512,
)
trainer.train()

# Save and export final model
import shutil

FINETUNED_DIR = "gemma_finetuned_pv"
# Write the trained model (via trainer.model) and the tokenizer into one directory.
trainer.model.save_pretrained(FINETUNED_DIR)
tokenizer.save_pretrained(FINETUNED_DIR)

# Build "<FINETUNED_DIR>.zip" containing the directory itself. The archive name
# is derived from FINETUNED_DIR so that changing the constant cannot leave the
# zip and download steps pointing at a stale, hard-coded name.
# NOTE: the training output_dir above is this same directory, so any
# checkpoints saved there end up in the archive as well.
archive_path = shutil.make_archive(
    FINETUNED_DIR,
    "zip",
    root_dir=".",
    base_dir=FINETUNED_DIR,
)
files.download(archive_path)
