# In [None]:
import os
import json
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

def clean_turkish_text(text):
    """Return *text* with all whitespace normalized to single spaces.

    Newlines are first turned into spaces, then every run of whitespace is
    collapsed into one space, and finally leading/trailing whitespace is
    stripped from the result.
    """
    import re  # kept as a function-local import

    single_line = text.replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', single_line)
    return collapsed.strip()

# Fetch the Hugging Face access token from the Kaggle secret store; it is
# looked up under the secret name "token".
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("token")

# Load the tokenizer and the causal language model for MODEL_NAME, passing the
# token to both calls. device_map="auto" and torch_dtype="auto" leave device
# placement and dtype selection to the library.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=hf_token,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=hf_token,
    device_map="auto",
    torch_dtype="auto",
)

# Text-generation pipeline built from the loaded model and tokenizer; the
# rest of the script calls `generator` to produce answers.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Generate one reference answer per project file with the text-generation
# pipeline and write all of them to a single CSV file.
project_folder = "/kaggle/input/bil-471/Projects"
main_projects = ["Projects.json"]
rows = []

for project_file in main_projects:
    # Project name is the part of the file name before the first dot,
    # upper-cased (e.g. "Projects.json" -> "PROJECTS").
    project_name = project_file.split(".")[0].upper()
    file_path = os.path.join(project_folder, project_file)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Only the first entry of data["questions"] is used for this project.
    question = clean_turkish_text(data["questions"][0])
    # The prompt text is sent to the model verbatim and must stay in Turkish.
    # NOTE(review): this is a plain completion-style prompt; the model is an
    # instruction-tuned variant, so a chat template may give better results —
    # confirm before changing.
    prompt = (
        "Aşağıdaki çok maddeli programlama sorularının her birine ayrı ayrı, tam ve doğru bir Java kodu üret. "
        "Her alt sorunun başlığıyla birlikte tam kodunu döndür. Yalnızca kodları döndür, açıklama veya yorum ekleme. "
        "Soru: " + question + "\nCevap:"
    )

    out = generator(
        prompt,
        # Increased to accommodate larger project answers.
        # NOTE(review): prompt tokens plus this budget presumably have to fit
        # in the model's context window — confirm for long questions.
        max_new_tokens=30000,
        # `temperature` only takes effect in sampling mode, so sampling is
        # requested explicitly instead of relying on version-dependent
        # defaults.
        do_sample=True,
        temperature=0.1,
        # Return only the newly generated text. Splitting the full text on
        # "Cevap:" would drop part of the answer whenever the model repeats
        # that marker in its own output.
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id,
    )[0]["generated_text"]
    ref_answer = out.strip()

    # Show a 500-character preview so progress can be checked while running.
    print(f"{project_name} için referans cevap üretildi.\n---\n{ref_answer[:500]}...\n---\n")

    rows.append({
        "project_name": project_name,
        "question": question,
        "reference_answer": ref_answer,
    })

# One row per project: name, cleaned question, generated reference answer.
df = pd.DataFrame(rows)
df.to_csv("/kaggle/working/project_reference_answers.csv", index=False, encoding="utf-8")
