# In [None]:
import os
import json
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
from collections import defaultdict
from kaggle_secrets import UserSecretsClient

# Fetch the Hugging Face access token stored under the Kaggle secret "token".
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("token")

# Causal language model used to generate the reference answers.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=hf_token,
    device_map="auto",
    torch_dtype="auto",
)
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Sentence encoder used to embed both reference and student answers.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Folder containing the LC JSON files.
lc_folder = "/kaggle/input/bil-471/LCs"
# os.listdir returns entries in arbitrary order; sort the JSON file names so
# every later pass over lc_files visits them in the same, reproducible order.
lc_files = sorted(f for f in os.listdir(lc_folder) if f.endswith(".json"))

def get_lc_name(filename):
    """Return the upper-cased part of *filename* that precedes its first dot.

    A name without any dot is returned upper-cased in full.
    """
    stem, _, _ = filename.partition(".")
    return stem.upper()

# Unique LC names derived from the file names, in sorted order.
lc_names = sorted({get_lc_name(name) for name in lc_files})

# Gather every distinct question text (whitespace-trimmed) across all LC files.
# Blank questions and questions whose lower-cased text contains "öğrenci" are
# left out.
all_questions = set()
for name in lc_files:
    path = os.path.join(lc_folder, name)
    with open(path, encoding="utf-8") as handle:
        payload = json.load(handle)
    for raw_question in payload["questions"]:
        trimmed = raw_question.strip()
        if not trimmed:
            continue
        if "öğrenci" in raw_question.lower():
            continue
        all_questions.add(trimmed)

# Generate one reference answer per unique question with the language model.
reference_answers = {}
for q in sorted(all_questions):
    prompt = f"Aşağıdaki soruya kısa, açık ve doğru bir yanıt ver:\n\nSoru: {q}\nCevap:"
    # return_full_text=False makes the pipeline return only the newly generated
    # continuation. Splitting the full text on "Cevap:" and keeping the last
    # piece picked the wrong segment whenever the continuation itself contained
    # "Cevap:" (e.g. the model went on to write another question/answer pair).
    # NOTE(review): `temperature` is only applied when sampling is enabled
    # (do_sample=True); unless the model's generation config turns sampling on,
    # this call decodes greedily — confirm which behaviour was intended.
    output = text_generator(
        prompt,
        max_new_tokens=100,
        temperature=0.2,
        return_full_text=False,
    )[0]["generated_text"]
    reference_answers[q] = output.strip()

# Embed each reference answer once; keys are the (trimmed) question texts.
reference_embeddings = {
    q: embedding_model.encode(a) for q, a in reference_answers.items()
}

# Read the "UID" column of the train and test student tables as Python ints
# and merge them into one sorted list of distinct UIDs.
train_path = "/kaggle/input/bil-471/processed_train_student.csv"
test_path = "/kaggle/input/bil-471/processed_test_student.csv"

train_uid_column = pd.read_csv(train_path)["UID"]
train_uids = train_uid_column.astype(int).tolist()

test_uid_column = pd.read_csv(test_path)["UID"]
test_uids = test_uid_column.astype(int).tolist()

ref_uids = sorted({*train_uids, *test_uids})

# lc_uid_scores[lc_name][uid] holds the mean cosine similarity between that
# student's answers in the LC file and the generated reference answers
# (0 when no answer could be scored).
lc_uid_scores = {lc: {} for lc in lc_names}

for file in lc_files:
    lc_name = get_lc_name(file)
    with open(os.path.join(lc_folder, file), encoding="utf-8") as f:
        data = json.load(f)
    questions = data["questions"]
    for answer in data["answers"]:
        uid = int(answer["id"])
        sims = []
        # questions[0] is skipped; questions[i] is paired with key "cevap{i}".
        for i, q in enumerate(questions[1:], start=1):
            cevap_key = f"cevap{i}"
            raw_answer = answer.get(cevap_key)
            if raw_answer is None:
                # Missing key or JSON null: str(None) would otherwise be
                # scored as the literal answer "None".
                continue
            student_answer = str(raw_answer).strip()
            if not student_answer:
                continue
            # Reference embeddings are keyed by the whitespace-trimmed question
            # text, so trim here as well or padded questions never match.
            ref_emb = reference_embeddings.get(q.strip())
            if ref_emb is None:
                # No reference for this question; nothing to compare against,
                # so do not spend time encoding the student answer.
                continue
            try:
                student_emb = embedding_model.encode(student_answer)
                sim = cosine_similarity([student_emb], [ref_emb])[0][0]
            except Exception as exc:
                # Stay best-effort (skip this answer) but report the failure
                # instead of hiding it.
                print(f"Skipping {cevap_key} for UID {uid} in {file}: {exc!r}")
                continue
            sims.append(sim)
        mean_sim = sum(sims) / len(sims) if sims else 0
        lc_uid_scores[lc_name][uid] = mean_sim

# Build one output row per reference UID: the UID, one column per LC with the
# student's score (0 when the student has no score for that LC), and the mean
# of those per-LC values.
result_rows = []
for student_id in map(int, ref_uids):
    per_lc = {name: lc_uid_scores[name].get(student_id, 0) for name in lc_names}
    values = list(per_lc.values())
    if values:
        average = sum(values) / len(values)
    else:
        average = 0
    result_rows.append(
        {"UID": student_id, **per_lc, "lc_similarity_mean": average}
    )

# Write the table to CSV without the index column.
result_df = pd.DataFrame(result_rows)
result_df.to_csv("lc_similarity_full.csv", index=False)

