In [None]:
import os
import json
import pandas as pd
import re
from collections import defaultdict

# Directory holding the per-assignment answer dumps (LC*.json).
lc_folder = "/kaggle/input/student-data-2/LCs"

# Substrings looked for in each answer when counting technical vocabulary.
TECH_TERMS = ["recursion", "recursive", "method", "stack", "base case", "function", "tail recursion"]

# Regexes that flag an answer as "missing"; applied to the lower-cased,
# stripped answer text.
MISSING_ANS_PATTERNS = [r"\?$", r"^yok$", r"^bilinmiyor$", r"^-$", r"^boş$"]

# Only files named LC*.json inside the folder are considered.
lc_files = [
    entry
    for entry in os.listdir(lc_folder)
    if entry.startswith("LC") and entry.endswith(".json")
]

# Maps a student id to the list of that student's stripped answer strings,
# accumulated across every LC file.
student_lc_texts = defaultdict(list)

for lc_name in lc_files:
    lc_path = os.path.join(lc_folder, lc_name)
    with open(lc_path, "r", encoding="utf-8") as fh:
        data = json.load(fh)

    for answer in data["answers"]:
        uid = answer["id"]
        # Keep string values whose key starts with "cevap" (Turkish for
        # "answer"); non-string values are ignored.
        cleaned = [
            value.strip()
            for key, value in answer.items()
            if key.startswith("cevap") and isinstance(value, str)
        ]
        # Touch the defaultdict only when there is something to add, so a
        # student with no usable answers does not get an empty entry.
        if cleaned:
            student_lc_texts[uid].extend(cleaned)

def extract_lc_features(texts):
    """Summarise a list of free-text answers as a flat dict of features.

    Args:
        texts: List of answer strings. A falsy value (e.g. an empty list)
            produces a dict with every feature set to 0.

    Returns:
        A dict with these keys:
            num_lc_answers: number of answers in ``texts``.
            avg_answer_length: mean character length of the raw answers.
            avg_word_count: mean number of word tokens per answer.
            avg_unique_word_ratio: mean over answers of
                distinct tokens / tokens (0 for an answer with no tokens).
            technical_term_count: summed over answers, how many entries of
                ``TECH_TERMS`` occur as a substring of the lower-cased
                answer. Each term counts at most once per answer.
            missing_answer_ratio: share of answers that are blank or match
                any pattern in ``MISSING_ANS_PATTERNS``.
            num_sentences: total count of ".", "!" and "?" characters.
            max_word_count / min_word_count: largest / smallest per-answer
                token count.
    """
    if not texts:
        return {
            "num_lc_answers": 0,
            "avg_answer_length": 0,
            "avg_word_count": 0,
            "avg_unique_word_ratio": 0,
            "technical_term_count": 0,
            "missing_answer_ratio": 0,
            "num_sentences": 0,
            "max_word_count": 0,
            "min_word_count": 0,
        }

    total_words = []
    total_unique = []
    tech_count = 0
    missing = 0
    total_sentences = 0

    for txt in texts:
        # str.lower() turns the Turkish dotted capital "İ" (U+0130) into
        # "i" followed by U+0307 (combining dot above). That combining mark
        # is not a word character, so "İlk" would tokenize as "i" + "lk",
        # and "BİLİNMİYOR" would never match the "bilinmiyor" pattern.
        # Folding it to a plain "i" first keeps words intact.
        lowered = txt.replace("\u0130", "i").lower()
        stripped = lowered.strip()

        if not stripped or any(re.search(pat, stripped) for pat in MISSING_ANS_PATTERNS):
            missing += 1

        words = re.findall(r"\w+", lowered)
        total_words.append(len(words))
        total_unique.append(len(set(words)))

        # Presence test per term, not an occurrence count.
        tech_count += sum(1 for term in TECH_TERMS if term in lowered)

        # Sentence count is approximated by terminal punctuation marks, so
        # an ellipsis ("...") contributes three.
        total_sentences += txt.count(".") + txt.count("!") + txt.count("?")

    n = len(texts)
    return {
        "num_lc_answers": n,
        "avg_answer_length": sum(len(t) for t in texts) / n,
        "avg_word_count": sum(total_words) / n,
        "avg_unique_word_ratio": sum(u / w if w else 0 for u, w in zip(total_unique, total_words)) / n,
        "technical_term_count": tech_count,
        "missing_answer_ratio": missing / n,
        "num_sentences": total_sentences,
        "max_word_count": max(total_words),
        "min_word_count": min(total_words),
    }

# One feature dict per student id.
lc_features = {
    student_id: extract_lc_features(answers)
    for student_id, answers in student_lc_texts.items()
}

# Rows are students; the id moves out of the index into a "UID" column.
lc_df = (
    pd.DataFrame.from_dict(lc_features, orient="index")
    .reset_index()
    .rename(columns={"index": "UID"})
)

# Preview the first rows, then write the table without the row index.
print(lc_df.head(10))
lc_df.to_csv("lc_features_2.csv", index=False)
