# In [None]:
import os
import json
import hashlib
import re
import pandas as pd
from collections import defaultdict

# Directory whose ``.json`` files are read as lab submission dumps.
lab_folder = "/kaggle/input/student-data-2/Labs"
# Path of the CSV file the feature table (one row per UID) is written to.
output_csv = "lab_features_2.csv"

def hash_code(code_str):
    """Return the hexadecimal MD5 digest of ``code_str``.

    Leading and trailing whitespace is removed before hashing, so two
    submissions that differ only in surrounding whitespace get the same hash.
    The text is encoded as UTF-8.
    """
    normalized = code_str.strip()
    digest = hashlib.md5()
    digest.update(normalized.encode("utf-8"))
    return digest.hexdigest()

def _method_body(code, search_from):
    """Return the brace-delimited block that opens at or after ``search_from``.

    The returned text includes the outer ``{`` and ``}``.  An empty string is
    returned when no ``{`` follows, or when a ``;`` appears first (a
    declaration without a body).  Braces are counted textually; braces inside
    string literals or comments are not treated specially, so this is a
    heuristic rather than a parser.
    """
    open_idx = code.find("{", search_from)
    if open_idx == -1:
        return ""
    if code.find(";", search_from, open_idx) != -1:
        return ""

    depth = 0
    for pos in range(open_idx, len(code)):
        ch = code[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return code[open_idx:pos + 1]
    # Unbalanced braces: use everything after the opening brace.
    return code[open_idx:]


def _has_recursion(code):
    """Return True if any ``public static`` method calls its own name in its body.

    Only definitions of the form ``public static <type> <name>(`` are
    considered.  Overloads that call each other share a name and are therefore
    also reported as recursion.
    """
    for definition in re.finditer(r"public\s+static\s+\w+\s+(\w+)\s*\(", code):
        name = definition.group(1)
        body = _method_body(code, definition.end())
        if re.search(r"\b{}\s*\(".format(re.escape(name)), body):
            return True
    return False


def extract_code_features(code_list):
    """Compute simple regex-based features over a list of Java source strings.

    All strings in ``code_list`` are joined with newlines and analysed as one
    text.  Blank (whitespace-only) lines are ignored for every line-based
    feature.

    Args:
        code_list: iterable of source-code strings; may be empty.

    Returns:
        dict mapping feature name to a number:
        ``total_lines``, ``avg_line_length``, ``max_line_length``,
        ``min_line_length``, ``num_methods``, ``num_comments``,
        ``comment_ratio``, ``used_recursion`` (0/1), ``used_loop`` (0/1),
        ``num_if``, ``num_switch``, ``num_break``, ``num_continue``,
        ``num_variables`` and ``technical_term_count``.  Line statistics and
        ``comment_ratio`` are 0 when there are no non-blank lines.
    """
    full_code = "\n".join(code_list)
    lines = [line for line in full_code.splitlines() if line.strip()]
    line_lengths = [len(line) for line in lines]

    # A line counts as a comment line if it contains a comment opener or looks
    # like the continuation of a block comment (starts with '*').
    comment_lines = sum(
        1
        for line in lines
        if "//" in line or "/*" in line or line.strip().startswith("*")
    )

    # The leading \b keeps words that merely end in a type name (e.g. "print")
    # from being read as a declaration.
    declarations = re.findall(
        r"\b(int|double|String|boolean|char)\s+(\w+)", full_code
    )
    variable_set = {name for _type, name in declarations}

    # Allow optional whitespace before the parenthesis/brace, matching how the
    # if/switch patterns below are written ("for (" is as valid as "for(").
    used_loop = re.search(r"\b(?:for|while)\s*\(|\bdo\s*\{", full_code) is not None

    lowered = full_code.lower()
    technical_terms = ["recursion", "loop", "sort", "array", "method", "function"]

    return {
        "total_lines": len(lines),
        "avg_line_length": sum(line_lengths) / len(lines) if lines else 0,
        "max_line_length": max(line_lengths, default=0),
        "min_line_length": min(line_lengths, default=0),
        "num_methods": len(re.findall(r"(public\s+static\s+\w+.*?\()", full_code)),
        "num_comments": comment_lines,
        "comment_ratio": comment_lines / len(lines) if lines else 0,
        "used_recursion": int(_has_recursion(full_code)),
        "used_loop": int(used_loop),
        "num_if": len(re.findall(r"\bif\s*\(", full_code)),
        "num_switch": len(re.findall(r"\bswitch\s*\(", full_code)),
        "num_break": len(re.findall(r"\bbreak\s*;", full_code)),
        "num_continue": len(re.findall(r"\bcontinue\s*;", full_code)),
        "num_variables": len(variable_set),
        "technical_term_count": sum(lowered.count(term) for term in technical_terms),
    }

# Gather each student's distinct Java submissions from every lab JSON file.
# os.listdir() returns names in arbitrary order, so sort them: this keeps the
# order in which codes and UIDs are first seen stable from run to run.
lab_files = sorted(f for f in os.listdir(lab_folder) if f.endswith(".json"))

# uid -> {hash of stripped code: code}; keying by hash drops submissions of
# the same student that are identical apart from surrounding whitespace.
student_lab_codes = defaultdict(dict)

for file_name in lab_files:
    with open(os.path.join(lab_folder, file_name), "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file is fully parsed at this point, so it need not stay open.
    for answer in data["answers"]:
        uid = answer["id"]
        for k, v in answer.items():
            # Only string values can be hashed and analysed; skip anything
            # else (e.g. a JSON null) instead of failing in hash_code().
            if k.endswith(".java") and isinstance(v, str):
                code_hash = hash_code(v)
                student_lab_codes[uid][code_hash] = v

# Build one feature dict per student from that student's de-duplicated codes.
lab_features = {}

for uid, codes_by_hash in student_lab_codes.items():
    code_list = [*codes_by_hash.values()]
    lab_features[uid] = extract_code_features(code_list)

# Turn the {uid: features} mapping into a table: one row per uid, with the
# uid moved out of the index into a leading "UID" column.
lab_df = pd.DataFrame.from_dict(lab_features, orient="index")
lab_df = lab_df.reset_index().rename(columns={"index": "UID"})

# Show the first rows, then write the table without the positional index.
print(lab_df.head())
lab_df.to_csv(output_csv, index=False)
