In [None]:
import os
import json
import re
import pandas as pd
from collections import defaultdict
import hashlib

# Folder that is scanned for quiz JSON files (a path under /kaggle/input).
quiz_folder = "/kaggle/input/student-data-2/Quizzes"

# Keep only entries named "Quiz*.json". os.listdir() returns names in
# arbitrary order, so the list is sorted to make the processing order -- and
# with it the row order of anything derived from it -- reproducible.
quiz_files = sorted(
    f for f in os.listdir(quiz_folder)
    if f.startswith("Quiz") and f.endswith(".json")
)

def hash_code(code_str):
    """Return the MD5 hex digest of ``code_str`` ignoring outer whitespace.

    Leading and trailing whitespace is stripped before hashing, so two
    strings that differ only in that respect produce the same digest. The
    text is encoded as UTF-8 before being hashed.

    Args:
        code_str: Text to fingerprint.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    normalized = code_str.strip()
    digest = hashlib.md5(normalized.encode("utf-8"))
    return digest.hexdigest()

def extract_quiz_features(code_list):
    """Compute simple text-based features over a list of source snippets.

    All snippets in ``code_list`` are joined with newlines and measured as a
    single piece of text. Every count is a plain substring or regex
    heuristic; the code is not parsed.

    Args:
        code_list: List of source-code strings.

    Returns:
        A dict with these keys, in this order:
            num_quiz_files: number of entries in ``code_list``.
            total_quiz_lines: number of lines in the joined text.
            avg_quiz_line_length: mean characters per line (0 if no lines).
            num_classes: occurrences of the substring ``"class "``.
            num_methods: regex matches shaped like ``public ... name(...) {``.
            num_comments: lines containing ``//`` or ``/*``, or whose
                stripped text starts with ``*``.
            num_oop_methods: total occurrences of the substrings ``get``,
                ``set``, ``constructor`` and ``hunt``.
            used_array: 1 if ``[]`` or ``ArrayList`` appears, otherwise 0.
    """
    source = "\n".join(code_list)
    source_lines = source.splitlines()
    line_total = len(source_lines)

    # One pass over the lines gathers both the character total and the
    # number of comment-looking lines.
    char_total = 0
    comment_total = 0
    for line in source_lines:
        char_total += len(line)
        if "//" in line or "/*" in line or line.strip().startswith("*"):
            comment_total += 1

    # Guard against division by zero when there is no text at all.
    if line_total:
        mean_line_length = char_total / line_total
    else:
        mean_line_length = 0

    # Substring hits, so e.g. "getX" and "target" both count towards "get".
    oop_hits = 0
    for keyword in ("get", "set", "constructor", "hunt"):
        oop_hits += source.count(keyword)

    has_array = any(marker in source for marker in ("[]", "ArrayList"))

    method_signatures = re.findall(
        r"(public\s+.*?\s+\w+\s*\(.*?\)\s*\{)", source
    )

    features = {}
    features["num_quiz_files"] = len(code_list)
    features["total_quiz_lines"] = line_total
    features["avg_quiz_line_length"] = mean_line_length
    features["num_classes"] = source.count("class ")
    features["num_methods"] = len(method_signatures)
    features["num_comments"] = comment_total
    features["num_oop_methods"] = oop_hits
    features["used_array"] = int(has_array)
    return features

# uid -> {digest of the code: code text}. Keying the inner dict by digest
# keeps one entry per distinct snippet for each student; a later snippet with
# the same digest overwrites the earlier one.
student_quiz_codes = defaultdict(dict)

for file_name in quiz_files:
    # Parse the whole file first; the handle is closed before processing.
    with open(os.path.join(quiz_folder, file_name), "r", encoding="utf-8") as f:
        data = json.load(f)

    for answer in data["answers"]:
        uid = answer["id"]
        for k, v in answer.items():
            # Only entries whose key ends in ".java" are collected.
            if not k.endswith(".java"):
                continue
            h = hash_code(v)
            # Indexing by uid happens only here, so a student without any
            # ".java" entry never gets an (empty) record.
            student_quiz_codes[uid][h] = v

# uid -> feature dict computed from that student's collected snippets.
quiz_features = {}

for uid, codes_by_hash in student_quiz_codes.items():
    code_list = list(codes_by_hash.values())
    quiz_features[uid] = extract_quiz_features(code_list)

# orient="index" turns each uid into a row label; reset_index() then moves
# the labels into a regular column named "index", which is renamed to "UID".
quiz_df = pd.DataFrame.from_dict(quiz_features, orient="index")
quiz_df = quiz_df.reset_index()
quiz_df = quiz_df.rename(columns={"index": "UID"})

# index=False: the positional row index is not written to the file.
quiz_df.to_csv("quiz_features_2.csv", index=False)
