In [None]:
import os
import json
import re
import pandas as pd
from collections import defaultdict

# Input JSON path; the loading code below parses it with json.load and
# iterates over its "answers" list.
project_file = "/kaggle/input/student-data-2/Projects/Projects.json"
# Destination CSV path for the per-project feature table.
output_csv = "project_features.csv"

def extract_project_features(code_list):
    """Compute heuristic, regex-based metrics for one project's Java sources.

    All metrics are text-pattern approximations; the code is never parsed, so
    unusual formatting (generic return types, signatures split over several
    lines, keywords inside string literals, ...) can still be miscounted.

    Args:
        code_list: Iterable of strings, each holding the text of one source
            file. The strings are joined with newlines and analysed together.

    Returns:
        dict with the keys ``num_classes``, ``num_methods``,
        ``avg_method_length``, ``used_oop_features`` (0 or 1),
        ``num_imports``, ``num_exceptions``,
        ``cyclomatic_complexity_approx``, ``num_comments``,
        ``comment_ratio`` and ``total_lines`` (non-blank lines only).
        Ratios and averages are 0 when their denominator is 0.
    """
    full_code = "\n".join(code_list)
    lines = [line for line in full_code.splitlines() if line.strip()]
    total_lines = len(lines)

    class_count = len(re.findall(r"\bclass\s+\w+", full_code))

    # "<type> <name>(...) {" with an optional access modifier / "static".
    # The negative lookahead rejects statement keywords in the <type> slot so
    # that "else if (...) {" and "new Foo() {" are not counted as methods.
    method_count = len(re.findall(
        r"(?:public|private|protected)?\s+(?:static\s+)?"
        r"(?!(?:else|new|return|throw)\b)\w+\s+\w+\s*\(.*?\)\s*{",
        full_code,
    ))

    # Split the source at every access-modified signature. The group is
    # non-capturing on purpose: re.split() inserts captured text into its
    # result, which would add a bogus one-line "method" per signature.
    # segments[0] is whatever precedes the first signature (imports, class
    # header), so it is not a method body and is skipped.
    segments = re.split(
        r"\b(?:public|private|protected)\b.*?\(.*?\)\s*{", full_code
    )
    method_lengths = [
        len(segment.splitlines())
        for segment in segments[1:]
        if segment.strip()
    ]

    # Whole-word match so identifiers such as "superb" or "thistle" do not
    # count as use of "super" / "this".
    used_oop = int(bool(re.search(
        r"\b(?:abstract|interface|extends|implements|super|this)\b",
        full_code,
    )))

    num_imports = len(re.findall(r"\bimport\s+.*?;", full_code))
    num_exceptions = len(
        re.findall(r"\btry\s*{|\bcatch\s*\(|\bfinally\s*{", full_code)
    )
    # A line counts as a comment if it contains "//" or "/*" anywhere, or
    # starts with "*" (continuation line of a block comment).
    num_comments = sum(
        1
        for line in lines
        if "//" in line or "/*" in line or line.strip().startswith("*")
    )

    # Number of branching constructs; no +1 base path is added.
    branch_patterns = (r"\bif\s*\(", r"\bfor\s*\(", r"\bwhile\s*\(", r"\bcase\s+")
    cyclomatic_complexity = sum(
        len(re.findall(pattern, full_code)) for pattern in branch_patterns
    )

    return {
        "num_classes": class_count,
        "num_methods": method_count,
        "avg_method_length": (
            sum(method_lengths) / len(method_lengths) if method_lengths else 0
        ),
        "used_oop_features": used_oop,
        "num_imports": num_imports,
        "num_exceptions": num_exceptions,
        "cyclomatic_complexity_approx": cyclomatic_complexity,
        "num_comments": num_comments,
        "comment_ratio": num_comments / total_lines if total_lines else 0,
        "total_lines": total_lines,
    }

# Load the projects JSON document.
with open(project_file, "r", encoding="utf-8") as fh:
    data = json.load(fh)

# Map each answer's id to the metrics computed over its ".java" entries.
project_features = {}

for entry in data["answers"]:
    project_id = entry["id"]
    java_sources = [
        source for filename, source in entry.items() if filename.endswith(".java")
    ]
    project_features[project_id] = extract_project_features(java_sources)

# One row per project; the id moves from the index into a "UID" column.
project_df = (
    pd.DataFrame.from_dict(project_features, orient="index")
    .reset_index()
    .rename(columns={"index": "UID"})
)

print(project_df.head())
project_df.to_csv(output_csv, index=False)
