In [1]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Configuration
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Name of the target column. Every "vs grade" plot reads it, so it is checked
# once per file instead of being assumed to exist.
TARGET_COL = "Final Grade"

# Columns whose univariate distribution is plotted (absent columns are skipped).
FEATURES_TO_PLOT = [
    "total_commits", "sum_lines_added", "sum_lines_deleted",
    "active_days", "last_minute_commits", "total_merge_requests",
    "merged_requests", "review_comments_given", "review_comments_received",
    "issue_participation", "branches_created", "merges_to_main_branch", TARGET_COL
]

# Columns scattered against the target (absent columns are skipped).
FEATURES_VS_GRADE = [
    "total_commits", "sum_lines_added", "active_days",
    "issue_participation", "review_comments_given", "merged_requests"
]

# Loop over the 5 intervals
for i in range(1, 6):
    print("="*80)
    print(f"📁 EDA for Interval {i}")
    print("="*80)

    # Load dataset
    file_path = f"student_features_interval_{i}.csv"
    df = pd.read_csv(file_path)

    # The feature columns are already treated as optional; treat the target the
    # same way so a file without it skips the grade plots instead of raising
    # KeyError halfway through the interval.
    has_target = TARGET_COL in df.columns
    if not has_target:
        print(f"Column '{TARGET_COL}' missing in Interval {i}: skipping grade-based plots.")

    # ------------------------------
    # 1. Check for duplicates (rows identical across every column)
    duplicates = df[df.duplicated()]
    print(f"🔁 Duplicates found in Interval {i}: {len(duplicates)}")
    if not duplicates.empty:
        display(duplicates)

    # ------------------------------
    # 2. Correlation matrix (numeric columns only)
    corr = df.select_dtypes(include='number').corr()

    plt.figure(figsize=(15, 10))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', center=0)
    plt.title(f"🔗 Correlation Matrix - Interval {i}")
    plt.tight_layout()
    plt.show()

    # ------------------------------
    # 3. Distributions
    for feature in FEATURES_TO_PLOT:
        if feature in df.columns:
            plt.figure()
            sns.histplot(df[feature], kde=True, bins=20)
            plt.title(f"📊 Distribution of {feature} - Interval {i}")
            plt.xlabel(feature)
            plt.ylabel("Frequency")
            plt.tight_layout()
            plt.show()

    # ------------------------------
    # 4. Feature vs Grade
    if has_target:
        for feature in FEATURES_VS_GRADE:
            if feature in df.columns:
                plt.figure()
                sns.scatterplot(x=df[feature], y=df[TARGET_COL])
                plt.title(f"📈 {feature} vs {TARGET_COL} - Interval {i}")
                plt.xlabel(feature)
                plt.ylabel(TARGET_COL)
                plt.tight_layout()
                plt.show()

    # ------------------------------
    # 5. Boxplot: Grade by Group
    if has_target and "group_id" in df.columns:
        plt.figure()
        sns.boxplot(x="group_id", y=TARGET_COL, data=df)
        plt.title(f"📦 {TARGET_COL} by Group - Interval {i}")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # ------------------------------
    # 6. Commit Efficiency (new feature)
    if "sum_lines_added" in df.columns and "total_commits" in df.columns:
        # Zero commits become NaN so the ratio is missing rather than +/-inf.
        df["commit_efficiency"] = df["sum_lines_added"] / df["total_commits"].replace(0, np.nan)

        if has_target:
            plt.figure()
            sns.scatterplot(x=df["commit_efficiency"], y=df[TARGET_COL])
            plt.title(f"Commit Efficiency vs {TARGET_COL} - Interval {i}")
            plt.xlabel("Lines Added per Commit")
            plt.ylabel(TARGET_COL)
            plt.tight_layout()
            plt.show()


: 

In [None]:
# Imports
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Selected features: the only columns used as model inputs
selected_features = [
    "total_issues_created", "total_issues_assigned",
    "issues_resolved", "issue_participation"
]

# Collect the interval files (1 to 5).
# NOTE: sorted() is lexicographic, which matches numeric order only while the
# interval numbers are single-digit; `i` below is the position in this list,
# not a number parsed from the file name.
all_files = sorted(glob.glob("student_features_interval_*.csv"))

# One metrics record per (interval, model) pair
results = []

# Loop over intervals
for i, file_path in enumerate(all_files, start=1):
    print(f"\n📁 Interval {i}: {file_path}")

    # 1. Load data, dropping rows that lack any model input or the target
    df = pd.read_csv(file_path)
    df = df.dropna(subset=selected_features + ["Final Grade"])

    # 2. Separate X and y.
    # Only the selected features are modelled, so they are picked up front:
    # standardisation is per-column, hence scaling the other columns added
    # nothing and made the cell fail on any non-numeric leftover column.
    X = df[selected_features]
    y = df["Final Grade"]

    # 3. Train/test split BEFORE scaling. The split depends only on the number
    # of rows and the seed, so the same rows land in each partition as before.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Standardise using statistics from the training rows only; fitting the
    # scaler on the full dataset would leak test-set mean/variance into the
    # preprocessing and bias the reported metrics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 5. Linear regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    lr_preds = lr_model.predict(X_test)

    results.append({
        "Interval": i,
        "Model": "Linear Regression",
        "MSE": mean_squared_error(y_test, lr_preds),
        "MAE": mean_absolute_error(y_test, lr_preds),
        "R2": r2_score(y_test, lr_preds)
    })

    # 6. Neural network (expects inputs transformed by `scaler`)
    nn_model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
    nn_model.fit(X_train, y_train)
    nn_preds = nn_model.predict(X_test)

    results.append({
        "Interval": i,
        "Model": "Neural Network",
        "MSE": mean_squared_error(y_test, nn_preds),
        "MAE": mean_absolute_error(y_test, nn_preds),
        "R2": r2_score(y_test, nn_preds)
    })

# 7. Show results
results_df = pd.DataFrame(results)
display(results_df)



In [3]:
import joblib

# Serialise the neural network to disk.
# `nn_model` is rebound on every pass of the training loop, so this saves
# whichever model the final iteration left behind; the training cell must have
# been run first. The file name hard-codes interval 5 -- assumes the loop ended
# on that interval, TODO confirm. Only the model is written here.
joblib.dump(value=nn_model, filename="NeuralNetwork_interval5.pkl")