<a href="https://colab.research.google.com/github/andrepilo/Data-science/blob/main/students_performance_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset from a pinned commit of the GitHub repository
url = "https://raw.githubusercontent.com/andrepilo/Data-science/c8e1f1d1a494117b726a29b6b6ac66004984fe78/StudentsPerformance.csv"
df = pd.read_csv(url)

# Quick look at the first rows of the raw data
print(df.head())

# Preprocessing: normalise column names (trim, lowercase, spaces -> underscores),
# then one-hot encode, dropping the first level of each encoded column
df.columns = df.columns.map(lambda name: name.strip().lower().replace(' ', '_'))
df_encoded = pd.get_dummies(df, drop_first=True)

# Target is the math score; every other encoded column is a feature
y = df_encoded['math_score']
X = df_encoded.drop(columns='math_score')

# Train/test split: 20% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model 1: Linear Regression (fit() returns the fitted estimator itself)
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Model 2: Random Forest Regressor, seeded so the run is repeatable
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

# Metrics
def print_metrics(y_true, y_pred, model_name):
    """Print and return MAE, RMSE and R2 for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label placed at the start of the printed line.

    Returns:
        A ``(mae, rmse, r2)`` tuple.
    """
    scores = (
        mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )
    mae, rmse, r2 = scores
    print(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")
    return scores

# Score both models on the held-out test set (each call also prints one line)
mae_lin, rmse_lin, r2_lin = print_metrics(
    y_true=y_test,
    y_pred=y_pred_lin,
    model_name="Linear Regression",
)
mae_rf, rmse_rf, r2_rf = print_metrics(
    y_true=y_test,
    y_pred=y_pred_rf,
    model_name="Random Forest Regressor",
)

# Plots: distribution of the target (left) and correlation matrix (right)
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
sns.histplot(df['math_score'], kde=True)
plt.title('Distribuzione Math Score')

plt.subplot(1,2,2)
# df holds non-numeric columns as well; since pandas 2.0 DataFrame.corr()
# no longer drops them silently and raises on strings, so compute the
# correlation on the numeric columns only.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Matrice di correlazione')
plt.show()

# Random Forest feature importances, drawn in ascending order of importance
features = X.columns
importances = rf_reg.feature_importances_
indices = importances.argsort()

plt.figure(figsize=(10, 6))
plt.barh(y=features[indices], width=importances[indices])
plt.title("Feature Importances - Random Forest")
plt.show()

# Scatter plot of true vs. predicted values for both models
plt.figure(figsize=(10, 5))
plt.scatter(x=y_test, y=y_pred_lin, alpha=0.7, label='Linear Regression')
plt.scatter(x=y_test, y=y_pred_rf, alpha=0.7, label='Random Forest')
# Dashed black y = x reference line: points on it are perfect predictions
plt.plot([0, 100], [0, 100], color='k', linestyle='--')
plt.title("Valori reali vs predetti")
plt.xlabel("Valori reali")
plt.ylabel("Valori predetti")
plt.legend()
plt.show()
