# Student Performance Analysis and Prediction

This notebook analyzes student performance data and builds a predictive model for final grades (G3) using linear regression.

## 1. Import Libraries

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn's white-grid theme, figures rendered at 100 dpi.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 100})
print("✓ All libraries imported successfully")

✓ All libraries imported successfully


## 2. Load and Explore Data

In [4]:
# Read the semicolon-delimited student dataset into `df`.
df = pd.read_csv('student-mat.csv', sep=';')

# Report the frame's dimensions, then preview its first rows under a ruled heading.
rule = "=" * 60
print(f"Dataset Shape: {df.shape[0]} rows × {df.shape[1]} columns\n")
print(rule, "First 5 rows:", rule, sep="\n")
df.head()

Dataset Shape: 395 rows × 33 columns

First 5 rows:


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


### 2.1 Dataset Information

In [None]:
rule = "=" * 60

# Column dtypes, non-null counts and memory usage.
print("Dataset Information:", rule, sep="\n")
df.info()

# Per-column null counts; only columns that actually contain nulls are listed.
print("\n" + rule, "Missing Values Check:", rule, sep="\n")
missing_values = df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]
if columns_with_gaps.empty:
    print("✓ No missing values found!")
else:
    print(columns_with_gaps)

### 2.2 Statistical Summary

In [None]:
# Descriptive statistics of the numeric columns, rounded to two decimals.
print("Statistical Summary (Numerical Features):", "=" * 60, sep="\n")
df.describe().round(decimals=2)

## 3. Data Preprocessing

In [None]:
# One-hot encode the categorical columns. drop_first=True leaves out the first
# level of each category, so a k-level column becomes k-1 indicator columns.
df_encoded = pd.get_dummies(df, drop_first=True)

# Compare the column count before and after encoding.
width_before, width_after = df.shape[1], df_encoded.shape[1]
print(f"Original features: {width_before}")
print(f"Features after encoding: {width_after}")
print(f"\n✓ Categorical variables encoded successfully")
print("\nFirst 5 rows after encoding:", "=" * 60, sep="\n")
df_encoded.head()

### 3.1 Feature Standardization

In [None]:
# Columns to z-score. G3 is the prediction target and is deliberately left out.
num_cols = ['age', 'studytime', 'failures', 'absences', 'G1', 'G2']
scaler = StandardScaler()

print("Standardizing numerical features:")
print("="*60)
print(*(f"  • {col}" for col in num_cols), sep="\n")

# The scaler is fitted on every row of df_encoded (this runs before any train/test
# split) and the scaled values overwrite the original columns in place.
df_encoded[num_cols] = scaler.fit(df_encoded[num_cols]).transform(df_encoded[num_cols])
print("\n✓ Numerical features standardized successfully")

## 4. Exploratory Data Analysis (EDA)

### 4.1 Distribution of Final Grades

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Summary statistics are computed once and reused for the markers and the printout.
g3 = df['G3']
g3_mean, g3_median = g3.mean(), g3.median()
axis_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=14, fontweight='bold')

# Left panel: histogram with dashed vertical markers at the mean and the median.
hist_ax.hist(g3, bins=20, color='steelblue', edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Final Grade (G3)', **axis_font)
hist_ax.set_ylabel('Frequency', **axis_font)
hist_ax.set_title('Distribution of Final Grades (G3)', **title_font)
hist_ax.grid(axis='y', alpha=0.3)
hist_ax.axvline(g3_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {g3_mean:.2f}')
hist_ax.axvline(g3_median, color='green', linestyle='--', linewidth=2, label=f'Median: {g3_median:.2f}')
hist_ax.legend()

# Right panel: box plot with a filled box and a highlighted median line.
box = box_ax.boxplot(g3, vert=True, patch_artist=True, widths=0.5)
box_patch, median_line = box['boxes'][0], box['medians'][0]
box_patch.set_facecolor('lightblue')
box_patch.set_edgecolor('black')
median_line.set_color('red')
median_line.set_linewidth(2)
box_ax.set_ylabel('Final Grade (G3)', **axis_font)
box_ax.set_title('Box Plot of Final Grades', **title_font)
box_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Mean: {g3_mean:.2f} | Median: {g3_median:.2f} | Std: {g3.std():.2f}")

### 4.2 Correlation Analysis

In [None]:
# Pairwise correlations across every encoded column (features and target alike).
correlation_matrix = df_encoded.corr()

plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0,
            linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Correlation Heatmap of All Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Show top correlations with G3
print("\nTop 10 Features Correlated with Final Grade (G3):")
print("="*60)
# G3's correlation with itself is removed by label instead of being skipped by
# position, so the ranking no longer relies on that entry sorting first.
g3_signed = correlation_matrix['G3'].drop(labels='G3')
# Rank by absolute strength; the signed value is what gets printed.
g3_corr = g3_signed.abs().sort_values(ascending=False).head(10)
for rank, feature in enumerate(g3_corr.index, start=1):
    print(f"{rank:2d}. {feature:20s} : {g3_signed.loc[feature]:7.4f}")

### 4.3 Key Relationships with Final Grade

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
label_font = dict(fontsize=11, fontweight='bold')

# One entry per panel:
# (axes, x column, x-axis label, title, marker colour, draw a trend line?)
panels = [
    (axes[0, 0], 'absences', 'Absences',
     'Absences vs Final Grade', 'coral', True),
    (axes[0, 1], 'studytime', 'Study Time (1-4 scale)',
     'Study Time vs Final Grade', 'skyblue', False),
    (axes[1, 0], 'G1', 'First Period Grade (G1)',
     'G1 vs G3 - Strong Positive Correlation', 'lightgreen', True),
    (axes[1, 1], 'G2', 'Second Period Grade (G2)',
     'G2 vs G3 - Strong Positive Correlation', 'plum', True),
]

for ax, column, x_label, title, colour, with_trend in panels:
    # Scatter of the chosen column against the final grade.
    ax.scatter(df[column], df['G3'], alpha=0.6, color=colour, edgecolor='black', s=50)
    ax.set_xlabel(x_label, **label_font)
    ax.set_ylabel('Final Grade (G3)', **label_font)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.grid(alpha=0.3)
    if not with_trend:
        continue
    # Degree-1 least-squares fit, drawn as a dashed red line over the points.
    z = np.polyfit(df[column], df['G3'], 1)
    p = np.poly1d(z)
    ax.plot(df[column], p(df[column]), "r--", alpha=0.8, linewidth=2, label='Trend Line')
    ax.legend()

plt.tight_layout()
plt.show()

## 5. Model Training

In [None]:
# Prepare features and target: every encoded column except G3 predicts G3.
X = df_encoded.drop(columns="G3")
y = df_encoded["G3"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"

# Avoid train/test leakage: re-standardize the numeric columns using statistics
# from the training rows only, then apply that same transform to the test rows.
# Z-scoring is unaffected by the earlier whole-dataset scaling of these columns
# (an affine map), so this matches fitting on the raw training data.
train_scaler = StandardScaler().fit(X_train[num_cols])
X_train = X_train.copy()  # work on copies so df_encoded / X stay untouched
X_test = X_test.copy()
X_train[num_cols] = train_scaler.transform(X_train[num_cols])
X_test[num_cols] = train_scaler.transform(X_test[num_cols])

print("Data Split:")
print("="*60)
print(f"Training set: {X_train.shape[0]} samples ({(X_train.shape[0]/len(X))*100:.1f}%)")
print(f"Test set: {X_test.shape[0]} samples ({(X_test.shape[0]/len(X))*100:.1f}%)")
print(f"Number of features: {X_train.shape[1]}")

# Fit ordinary least squares on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

print("\n✓ Model trained successfully!")

## 6. Model Evaluation

In [None]:
# Predict final grades for the held-out rows.
y_pred = model.predict(X_test)

# Test-set metrics; RMSE is the square root of MSE.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

rule = "=" * 60
print("Model Performance Metrics:", rule, sep="\n")
print(f"R² Score:                 {r2:.4f}  ({r2*100:.2f}% variance explained)")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root MSE (RMSE):          {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(rule)

# The first cutoff that r2 strictly exceeds picks the verdict; if it exceeds
# none of them, the fallback message is printed.
verdicts = (
    (0.8, "✓ Excellent model performance!"),
    (0.6, "✓ Good model performance!"),
    (0.4, "⚠ Moderate model performance"),
)
print(next((text for cutoff, text in verdicts if r2 > cutoff),
           "⚠ Model needs improvement"))

### 6.1 Prediction Visualization

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fit_ax, resid_ax = axes
axis_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=14, fontweight='bold')

# Left panel: predictions against ground truth, with the y = x reference line
# drawn across the observed range of actual grades.
grade_span = [y_test.min(), y_test.max()]
fit_ax.scatter(y_test, y_pred, alpha=0.6, color='dodgerblue', edgecolor='black', s=60)
fit_ax.plot(grade_span, grade_span, 'r--', lw=3, label='Perfect Prediction')
fit_ax.set_xlabel('Actual Final Grade (G3)', **axis_font)
fit_ax.set_ylabel('Predicted Final Grade (G3)', **axis_font)
fit_ax.set_title('Actual vs Predicted Final Grades', **title_font)
fit_ax.legend(fontsize=11)
fit_ax.grid(alpha=0.3)
# R² annotation pinned to the top-left corner in axes-relative coordinates.
fit_ax.text(0.05, 0.95, f'R² = {r2:.4f}', transform=fit_ax.transAxes,
            fontsize=12, verticalalignment='top',
            bbox={'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.8})

# Right panel: residuals (actual minus predicted) against the predicted value.
residuals = y_test - y_pred
resid_ax.scatter(y_pred, residuals, alpha=0.6, color='coral', edgecolor='black', s=60)
resid_ax.axhline(y=0, color='red', linestyle='--', linewidth=2, label='Zero Residual')
resid_ax.set_xlabel('Predicted Final Grade (G3)', **axis_font)
resid_ax.set_ylabel('Residuals (Actual - Predicted)', **axis_font)
resid_ax.set_title('Residual Plot', **title_font)
resid_ax.legend(fontsize=11)
resid_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Residual Statistics:")
print(f"  Mean: {np.mean(residuals):.4f} (should be close to 0)")
print(f"  Std:  {np.std(residuals):.4f}")

## 7. Feature Importance Analysis

In [None]:
# Pair each feature name with its fitted linear-regression coefficient.
# Caveat: only the columns listed in num_cols were standardized earlier in the
# notebook; every other column keeps its own scale, so coefficient magnitudes
# are not strictly comparable across all features.
importance = model.coef_
features = X.columns

# Build a table and rank it by absolute coefficient, largest first.
imp_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": importance,
    "Abs_Coefficient": np.abs(importance)
})
imp_df_sorted = imp_df.sort_values(by="Abs_Coefficient", ascending=False)

# Display top 15 features
print("Top 15 Most Important Features:")
print("="*60)
for row in imp_df_sorted.head(15).itertuples(index=False):
    print(f"{row.Feature:25s} : {row.Coefficient:8.4f}")

# Visualize top 10 features: green bars for positive coefficients, red for the rest.
top_10 = imp_df_sorted.head(10)
positions = range(len(top_10))
colors = ['green' if x > 0 else 'red' for x in top_10['Coefficient']]

fig, ax = plt.subplots(figsize=(12, 7))
bars = ax.barh(positions, top_10['Coefficient'], color=colors, edgecolor='black', alpha=0.7)
ax.set_yticks(positions)
ax.set_yticklabels(top_10['Feature'], fontsize=11)
# barh places position 0 at the bottom; flip the axis so the largest-magnitude
# coefficient (rank 1) is the top bar.
ax.invert_yaxis()
ax.set_xlabel('Coefficient Value', fontsize=12, fontweight='bold')
ax.set_title('Top 10 Most Important Features (Linear Regression Coefficients)',
             fontsize=14, fontweight='bold', pad=20)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
ax.grid(axis='x', alpha=0.3)

# Add value labels at the end of each bar, on the outward side.
for bar, val in zip(bars, top_10['Coefficient']):
    ax.text(val, bar.get_y() + bar.get_height()/2, f'{val:.3f}',
            ha='left' if val > 0 else 'right', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n✓ Note: Positive coefficients increase G3, negative coefficients decrease G3")