<a href="https://colab.research.google.com/github/Zuhair0000/Student_Preformance_Prediction/blob/main/student_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [138]:
import pandas as pd
import numpy as np

# **Load Dataset**

In [139]:
# Read the semicolon-separated CSV into a DataFrame.
df = pd.read_csv("student-mat.csv", sep=";")

# Target: the G3 column.
y = df["G3"]

# Features: every column except G1, G2 and G3.
X = df.drop(columns=["G1", "G2", "G3"])

# **Train-Test Split**

In [140]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Data Preprocessing**

In [141]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [142]:
# Columns sent through the categorical pipeline (impute + one-hot encode).
categorical_features = [
    "school",
    "sex",
    "address",
    "famsize",
    "Pstatus",
    "Mjob",
    "Fjob",
    "reason",
    "guardian",
    "schoolsup",
    "famsup",
    "paid",
    "activities",
    "nursery",
    "higher",
    "internet",
    "romantic",
]

# Columns sent through the numerical pipeline (impute + standardise).
numercial_features = [
    "Medu",
    "Fedu",
    "age",
    "traveltime",
    "studytime",
    "failures",
    "famrel",
    "freetime",
    "goout",
    "Dalc",
    "Walc",
    "health",
    "absences",
]

In [143]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# that were not present during fit.
categorical_enc = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical branch: fill gaps with the column mean, then standardise.
numercial_enc = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column list to its branch; the "cat"/"num" names become the
# prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_enc, categorical_features),
        ("num", numercial_enc, numercial_features),
    ]
)

# **Model Training**

### Linear Regression

In [144]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Linear-regression baseline: preprocessing followed by an ordinary
# least-squares model.
#
# Pipeline does not copy the estimators it is given, so handing the same
# `preprocessor` object to several pipelines would make them all share (and
# refit) one transformer instance. clone() gives this pipeline its own
# unfitted copy with identical parameters.
lr = Pipeline([
    ("preprocessing", clone(preprocessor)),
    ("model", LinearRegression())
])

lr.fit(X_train, y_train)

In [145]:
# Predict G3 for the held-out test rows with the fitted linear-regression pipeline.
lr_pred = lr.predict(X_test)

### Random Forest

In [146]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# Random-forest pipeline: preprocessing followed by a 100-tree forest with a
# fixed seed for reproducibility.
#
# Pipeline does not copy the estimators it is given, so handing the same
# `preprocessor` object to several pipelines would make them all share (and
# refit) one transformer instance. clone() gives this pipeline its own
# unfitted copy with identical parameters, so later fits of other pipelines
# cannot alter the preprocessing state held by `rf`.
rf = Pipeline([
    ("preprocessing", clone(preprocessor)),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

rf.fit(X_train, y_train)

In [147]:
# Predict G3 for the held-out test rows with the fitted random-forest pipeline.
rf_pred = rf.predict(X_test)

In [148]:
# Pull the fitted RandomForestRegressor out of the pipeline by its step name.
rf_model = rf.named_steps["model"]

In [149]:
# Importance scores learned by the forest, one per column it was trained on
# (i.e. the preprocessed columns, not the raw DataFrame columns).
rf_importance = rf_model.feature_importances_

In [150]:
# Names of the columns produced by the pipeline's fitted preprocessing step;
# each is prefixed with its transformer name (e.g. "num__absences").
rf_feature_name = rf.named_steps["preprocessing"].get_feature_names_out()

In [151]:
# Pair every preprocessed feature name with its importance score ...
rf_importance_df = pd.DataFrame(
    {"Features": rf_feature_name, "Importance": rf_importance}
)
# ... and rank the rows from most to least important.
rf_importance_df = rf_importance_df.sort_values(by="Importance", ascending=False)

In [152]:
# Show only the ten highest-ranked features rather than the whole table.
rf_importance_df.head(10)

Unnamed: 0,Features,Importance
55,num__absences,0.189024
48,num__failures,0.141866
54,num__health,0.051395
51,num__goout,0.048068
45,num__age,0.039512
47,num__studytime,0.032664
50,num__freetime,0.028443
46,num__traveltime,0.026329
44,num__Fedu,0.026089
53,num__Walc,0.025998


### XGBoost

In [153]:
from sklearn.base import clone
from xgboost import XGBRegressor

# XGBoost pipeline: preprocessing followed by a gradient-boosted regressor
# with a fixed seed for reproducibility.
#
# Pipeline does not copy the estimators it is given, so handing the same
# `preprocessor` object to several pipelines would make them all share (and
# refit) one transformer instance. clone() gives this pipeline its own
# unfitted copy with identical parameters.
xgb = Pipeline([
    ("preprocessing", clone(preprocessor)),
    ("model", XGBRegressor(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="rmse",
        random_state=42))
])

xgb.fit(X_train, y_train)

In [154]:
# Predict G3 for the held-out test rows with the fitted XGBoost pipeline.
xgb_pred = xgb.predict(X_test)

In [155]:
# Pull the fitted XGBRegressor out of the pipeline by its step name.
xgb_model = xgb.named_steps["model"]

In [156]:
# Importance scores learned by the booster, one per column it was trained on
# (i.e. the preprocessed columns, not the raw DataFrame columns).
xgb_importance = xgb_model.feature_importances_

In [157]:
# Names of the columns produced by the pipeline's fitted preprocessing step;
# each is prefixed with its transformer name (e.g. "num__failures").
xgb_feature_names = xgb.named_steps["preprocessing"].get_feature_names_out()

In [158]:
# Pair every preprocessed feature name with its importance score ...
xgb_importance_df = pd.DataFrame(
    {"Features": xgb_feature_names, "Importance": xgb_importance}
)
# ... and rank the rows from most to least important.
xgb_importance_df = xgb_importance_df.sort_values(by="Importance", ascending=False)

In [159]:
# Show only the ten highest-ranked features rather than the whole table.
xgb_importance_df.head(10)

Unnamed: 0,Features,Importance
48,num__failures,0.103044
26,cat__guardian_other,0.045605
37,cat__higher_no,0.041872
55,num__absences,0.037978
27,cat__schoolsup_no,0.032227
28,cat__schoolsup_yes,0.027737
54,num__health,0.026882
8,cat__Pstatus_A,0.024614
43,num__Medu,0.022591
52,num__Dalc,0.022328


# **Evaluation**

In [160]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [161]:
def evaluate_model(y_test, y_pred):
    """Score a set of predictions with three regression metrics.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        dict: ``"MAE"`` (mean absolute error), ``"RMSE"`` (square root of the
        mean squared error) and ``"R2"`` (coefficient of determination).
    """
    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is derived from MSE so it is expressed in the target's own units.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return {"MAE": mae, "RMSE": rmse, "R2": r2}

In [162]:
# MAE / RMSE / R2 of the linear-regression predictions on the test split.
lr_result = evaluate_model(y_test, lr_pred)
lr_result

{'MAE': 3.395260925801919,
 'RMSE': np.float64(4.195680802721378),
 'R2': 0.1414924741119582}

In [163]:
# MAE / RMSE / R2 of the random-forest predictions on the test split.
rf_result = evaluate_model(y_test, rf_pred)
rf_result

{'MAE': 2.9786075949367086,
 'RMSE': np.float64(3.766468479906155),
 'R2': 0.3081564443784579}

In [164]:
# MAE / RMSE / R2 of the XGBoost predictions on the test split.
xgb_result = evaluate_model(y_test, xgb_pred)
xgb_result

{'MAE': 3.095374822616577,
 'RMSE': np.float64(3.852074269926564),
 'R2': 0.27635014057159424}

In [165]:
# Side-by-side comparison: one column per model, one row per metric.
results_df = pd.DataFrame(
    {
        "LinearRegression": lr_result,
        "RandomForest": rf_result,
        "XGBoost": xgb_result,
    }
)

results_df

Unnamed: 0,LinearRegression,RandomForest,XGBoost
MAE,3.395261,2.978608,3.095375
RMSE,4.195681,3.766468,3.852074
R2,0.141492,0.308156,0.27635


In [166]:
import joblib
# Persist the fitted random-forest pipeline (preprocessing step + model) to disk.
# NOTE: joblib files are pickle-based; only load this file from a trusted source.
joblib.dump(rf, "student_performance_prediction.pkl")

['student_performance_prediction.pkl']