In [1]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Location of the raw student-performance CSV.
# The default is the original author's local path, so existing runs behave the
# same; set the STUDENT_DATA_CSV environment variable to point the notebook at
# the file on any other machine instead of editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "STUDENT_DATA_CSV",
        r"C:\Users\yshel\Desktop\DivyaPath-Ai\data\mudule1_student\student_performance.csv",
    )
)

df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
0,S00001,12.5,,75.0,Yes,Master,Yes
1,S00002,9.3,95.3,60.6,No,High School,No
2,S00003,13.2,,64.0,No,Associate,No
3,S00004,17.6,76.8,62.4,Yes,Bachelor,No
4,S00005,8.8,89.3,72.7,No,Master,No


In [3]:
# Short snake_case names, assigned positionally: the i-th name replaces the
# i-th column of the loaded frame, so the order here must match the CSV.
renamed_columns = (
    "student_id",
    "study_hours",
    "attendance",
    "previous_grade",
    "extra",
    "parent_edu",
    "passed",
)
df.columns = list(renamed_columns)

# Summarise dtypes and non-null counts to see how much data is missing.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   student_id      40000 non-null  object 
 1   study_hours     38005 non-null  float64
 2   attendance      38008 non-null  float64
 3   previous_grade  38006 non-null  float64
 4   extra           38000 non-null  object 
 5   parent_edu      38000 non-null  object 
 6   passed          38000 non-null  object 
dtypes: float64(3), object(4)
memory usage: 2.1+ MB


In [4]:
# Impute missing values column by column.
#
# The filled Series is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on a
# selected column (chained assignment), which emits a FutureWarning on
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.

# Numeric columns: replace NaNs with the column median.
for col in ["study_hours", "attendance", "previous_grade"]:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: replace NaNs with the most frequent value.
# `mode()` returns the modes in sorted order, so `[0]` picks the first one on ties.
for col in ["extra", "parent_edu", "passed"]:
    df[col] = df[col].fillna(df[col].mode()[0])

# Per-column count of remaining NaNs (displayed as the cell output).
df.isna().sum()


student_id        0
study_hours       0
attendance        0
previous_grade    0
extra             0
parent_edu        0
passed            0
dtype: int64

In [5]:
def make_grade(x):
    """Map a numeric score to a letter grade.

    Cut-offs are inclusive lower bounds: >= 85 is "A", >= 70 is "B",
    >= 55 is "C", and anything else is "D". A value that fails every
    comparison (for example NaN) therefore also ends up as "D".
    """
    # Ordered from the highest cut-off down; the first one reached wins.
    cutoffs = ((85, "A"), (70, "B"), (55, "C"))
    for lower_bound, letter in cutoffs:
        if x >= lower_bound:
            return letter
    return "D"

# Derive the letter-grade column by running make_grade on every score.
grade_letters = df["previous_grade"].map(make_grade)
df["Grade"] = grade_letters

# Show scores next to the derived letters for a quick visual check.
df[["previous_grade", "Grade"]].head()


Unnamed: 0,previous_grade,Grade
0,75.0,B
1,60.6,C
2,64.0,C
3,62.4,C
4,72.7,B


In [6]:
# Fit one LabelEncoder per column first, then transform. Each encoder is kept
# in its own variable because all three are saved alongside the model later on.
extra_enc = LabelEncoder().fit(df["extra"])
parent_enc = LabelEncoder().fit(df["parent_edu"])
grade_enc = LabelEncoder().fit(df["Grade"])

# Integer-coded copies of the two categorical features.
df["extra_enc"] = extra_enc.transform(df["extra"])
df["parent_enc"] = parent_enc.transform(df["parent_edu"])

# Integer-coded target: the letter grade.
y = grade_enc.transform(df["Grade"])


In [7]:
# Feature matrix: three numeric inputs plus the two label-encoded categoricals.
# NOTE(review): the "Grade" target is produced by bucketing "previous_grade",
# and "previous_grade" is also a feature here, so the label is a deterministic
# function of one input (target leakage). A perfect evaluation score is the
# expected consequence -- confirm this is intended before relying on the model.
X = df.loc[:, ["study_hours", "attendance", "previous_grade", "extra_enc", "parent_enc"]]


In [8]:
# Hold out 20% of the rows for evaluation.
# `stratify=y` keeps the grade-class proportions the same in both partitions;
# the classes are imbalanced, so an unstratified split can skew the rarer
# grades between train and test. `random_state` keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [9]:
# Hyperparameters for the gradient-boosted classifier, gathered in one place.
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "random_state": 42,  # fixed seed so repeated runs build the same model
}
model = GradientBoostingClassifier(**gb_params)

# Fit on the training partition; left as the trailing expression so the
# notebook displays the fitted estimator.
model.fit(X_train, y_train)


In [10]:
# Predict on the held-out rows and report overall and per-class metrics.
# NOTE(review): the target is bucketed from "previous_grade", which is also a
# model input, so these scores mostly reflect that the model recovered the
# bucketing rule -- confirm this is the intended evaluation.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=grade_enc.classes_)

print("Accuracy:", test_accuracy)
print(report)


Accuracy: 1.0
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       729
           B       1.00      1.00      1.00      2067
           C       1.00      1.00      1.00      3316
           D       1.00      1.00      1.00      1888

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000



In [11]:
# Directory that receives the trained model and its encoders.
# The default is the original author's local path, so existing runs write to
# the same place; set the STUDENT_MODEL_DIR environment variable to redirect
# the output on another machine instead of editing this cell.
MODEL_DIR = Path(
    os.environ.get(
        "STUDENT_MODEL_DIR",
        r"C:\Users\yshel\Desktop\DivyaPath-Ai\models",
    )
)
# Create the directory if it is missing so the dumps below cannot fail on it.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# File name -> object to persist. The encoders are saved with the model
# because they are needed to encode new inputs and decode predictions.
artifacts = {
    "student_model_v2.pkl": model,
    "grade_encoder_v2.pkl": grade_enc,
    "extra_encoder_v2.pkl": extra_enc,
    "parent_encoder_v2.pkl": parent_enc,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, MODEL_DIR / filename)

print("New Grade Model Saved Successfully")


New Grade Model Saved Successfully
