- Load & understand the dataset

- Clean and prepare data

- Train a prediction model

- Wrap it into a Streamlit interface

In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv(r"C:\Users\yshel\Desktop\DivyaPath-Ai\data\mudule1_student\student_performance.csv")

# Basic info
print(df.shape)

(40000, 7)


In [125]:
df.head()



Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
0,S00001,12.5,,75.0,Yes,Master,Yes
1,S00002,9.3,95.3,60.6,No,High School,No
2,S00003,13.2,,64.0,No,Associate,No
3,S00004,17.6,76.8,62.4,Yes,Bachelor,No
4,S00005,8.8,89.3,72.7,No,Master,No


In [126]:

# Drop ID
df = df.drop(columns=["Student ID"])

# Handle missing values
df["Attendance Rate"] = df["Attendance Rate"].fillna(df["Attendance Rate"].mean())
df["Study Hours per Week"] = df["Study Hours per Week"].fillna(df["Study Hours per Week"].mean())
df["Previous Grades"] = df["Previous Grades"].fillna(df["Previous Grades"].mean())

# Check
df.isnull().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 6 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Study Hours per Week                         40000 non-null  float64
 1   Attendance Rate                              40000 non-null  float64
 2   Previous Grades                              40000 non-null  float64
 3   Participation in Extracurricular Activities  38000 non-null  object 
 4   Parent Education Level                       38000 non-null  object 
 5   Passed                                       38000 non-null  object 
dtypes: float64(3), object(3)
memory usage: 1.8+ MB


In [127]:
# Fill missing categorical values with the most common value (mode)
df["Participation in Extracurricular Activities"] = df["Participation in Extracurricular Activities"].fillna(
    df["Participation in Extracurricular Activities"].mode()[0]
)

df["Parent Education Level"] = df["Parent Education Level"].fillna(
    df["Parent Education Level"].mode()[0]
)

df["Passed"] = df["Passed"].fillna(
    df["Passed"].mode()[0]
)

# Verify
df.isnull().sum()


Study Hours per Week                           0
Attendance Rate                                0
Previous Grades                                0
Participation in Extracurricular Activities    0
Parent Education Level                         0
Passed                                         0
dtype: int64

In [145]:
# create a new column 'Grade' based on 'Previous Grades'

def make_grade(x):
    if x >= 85:
        return "A"
    elif x >= 70:
        return "B"
    elif x >= 55:
        return "C"
    else:
        return "D"

df["Grade"] = df["Previous Grades"].apply(make_grade)

df[["Previous Grades", "Grade"]].head()

df["Grade"] = df["Previous Grades"].apply(make_grade)

# Target
y = df["Grade"]


In [129]:
from sklearn.preprocessing import LabelEncoder

# Separate features and target
X = df.drop(columns=["Grade", "Passed", "Previous Grades"])
y = df["Grade"]


# Encode categorical feature columns
for col in X.select_dtypes(include="object").columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Encode target (Grade)
le_y = LabelEncoder()
y_enc = le_y.fit_transform(y)

# Check shapes
print(X.shape, y_enc.shape)

# Preview
X.head(), y.head()


(40000, 4) (40000,)


(   Study Hours per Week  Attendance Rate  \
 0                  12.5        75.276323   
 1                   9.3        95.300000   
 2                  13.2        75.276323   
 3                  17.6        76.800000   
 4                   8.8        89.300000   
 
    Participation in Extracurricular Activities  Parent Education Level  
 0                                            1                       4  
 1                                            0                       3  
 2                                            0                       0  
 3                                            1                       1  
 4                                            0                       4  ,
 0    B
 1    C
 2    C
 3    C
 4    B
 Name: Grade, dtype: object)

In [130]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [131]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42
)

In [132]:
# Train model
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

In [133]:
# Predict
y_pred = model.predict(X_test)

In [134]:

# Evaluate
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.4125


In [135]:
print("\nClassification Report:\n")
classification_report(y_test, y_pred)


Classification Report:



'              precision    recall  f1-score   support\n\n           0       0.12      0.00      0.00       729\n           1       0.28      0.01      0.02      2067\n           2       0.41      0.99      0.58      3316\n           3       0.38      0.00      0.01      1888\n\n    accuracy                           0.41      8000\n   macro avg       0.30      0.25      0.15      8000\nweighted avg       0.34      0.41      0.25      8000\n'

ValueError: Found input variables with inconsistent numbers of samples: [32000, 40000]

In [137]:
import joblib

joblib.dump(model, r"C:\Users\yshel\Desktop\DivyaPath-Ai\models\student_model.pkl")
joblib.dump(le_passed, r"C:\Users\yshel\Desktop\DivyaPath-Ai\models\grade_encoder.pkl")


['C:\\Users\\yshel\\Desktop\\DivyaPath-Ai\\models\\grade_encoder.pkl']

In [139]:
model = joblib.load("C:\\Users\\yshel\\Desktop\\DivyaPath-Ai\\models\\student_model.pkl")
le_y = joblib.load("C:\\Users\\yshel\\Desktop\\DivyaPath-Ai\\models\\grade_encoder.pkl")

In [121]:
model = joblib.load("C:\\Users\\yshel\\Desktop\\DivyaPath-Ai\\models\\student_model.pkl")
le_y = joblib.load("C:\\Users\\yshel\\Desktop\\DivyaPath-Ai\\models\\grade_encoder.pkl")

In [141]:
input_data=[[1,2,4,4],[1,2,4,6]]
pred = model.predict(input_data)
grade = le_y.inverse_transform(pred)[0]




In [143]:
pred = [2]
grade = le_y.inverse_transform(pred)[0]
print(grade)


ValueError: y contains previously unseen labels: [2]

In [144]:
pred = [2]
grade = le_y.inverse_transform(pred)[0]
print(grade)

ValueError: y contains previously unseen labels: [2]