In [32]:
import pandas as pd
df = pd.read_csv('Student.csv')

### Exploratory Data Analysis

In [5]:
df.head(5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [6]:
df.isnull().sum()

StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df.dtypes

StudentID              int64
Age                    int64
Gender                 int64
Ethnicity              int64
ParentalEducation      int64
StudyTimeWeekly      float64
Absences               int64
Tutoring               int64
ParentalSupport        int64
Extracurricular        int64
Sports                 int64
Music                  int64
Volunteering           int64
GPA                  float64
GradeClass           float64
dtype: object

In [9]:
df.shape

(2392, 15)

In [19]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

anime_colors = ['#FF6F61', '#6B5B95', '#88B04B', '#F7CAC9', '#92A8D1', '#955251', '#B565A7', '#009B77', '#DD4124']

plots_info = [
    ('StudyTimeWeekly', 'GPA', 'GPA vs. Study Time Weekly', 'Study Time Weekly (Hours)', 'GPA'),
    ('Absences', 'GPA', 'GPA vs. Absences', 'Number of Absences', 'GPA'),
    ('ParentalSupport', 'GPA', 'GPA vs. Parental Support', 'Parental Support Level', 'GPA'),
    ('ParentalEducation', 'GPA', 'GPA vs. Parental Education', 'Parental Education Level', 'GPA'),
    ('Extracurricular', 'GPA', 'GPA vs. Extracurricular Activities', 'Extracurricular Activities', 'GPA'),
    ('Sports', 'GPA', 'GPA vs. Sports Participation', 'Sports Participation', 'GPA'),
    ('Music', 'GPA', 'GPA vs. Music Participation', 'Music Participation', 'GPA'),
    ('Volunteering', 'GPA', 'GPA vs. Volunteering', 'Volunteering', 'GPA')
]

fig = make_subplots(rows=4, cols=4, subplot_titles=[info[2] for info in plots_info], 
                    vertical_spacing=0.1, horizontal_spacing=0.1)

for i, (x, y, title, x_label, y_label) in enumerate(plots_info):
    row = i // 4 + 1
    col = i % 4 + 1
    scatter = go.Scatter(x=df[x], y=df[y], mode='markers', marker=dict(color=df['GradeClass'], colorscale=anime_colors, showscale=False))
    fig.add_trace(scatter, row=row, col=col)
    fig.update_xaxes(title_text=x_label, row=row, col=col)
    fig.update_yaxes(title_text=y_label, row=row, col=col)

fig.update_layout(height=1200, width=1200, title_text="Scatter Plots of GPA vs. Various Factors", showlegend=False)

fig.show()

In [24]:
# Scatter Plot: GPA vs. Study Time Weekly
scatter_study_time_fig = px.scatter(df, x='StudyTimeWeekly', y='GPA', color='GradeClass',
                                    color_discrete_sequence=anime_colors,
                                    title='GPA vs. Study Time Weekly',
                                    labels={'StudyTimeWeekly': 'Study Time Weekly (Hours)', 'GPA': 'GPA'})
scatter_study_time_fig.update_layout(plot_bgcolor='rgba(255, 255, 255, 1)', paper_bgcolor='rgba(255, 255, 255, 1)', title_font_size=24)

# Scatter Plot: GPA vs. Absences
scatter_absences_fig = px.scatter(df, x='Absences', y='GPA', color='GradeClass',
                                  color_discrete_sequence=anime_colors,
                                  title='GPA vs. Absences',
                                  labels={'Absences': 'Number of Absences', 'GPA': 'GPA'})
scatter_absences_fig.update_layout(plot_bgcolor='rgba(255, 255, 255, 1)', paper_bgcolor='rgba(255, 255, 255, 1)', title_font_size=24)


# Heatmap: Correlation between variables
correlation = df.corr()
heatmap_fig = go.Figure(data=go.Heatmap(
    z=correlation.values,
    x=correlation.columns,
    y=correlation.columns,
    colorscale=anime_colors,
    zmin=-1, zmax=1
))
heatmap_fig.update_layout(title='Correlation Heatmap', xaxis_nticks=36, title_font_size=24)

heatmap_fig.show()
scatter_study_time_fig.show()
scatter_absences_fig.show()



### PreProcessing and Modelling

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [33]:



features = ['Age', 'StudyTimeWeekly', 'Absences', 'GPA']
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])




In [34]:
X = df.drop(['StudentID', 'GradeClass'], axis=1)
y = df['GradeClass']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



#### Modelling

In [35]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("\n")

Model: Logistic Regression
Accuracy: 0.7453027139874739
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        22
         1.0       0.43      0.47      0.45        49
         2.0       0.60      0.62      0.61        85
         3.0       0.71      0.57      0.63        86
         4.0       0.88      0.98      0.93       237

    accuracy                           0.75       479
   macro avg       0.53      0.53      0.52       479
weighted avg       0.71      0.75      0.73       479

[[  0  15   3   2   2]
 [  4  23  17   1   4]
 [  1  12  53  14   5]
 [  0   2  14  49  21]
 [  0   1   1   3 232]]


Model: Random Forest
Accuracy: 0.9102296450939458
              precision    recall  f1-score   support

         0.0       0.83      0.45      0.59        22
         1.0       0.81      0.86      0.83        49
         2.0       0.92      0.86      0.89        85
         3.0       0.89      0.90      0.89        86
         4.0    

#### Trying improve the accuracy by tuning parameter

In [36]:
from sklearn.model_selection import GridSearchCV

param_grids = {
    "Logistic Regression": {
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs']
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "SVM": {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly']
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    }
}

best_estimators = {}

for name, model in models.items():
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best Parameters for {name}: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy for {name}: {grid_search.best_score_}")
    print("\n")
    
for name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    print(f"Model: {name} (Tuned)")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("\n")

Tuning Logistic Regression...



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Best Parameters for Logistic Regression: {'C': 10, 'solver': 'lbfgs'}
Best Cross-Validation Accuracy for Logistic Regression: 0.7908862247618005


Tuning Random Forest...
Best Parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validation Accuracy for Random Forest: 0.923148743045398


Tuning SVM...
Best Parameters for SVM: {'C': 100, 'kernel': 'linear'}
Best Cross-Validation Accuracy for SVM: 0.86304047680888


Tuning K-Nearest Neighbors...
Best Parameters for K-Nearest Neighbors: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best Cross-Validation Accuracy for K-Nearest Neighbors: 0.7600631553046355


Tuning Gradient Boosting...
Best Parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best Cross-Validation Accuracy for Gradient Boosting: 0.9309912102032726


Model: Logistic Regression (Tuned)
Accuracy: 0.7432150313152401
              precision    recall  f1-score   suppo


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

