In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('StudentsPerformance.csv')

In [3]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
data.shape

(1000, 8)

In [5]:
data = data.drop(columns=['parental level of education','lunch'])

In [6]:
data.head()

Unnamed: 0,gender,race/ethnicity,test preparation course,math score,reading score,writing score
0,female,group B,none,72,72,74
1,female,group C,completed,69,90,88
2,female,group B,none,90,95,93
3,male,group A,none,47,57,44
4,male,group C,none,76,78,75


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   gender                   1000 non-null   object
 1   race/ethnicity           1000 non-null   object
 2   test preparation course  1000 non-null   object
 3   math score               1000 non-null   int64 
 4   reading score            1000 non-null   int64 
 5   writing score            1000 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 47.0+ KB


In [8]:
data.isnull().sum()

gender                     0
race/ethnicity             0
test preparation course    0
math score                 0
reading score              0
writing score              0
dtype: int64

In [9]:
data.duplicated().sum()

2

In [10]:
data = data.drop_duplicates()

In [11]:
data.head()

Unnamed: 0,gender,race/ethnicity,test preparation course,math score,reading score,writing score
0,female,group B,none,72,72,74
1,female,group C,completed,69,90,88
2,female,group B,none,90,95,93
3,male,group A,none,47,57,44
4,male,group C,none,76,78,75


In [12]:
data.duplicated().sum()

0

In [13]:
data['race/ethnicity'].value_counts()

race/ethnicity
group C    318
group D    262
group B    190
group E    139
group A     89
Name: count, dtype: int64

In [14]:
data['test preparation course'].value_counts()

test preparation course
none         640
completed    358
Name: count, dtype: int64

In [15]:
data['gender'] = data.gender.map({'male':1,'female':0})

In [16]:
data.head()

Unnamed: 0,gender,race/ethnicity,test preparation course,math score,reading score,writing score
0,0,group B,none,72,72,74
1,0,group C,completed,69,90,88
2,0,group B,none,90,95,93
3,1,group A,none,47,57,44
4,1,group C,none,76,78,75


In [17]:
new_column_names = {
    'race/ethnicity': 'Grade',
    'test preparation course': 'test',
    'math score': 'math-score',
    'reading score': 'reading-score',
    'writing score': 'writing-score',
}
data.rename(columns=new_column_names, inplace=True)

In [18]:
data.head()

Unnamed: 0,gender,Grade,test,math-score,reading-score,writing-score
0,0,group B,none,72,72,74
1,0,group C,completed,69,90,88
2,0,group B,none,90,95,93
3,1,group A,none,47,57,44
4,1,group C,none,76,78,75


In [19]:
data['test'] = data.test.map({'completed':1,'none':0})

In [20]:
data.head()

Unnamed: 0,gender,Grade,test,math-score,reading-score,writing-score
0,0,group B,0,72,72,74
1,0,group C,1,69,90,88
2,0,group B,0,90,95,93
3,1,group A,0,47,57,44
4,1,group C,0,76,78,75


In [21]:
data['Grade'] = data['Grade'].map({'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5})

In [22]:
data.head()

Unnamed: 0,gender,Grade,test,math-score,reading-score,writing-score
0,0,2,0,72,72,74
1,0,3,1,69,90,88
2,0,2,0,90,95,93
3,1,1,0,47,57,44
4,1,3,0,76,78,75


In [23]:
data['Grade'].value_counts()

Grade
3    318
4    262
2    190
5    139
1     89
Name: count, dtype: int64

In [24]:
data.isnull().sum()

gender           0
Grade            0
test             0
math-score       0
reading-score    0
writing-score    0
dtype: int64

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 998 entries, 0 to 999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   gender         998 non-null    int64
 1   Grade          998 non-null    int64
 2   test           998 non-null    int64
 3   math-score     998 non-null    int64
 4   reading-score  998 non-null    int64
 5   writing-score  998 non-null    int64
dtypes: int64(6)
memory usage: 54.6 KB


# Divide Data into x and y

In [27]:
x = data.drop(columns=['Grade'])
y = data['Grade']

In [28]:
x.head()

Unnamed: 0,gender,test,math-score,reading-score,writing-score
0,0,0,72,72,74
1,0,1,69,90,88
2,0,0,90,95,93
3,1,0,47,57,44
4,1,0,76,78,75


In [29]:
y.head()

0    2
1    3
2    2
3    1
4    3
Name: Grade, dtype: int64

# Split Data

In [31]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2,random_state=42)

In [32]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [33]:
x_train_new = pd.DataFrame(x_train_scaled, columns=x_train.columns)
x_test_new = pd.DataFrame(x_test_scaled, columns=x_test.columns)

In [34]:
x_train_new.head()

Unnamed: 0,gender,test,math-score,reading-score,writing-score
0,1.051447,-0.737238,-0.752075,-1.004828,-1.433771
1,1.051447,1.356414,2.012408,0.868011,1.29785
2,-0.95107,1.356414,1.946587,2.116571,2.097349
3,-0.95107,-0.737238,-0.686254,-0.796735,-0.301148
4,-0.95107,-0.737238,0.959272,1.284198,1.4311


In [35]:
x_test.head()

Unnamed: 0,gender,test,math-score,reading-score,writing-score
453,1,0,65,58,49
793,1,1,89,84,77
209,0,0,58,61,66
309,0,0,49,57,52
740,1,0,80,73,72


In [36]:
y_train.head()

744    2
286    5
165    3
960    1
493    3
Name: Grade, dtype: int64

In [37]:
y_test.head()

453    3
793    5
209    2
309    4
740    4
Name: Grade, dtype: int64

In [38]:
data['Grade'].value_counts()

Grade
3    318
4    262
2    190
5    139
1     89
Name: count, dtype: int64

# Model Selection

In [40]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

# Decision Tree

In [42]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4], 
    'max_features': [None, 'sqrt', 'log2'] 
}
DT = DecisionTreeClassifier()

grid_search = GridSearchCV(estimator=DT, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train_new, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test_new)
accuracy = accuracy_score(y_test, y_pred)
print(f"Improved Accuracy: {accuracy * 100:.2f}%")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5}
Improved Accuracy: 31.00%


# Logistic Regression

In [44]:
# LR = LogisticRegression()
# LR.fit(x_train_new,y_train)
# y_pred = LR.predict(x_test_new)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy:{accuracy * 100:.2f}%")

Accuracy:35.00%


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],
    'max_iter': [100, 200, 500] 
}
LR = LogisticRegression()
grid_search = GridSearchCV(estimator=LR, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train_new, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test_new)
accuracy = accuracy_score(y_test, y_pred)
print(f"Improved Accuracy: {accuracy * 100:.2f}%")

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Improved Accuracy: 36.50%


# Random Forest

In [47]:
RF = RandomForestClassifier()
RF.fit(x_train_new,y_train)
y_pred = RF.predict(x_test_new)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100: .2f}")

Accuracy:  23.50


In [90]:
import joblib

# Save the trained model to a file
joblib.dump(best_model, 'logistic_regression_model.pkl')
print("Model saved successfully!")

Model saved successfully!
