In [89]:
import pandas as pd
# Load the dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='StudentPerformanceFactors.csv')

In [90]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


### Checking and Removing Entries with Missing Values

In [91]:
# Count the missing values in each column.
df.isnull().sum()

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64

In [92]:
# Discard every row that has at least one missing value.
# Note: this rebinds `df`, so the unfiltered frame is no longer reachable under that name.
df = df.dropna(axis=0, how='any')

In [93]:
# Re-count missing values per column after dropping incomplete rows.
df.isna().sum(axis=0)

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
Exam_Score                    0
dtype: int64

### Normalization of Numerical Features

In [94]:
# Separate the target column from the predictor columns.
Y = df['Exam_Score']
X = df.drop(columns='Exam_Score')

# A predictor is selected for scaling when it is stored as int64/float64
# and takes more than five distinct values.
features_to_be_scaled = [
    column
    for column in X.columns
    if (X[column].dtype == 'int64' or X[column].dtype == 'float64')
    and X[column].nunique() > 5
]

features_to_be_scaled

['Hours_Studied',
 'Attendance',
 'Sleep_Hours',
 'Previous_Scores',
 'Tutoring_Sessions',
 'Physical_Activity']

In [95]:
from sklearn.preprocessing import MinMaxScaler

# Fit ONE scaler on all selected columns in a single call. MinMaxScaler rescales
# every column independently, so the resulting values are the same as fitting
# column by column, but the fitted `scaler` now remembers the min/max of every
# scaled feature (a per-column re-fit would leave it holding only the last one),
# which makes scaler.transform / scaler.inverse_transform usable afterwards.
scaler = MinMaxScaler()

# Guard: scikit-learn raises on input with zero feature columns, whereas an
# empty selection should simply leave X untouched.
if features_to_be_scaled:
    X[features_to_be_scaled] = scaler.fit_transform(X[features_to_be_scaled])

### One-Hot Encoding of Categorical Features

In [96]:
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): `encoder` is not used by the encoding step below, which relies
# on pd.get_dummies; confirm whether later cells need it.
encoder = OneHotEncoder()

# Object-dtype columns with at least two distinct values are treated as categorical.
cat_features = [
    name
    for name in X.columns
    if X[name].dtype == 'O' and X[name].nunique() >= 2
]

# Replace each categorical column with one indicator column per category.
X = pd.get_dummies(X, columns=cat_features)

In [97]:
# Put the processed predictors and the target side by side (aligned on the row index).
df_new = pd.concat([X, Y], axis='columns')

In [98]:
# Preview the first five rows of the processed frame.
df_new.head(n=5)

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Parental_Involvement_High,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_High,...,Learning_Disabilities_Yes,Parental_Education_Level_College,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Distance_from_Home_Far,Distance_from_Home_Moderate,Distance_from_Home_Near,Gender_Female,Gender_Male,Exam_Score
0,0.511628,0.6,0.5,0.46,0.0,0.5,False,True,False,True,...,False,False,True,False,False,False,True,False,True,67
1,0.418605,0.1,0.666667,0.18,0.25,0.666667,False,True,False,False,...,False,True,False,False,False,True,False,True,False,61
2,0.534884,0.95,0.5,0.82,0.25,0.666667,False,False,True,False,...,False,False,False,True,False,False,True,False,True,74
3,0.651163,0.725,0.666667,0.96,0.125,0.666667,False,True,False,False,...,False,False,True,False,False,True,False,False,True,71
4,0.418605,0.8,0.333333,0.3,0.375,0.666667,False,False,True,False,...,False,True,False,False,False,False,True,True,False,70
