# Importing the libraries

In [129]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# NOTE(review): Python resolves `model.ipynb` as a submodule named `ipynb`
# inside a package named `model`, not as the notebook file "model.ipynb".
# Unless a notebook import hook or such a package exists elsewhere in the
# project, this line presumably raises ModuleNotFoundError on a fresh kernel.
# TODO confirm; consider moving LinearRegression into a plain .py module.
from model.ipynb import LinearRegression

# Insight into the data

In [130]:
# Read the student performance records from the CSV in the working directory.
df = pd.read_csv('StudentsPerformance.csv')

# Plain-text preview of the first rows (head() defaults to five).
first_rows = df.head()
print(first_rows)

   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


In [131]:
# Report each column's dtype and non-null count.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nData types of each column:")
df.info()


Data types of each column:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None


In [132]:
# Descriptive statistics, shown separately for the categorical (object) columns
# and for the numeric columns that describe() covers by default.
for heading, summary in (
    ("Summary of 'object' columns:", df.describe(include='object')),
    ("Summary of 'non-object' columns:", df.describe()),
):
    print(heading)
    display(summary)

Summary of 'object' columns:


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
count,1000,1000,1000,1000,1000
unique,2,5,6,2,2
top,female,group C,some college,standard,none
freq,518,319,226,645,642


Summary of 'non-object' columns:


Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [133]:
# Tally the null entries per column, then print the tally under its heading.
missing_per_column = df.isnull().sum()
print("Number of missing values in each column:")
print(missing_per_column)

Number of missing values in each column:
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [134]:
# Frequency table for every categorical (object-dtype) column.
categorical_columns = df.select_dtypes(include=['object']).columns
for columns in categorical_columns:
    frequencies = df[columns].value_counts()
    print(f"\nValue counts for column '{columns}':")
    print(frequencies)


Value counts for column 'gender':
gender
female    518
male      482
Name: count, dtype: int64

Value counts for column 'race/ethnicity':
race/ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

Value counts for column 'parental level of education':
parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

Value counts for column 'lunch':
lunch
standard        645
free/reduced    355
Name: count, dtype: int64

Value counts for column 'test preparation course':
test preparation course
none         642
completed    358
Name: count, dtype: int64


# visualisation of the data

# data preprocessing

In [135]:
# Encode the categorical (object-dtype) columns of df in place.
# Columns with exactly two distinct values are label-encoded to integers;
# every other object column is one-hot encoded.
two_category_cols = []
mul_category_cols = []

for columns in df.select_dtypes(include=['object']).columns:
    if df[columns].nunique() == 2:
        two_category_cols.append(columns)
    else:
        mul_category_cols.append(columns)

# One-hot encode the multi-category columns with sklearn's OneHotEncoder.
# drop='first' leaves out the first category of each column as the baseline.
if mul_category_cols:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    encoded_data = ohe.fit_transform(df[mul_category_cols])
    # Reuse df's own index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows (and introduce NaNs) whenever df
    # does not carry the default 0..n-1 index.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=ohe.get_feature_names_out(mul_category_cols),
        index=df.index,
    )
    df = pd.concat([df.drop(columns=mul_category_cols), encoded_df], axis=1)

# Label-encode the binary columns. A separate encoder is fitted per column and
# kept in label_encoders so each column's mapping stays available afterwards
# (a single shared encoder would only remember the last column it was fit on).
# NOTE(review): sklearn documents LabelEncoder for target labels; OrdinalEncoder
# is the feature-side equivalent if this cell is ever reworked.
label_encoders = {}
if two_category_cols:
    for col in two_category_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

print("\nData after encoding:")
print(df.head())



Data after encoding:
   gender  lunch  test preparation course  math score  reading score  \
0       0      1                        1          72             72   
1       0      1                        0          69             90   
2       0      1                        1          90             95   
3       1      0                        1          47             57   
4       1      1                        1          76             78   

   writing score  race/ethnicity_group B  race/ethnicity_group C  \
0             74                     1.0                     0.0   
1             88                     0.0                     1.0   
2             93                     1.0                     0.0   
3             44                     0.0                     0.0   
4             75                     0.0                     1.0   

   race/ethnicity_group D  race/ethnicity_group E  \
0                     0.0                     0.0   
1                     0.0     

In [136]:
# Separate the three exam-score columns (targets) from the remaining columns
# (features).
target_columns = ['math score', 'reading score', 'writing score']
X = df.drop(columns=target_columns)
y = df[target_columns]

