### Practice Exercise: Data Preprocessing

In [None]:
# Import data processing and ML preprocessing tools
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load and explore the student performance dataset
df = pd.read_csv('student_performance.csv')
print(f'Dataset:\n {df.head(1)}\n')
print(f'Shape of the Dataset:\n {df.shape}\n')
print(f'Missing values in each columns:\n\n {df.isnull().sum()}')

# A row without a FinalGrade carries no supervision signal. Drop it instead of
# imputing the label: filling the target with the most frequent grade would
# invent a result for that student and bias both training and evaluation.
labelled = df.dropna(subset=['FinalGrade'])

# Features: everything between the ID column (first) and the target (last).
# NOTE(review): this positional slice relies on FinalGrade being the last column.
X = labelled.iloc[:, 1:-1].values

# Target is the final grade we want to predict (kept 2-D: one column per row)
y = labelled[['FinalGrade']].values

# Column positions inside X (i.e. after the ID column has been removed):
#   categorical -> Gender, ParentEducation, InternetAccess, ExtraCurricular
#   numerical   -> Age, StudyHoursPerWeek, Attendance, PreviousGrade
Categorial_col = [1, 2, 5, 6]
integer_col = [0, 3, 4, 7]

# Fill missing feature values: mean for numerical columns, most frequent value
# for categorical columns.
# NOTE(review): the imputers (and the encoder below) are fitted on the full
# dataset before the train/test split performed in a later cell, so test-set
# statistics leak into preprocessing. Fitting them on the training rows only
# would require restructuring the cells.
impute_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, integer_col] = impute_mean.fit_transform(X[:, integer_col])

impute_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, Categorial_col] = impute_frequent.fit_transform(X[:, Categorial_col])

# One-hot encode the categorical columns (one 0/1 indicator column per category);
# the numerical columns pass through unchanged and end up after the encoded ones.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a scipy sparse matrix, which np.array() would wrap in a
# useless 0-d object array instead of converting it.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), Categorial_col)],
    remainder='passthrough',
    sparse_threshold=0,
)

X = np.array(ct.fit_transform(X))

Dataset:
   StudentID   Age Gender  ... ExtraCurricular  PreviousGrade  FinalGrade
0      S001  16.0   Male  ...             Yes           72.0        78.0

[1 rows x 10 columns]

Shape of the Dataset:
 (20, 10)

Missing values in each columns:

 StudentID            0
Age                  3
Gender               1
ParentEducation      0
StudyHoursPerWeek    2
Attendance           3
InternetAccess       2
ExtraCurricular      0
PreviousGrade        1
FinalGrade           1
dtype: int64


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible from run to run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize every feature column to zero mean and unit variance.
# The scaler learns its statistics from the training rows only and then applies
# that same transformation to both partitions, so the test set never influences
# the scaling parameters.
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Show the preprocessed arrays that will be handed to the model:
# scaled training features, scaled test features, and the training targets.
final_report = (
    f'Final training set:\n {X_train}',
    f'\nFinal Test Set:\n {X_test}',
    f'\nFinal y_train Set:\n {y_train}',
)
print(*final_report, sep='\n')

Final training set:
 [[ 1.         -1.          1.4832397  -0.77459667 -0.48038446 -0.37796447
  -0.67419986  0.67419986  1.         -1.          1.39787893  0.38903827
   0.1002373   0.35156626]
 [ 1.         -1.         -0.67419986  1.29099445 -0.48038446 -0.37796447
  -0.67419986  0.67419986  1.         -1.          1.39787893 -1.40365008
  -1.22989575 -1.16118878]
 [-1.          1.          1.4832397  -0.77459667 -0.48038446 -0.37796447
  -0.67419986  0.67419986  1.         -1.          0.06755549 -0.28321986
  -0.58073991  0.09678647]
 [ 1.         -1.          1.4832397  -0.77459667 -0.48038446 -0.37796447
   1.4832397  -1.4832397   1.         -1.         -0.01558973  0.07780765
   0.5011865  -0.0266225 ]
 [ 1.         -1.         -0.67419986 -0.77459667  2.081666   -0.37796447
  -0.67419986  0.67419986  1.         -1.          0.06755549  0.61312431
   0.60937914  0.57847952]
 [ 1.         -1.         -0.67419986  1.29099445 -0.48038446 -0.37796447
   1.4832397  -1.4832397   1. 