# __Pre-Processing__

### __Import Packages__

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import warnings

from library.sb_utils import save_file
warnings.filterwarnings('ignore')

### __Load Data__

In [9]:
df = pd.read_csv('./data/df_cleaned.csv')
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nationality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,54.954087,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,6.16595,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,54.954087,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,0.000759,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,6.16595,Graduate


In [10]:
# Drop features we won't be using: Displaced, Unemployment rate, Inflation rate, GDP, as well as Target feature, which is put into its own df and we're removing the 'Enrolled' rows
df = df.drop(df[df['Target'] == 'Enrolled'].index)
df_target = pd.DataFrame(df['Target'])
df = df.drop(['Displaced', 'Unemployment rate', 'Inflation rate', 'GDP', 'Target'], axis=1)
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nationality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations)
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0.0,0,0,0,0,0,0.0,0
1,1,15,1,9254,1,1,160.0,1,1,3,...,6,6,14.0,0,0,6,6,6,13.666667,0
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,0,0.0,0,0,6,0,0,0.0,0
3,1,17,2,9773,1,1,122.0,1,38,37,...,8,6,13.428571,0,0,6,10,5,12.4,0
4,2,39,1,8014,0,1,100.0,1,37,38,...,9,5,12.333333,0,0,6,6,6,13.0,0


In [11]:
# Split df into numerical and categorical features
df_numerical = df[['Previous qualification (grade)', 'Admission grade', 'Age at enrollment', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)']]

df_categorical = df.drop(columns=df_numerical.columns)

In [23]:
# Numerical features to be scaled: Previous qualification (grade), Admission grade, Age at enrollment, Curricular units 1st sem (credited)	Curricular units 1st sem (enrolled)	Curricular units 1st sem (evaluations)	Curricular units 1st sem (approved)	Curricular units 1st sem (grade)	Curricular units 1st sem (without evaluations)	Curricular units 2nd sem (credited)	Curricular units 2nd sem (enrolled)	Curricular units 2nd sem (evaluations)	Curricular units 2nd sem (approved)	Curricular units 2nd sem (grade)	Curricular units 2nd sem (without evaluations) 
df_numerical.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations)
0,122.0,127.3,20,0,0,0,0,0.0,0,0,0,0,0,0.0,0
1,160.0,142.5,19,0,6,6,6,14.0,0,0,6,6,6,13.666667,0
2,122.0,124.8,19,0,6,0,0,0.0,0,0,6,0,0,0.0,0
3,122.0,119.6,20,0,6,8,6,13.428571,0,0,6,10,5,12.4,0
4,100.0,141.5,45,0,6,9,5,12.333333,0,0,6,6,6,13.0,0


In [26]:
df_numerical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3630 entries, 0 to 4423
Data columns (total 15 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Previous qualification (grade)                  3630 non-null   float64
 1   Admission grade                                 3630 non-null   float64
 2   Age at enrollment                               3630 non-null   int64  
 3   Curricular units 1st sem (credited)             3630 non-null   int64  
 4   Curricular units 1st sem (enrolled)             3630 non-null   int64  
 5   Curricular units 1st sem (evaluations)          3630 non-null   int64  
 6   Curricular units 1st sem (approved)             3630 non-null   int64  
 7   Curricular units 1st sem (grade)                3630 non-null   float64
 8   Curricular units 1st sem (without evaluations)  3630 non-null   int64  
 9   Curricular units 2nd sem (credited)           

In [13]:
df_categorical.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,International
0,1,17,5,171,1,1,1,19,12,5,9,0,0,1,1,0,0
1,1,15,1,9254,1,1,1,1,3,3,3,0,0,0,1,0,0
2,1,1,5,9070,1,1,1,37,37,9,9,0,0,0,1,0,0
3,1,17,2,9773,1,1,1,38,37,5,3,0,0,1,0,0,0
4,2,39,1,8014,0,1,1,37,38,9,9,0,0,1,0,0,0


### __Scale Numerical Features and Encode for Categorical Features__

In [14]:
# Scale
scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit_transform(df_numerical)
scaled_df = pd.DataFrame(scaled_df, columns=df_numerical.columns)
scaled_df.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations)
0,-0.825034,0.000415,-0.442212,-0.304517,-2.465538,-1.883107,-1.480034,-2.083224,-0.189871,-0.287686,-2.782691,-1.958586,-1.429014,-1.831085,-0.190148
1,2.045805,1.040849,-0.569976,-0.304517,-0.131288,-0.483214,0.373306,0.685217,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.662383,-0.190148
2,-0.825034,-0.170709,-0.569976,-0.304517,-0.131288,-1.883107,-1.480034,-2.083224,-0.189871,-0.287686,-0.131002,-1.958586,-1.429014,-1.831085,-0.190148
3,-0.825034,-0.526647,-0.442212,-0.304517,-0.131288,-0.016583,0.373306,0.572219,-0.189871,-0.287686,-0.131002,0.564362,0.152293,0.431281,-0.190148
4,-2.487099,0.9724,2.751894,-0.304517,-0.131288,0.216733,0.064416,0.355641,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.54075,-0.190148


In [15]:
# One-hot encoding
encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
encoded_df = encoder.fit_transform(df_categorical)
encoded_df = pd.DataFrame(encoded_df.toarray(), columns=encoder.get_feature_names_out(df_categorical.columns))
encoded_df.head()

Unnamed: 0,Marital status_1,Marital status_2,Marital status_3,Marital status_4,Marital status_5,Marital status_6,Application mode_1,Application mode_2,Application mode_5,Application mode_7,...,Debtor_0,Debtor_1,Tuition fees up to date_0,Tuition fees up to date_1,Gender_0,Gender_1,Scholarship holder_0,Scholarship holder_1,International_0,International_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [16]:
df_features = pd.concat([encoded_df, scaled_df], axis=1)
df_features.head()

Unnamed: 0,Marital status_1,Marital status_2,Marital status_3,Marital status_4,Marital status_5,Marital status_6,Application mode_1,Application mode_2,Application mode_5,Application mode_7,...,Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations)
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.883107,-1.480034,-2.083224,-0.189871,-0.287686,-2.782691,-1.958586,-1.429014,-1.831085,-0.190148
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.483214,0.373306,0.685217,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.662383,-0.190148
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-1.883107,-1.480034,-2.083224,-0.189871,-0.287686,-0.131002,-1.958586,-1.429014,-1.831085,-0.190148
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.016583,0.373306,0.572219,-0.189871,-0.287686,-0.131002,0.564362,0.152293,0.431281,-0.190148
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.216733,0.064416,0.355641,-0.189871,-0.287686,-0.131002,-0.444817,0.468555,0.54075,-0.190148


In [17]:
df_target.head()

Unnamed: 0,Target
0,Dropout
1,Graduate
2,Dropout
3,Graduate
4,Graduate


In [18]:
df_target.value_counts()

Target  
Graduate    2209
Dropout     1421
Name: count, dtype: int64

## __Test-Train Split__

In [19]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df_features, df_target, train_size=0.75)

Because some categorical features have many labels, like the occupation columns, it may be better to do target encoding instead. But I will change that if the current implementation has a large effect on the model training in the next notebook.

In [20]:
datapath = './data/'
save_file(df, 'df_cleaned_processed.csv', datapath)

Writing file.  "./data/df_cleaned_processed.csv"


## __Save Data__

In [21]:
datapath = './data/'
save_file(df_features, 'df_features.csv', datapath)
save_file(df_target, 'df_target.csv', datapath)
save_file(Xtrain, 'Xtrain.csv', datapath)
save_file(ytrain, 'ytrain.csv', datapath)
save_file(Xtest, 'Xtest.csv', datapath)
save_file(ytest, 'ytest.csv', datapath)

print('Data Preprocessing and Preparation completed successfully.')

A file already exists with this name.


Please re-run this cell with a new filename.
A file already exists with this name.


Please re-run this cell with a new filename.
A file already exists with this name.


Please re-run this cell with a new filename.
A file already exists with this name.


Please re-run this cell with a new filename.
A file already exists with this name.


Please re-run this cell with a new filename.
A file already exists with this name.


Please re-run this cell with a new filename.
Data Preprocessing and Preparation completed successfully.
