In [150]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Load data

In [151]:
# Categorical columns: integer-coded attributes that are cast to the pandas
# 'category' dtype right after the CSV is loaded.
cat_attrs = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance\t',  # the trailing tab is part of the column name in the data
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
    # Not treated as categorical (left commented out):
    # 'Age at enrollment',
    # 'Curricular units 1st sem (grade)',
    # 'Curricular units 2nd sem (grade)',
]

In [152]:
# Per-semester curricular-unit columns that are discarded; of the
# "Curricular units ..." family only the two "(grade)" columns are kept.
dropped_unit_cols = [
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (without evaluations)',
]

# Read the semicolon-separated file, remove the unused columns and cast every
# attribute listed in `cat_attrs` to the 'category' dtype in one chain.
student = (
    pd.read_csv('datasets/student-dropout/data.csv', sep=';')
    .drop(columns=dropped_unit_cols)
    .astype({name: 'category' for name in cat_attrs})
)
# Optional filter, currently disabled: remove rows whose Target is 'Enrolled'.
# student = student[student['Target'] != 'Enrolled']

student

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (grade),Curricular units 2nd sem (grade),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,1,0,20,0,0.000000,0.000000,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,1,0,19,0,14.000000,13.666667,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,1,0,19,0,0.000000,0.000000,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,0,20,0,13.428571,12.400000,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,0,45,0,12.333333,13.000000,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,1,0,19,0,13.600000,12.666667,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,0,18,1,12.000000,11.000000,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,1,30,0,14.912500,13.500000,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,1,20,0,13.800000,12.000000,9.4,-0.8,-3.12,Graduate


In [153]:
# Summarize the frame: column names, non-null counts and dtypes.
student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 27 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Marital status                    4424 non-null   category
 1   Application mode                  4424 non-null   category
 2   Application order                 4424 non-null   category
 3   Course                            4424 non-null   category
 4   Daytime/evening attendance	       4424 non-null   category
 5   Previous qualification            4424 non-null   category
 6   Previous qualification (grade)    4424 non-null   float64 
 7   Nacionality                       4424 non-null   category
 8   Mother's qualification            4424 non-null   category
 9   Father's qualification            4424 non-null   category
 10  Mother's occupation               4424 non-null   category
 11  Father's occupation               4424 non-null   catego

In [154]:
# Number of rows per Target class.
student.Target.value_counts()

Graduate    2209
Dropout     1421
Enrolled     794
Name: Target, dtype: int64

In [155]:
# Disabled: histograms of the frame's columns (50 bins, 20x20 figure).
# student.hist(bins=50, figsize=(20,20))

# Split train and test set

In [156]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 shuffle split keyed on 'Target'. The splitter is
# configured with n_splits=1, so the only (train, test) index pair is taken
# straight from the generator.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idxs, test_idxs = next(strat_split.split(student, student['Target']))
strat_train_set = student.iloc[train_idxs]
strat_test_set = student.iloc[test_idxs]

Compare distribution of train set and test set

In [157]:
# Train set: share of each Target class (class count divided by the number of rows)
strat_train_set.Target.value_counts() / len(strat_train_set)

Graduate    0.499294
Dropout     0.321277
Enrolled    0.179429
Name: Target, dtype: float64

In [158]:
# Test set: share of each Target class (class count divided by the number of rows)
strat_test_set.Target.value_counts() / len(strat_test_set)

Graduate    0.499435
Dropout     0.320904
Enrolled    0.179661
Name: Target, dtype: float64

In [159]:
# Features: every column of the training split except the label.
X_train = strat_train_set.drop(columns="Target")
# Labels: an independent NumPy array holding the Target values.
y_train = strat_train_set["Target"].to_numpy(copy=True)

In [160]:
# Features: every column of the test split except the label.
X_test = strat_test_set.drop(columns='Target')
# Labels: an independent NumPy array holding the Target values.
y_test = strat_test_set['Target'].to_numpy(copy=True)

In [161]:
# Report the shape of each split; labels are printed exactly as written here.
for label, split in (
    ('X_train:', X_train),
    ('y_train:', y_train),
    ('X_test', X_test),
    ('y_test:', y_test),
):
    print(label, split.shape)

X_train: (3539, 26)
y_train: (3539,)
X_test (885, 26)
y_test: (885,)


# Preprocessing

## Categorical attributes

### An example on 'Marital status'

**Notes:** When the discrete values of an attribute have no inherent order, use one-hot encoding. For example, the values of 'Marital status' range from 1 to 6, but each number is only a label for a distinct status; a larger number does not mean "more" of anything.

In [162]:
# Disabled ordinal-encoding variant of the 'Marital status' example, kept for
# comparison; the one-hot encoding in the following cells is what is run.
# marital_stat = X_train[['Marital status']]
# marital_stat.value_counts()

# from sklearn.preprocessing import OrdinalEncoder

# ordinal_encoder = OrdinalEncoder()
# marital_stat_enc = ordinal_encoder.fit_transform(marital_stat)
# print(marital_stat_enc[:10])

# print(ordinal_encoder.categories_)

In [163]:
# One-hot encoding: select 'Marital status' with double brackets so the result
# is a one-column DataFrame (2-D) rather than a Series, as the encoder is fed
# a table of columns.
marital_stat = X_train[['Marital status']]

In [164]:
from sklearn.preprocessing import OneHotEncoder

# Learn the distinct 'Marital status' values first, then encode the same
# column as one indicator column per learned value.
cat_encoder = OneHotEncoder()
cat_encoder.fit(marital_stat)
marital_stat_1hot = cat_encoder.transform(marital_stat)
marital_stat_1hot

<3539x6 sparse matrix of type '<class 'numpy.float64'>'
	with 3539 stored elements in Compressed Sparse Row format>

In [165]:
# Distinct values learned by the encoder, one array per input column.
cat_encoder.categories_

[array([1, 2, 3, 4, 5, 6])]

## Full pipeline

Separate the attributes by type so that each group gets its own preprocessing.

In [166]:
# Unordered (nominal) categorical attributes: these are one-hot encoded.
# The entries mirror `cat_attrs` defined near the top of the notebook.
unordered_cat_attrs = [
    'Marital status', 'Application mode', 'Application order', 'Course',
    'Daytime/evening attendance\t',
    'Previous qualification', 'Nacionality',
    "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation",
    'Displaced', 'Educational special needs', 'Debtor',
    'Tuition fees up to date', 'Gender', 'Scholarship holder',
    'International',
]

# Attributes whose values carry a natural order.
ordered_cat_attrs = [
    'Age at enrollment',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (grade)',
]

# Every remaining column is numerical; the column order of X_train is kept.
grouped_attrs = set(unordered_cat_attrs) | set(ordered_cat_attrs)
num_attrs = [col for col in X_train.columns if col not in grouped_attrs]

In [167]:
# Size of each attribute group; the total should equal the number of feature columns.
n_unordered = len(unordered_cat_attrs)
n_ordered = len(ordered_cat_attrs)
n_numerical = len(num_attrs)

print('Number of:')
print('- Unordered categorial attributes:', n_unordered)
print('- Ordered categorical attributes:', n_ordered)
print('- Numerical attributes:', n_numerical)
print('TOTAL:', n_unordered + n_ordered + n_numerical)

Number of:
- Unordered categorial attributes: 18
- Ordered categorical attributes: 3
- Numerical attributes: 5
TOTAL: 26


Create pipeline

In [168]:
# For unordered categorical attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': a category value that was not present when the
# encoder was fitted is encoded as all zeros for that attribute instead of
# raising an error at transform time (the encoder's default is 'error').
# This matters because the pipeline is fitted on the training split only, and
# rare values of the high-cardinality attributes may occur solely in the test
# split. The output of fitting/transforming the training data is unchanged.
unordered_cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [169]:
# For numerical attributes
from sklearn.preprocessing import StandardScaler

# Single-step pipeline that standardizes each numerical column.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

In [170]:
from sklearn.compose import ColumnTransformer

# Columns routed through the scaler: the numerical attributes followed by the
# ordered attributes.
scaled_attrs = num_attrs + ordered_cat_attrs

# Each entry is (name, transformer, columns it is applied to).
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, scaled_attrs),
    ('unordered_cat', unordered_cat_pipeline, unordered_cat_attrs),
])

# Fit on the training features and transform them in the same call.
X_train_ready = full_pipeline.fit_transform(X_train)

Encode target labels

In [171]:
from sklearn.preprocessing import OrdinalEncoder

# The encoder is fed a 2-D array, so the 1-D label array is reshaped into a
# single column before fitting; codes are stored as int8.
label_column = y_train.reshape(-1, 1)
ordinal_encoder = OrdinalEncoder(dtype=np.int8)
y_train_ready = ordinal_encoder.fit_transform(label_column)
y_train_ready[:10]

array([[2],
       [0],
       [1],
       [2],
       [2],
       [1],
       [0],
       [0],
       [0],
       [0]], dtype=int8)

In [172]:
# Flatten the (n, 1) column of encoded labels back to a 1-D array.
y_train_ready = y_train_ready.ravel()

In [173]:
# Label order learned by the encoder; a label's position in the array is its integer code.
ordinal_encoder.categories_

[array(['Dropout', 'Enrolled', 'Graduate'], dtype=object)]

# Train

In [174]:
from sklearn import tree

# Fixed seed (same value as the train/test split) so that repeated runs build
# the same tree and report the same scores.
decision_tree_clf = tree.DecisionTreeClassifier(random_state=42)
# Direct fit, currently disabled:
# decision_tree_clf.fit(X_train_ready, y_train_ready)

In [175]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the train/test split) so that repeated runs build
# the same forest and report the same scores.
random_forest_clf = RandomForestClassifier(random_state=42)
# Direct fit, currently disabled:
# random_forest_clf.fit(X_train_ready, y_train_ready)

In [177]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the decision tree on the prepared
# training data; one score per fold.
tree_cv_accuracy = cross_val_score(
    estimator=decision_tree_clf,
    X=X_train_ready,
    y=y_train_ready,
    cv=3,
    scoring='accuracy',
)
# Same evaluation for the random forest, currently disabled:
# cross_val_score(random_forest_clf, X_train_ready, y_train_ready, cv=3, scoring='accuracy')
tree_cv_accuracy

array([0.71186441, 0.74491525, 0.73452078])

## Evaluation

In [None]:
# TODO