In [None]:
import inspect
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# load data
# Project directories, given relative to the notebook's working directory.
data_path, submissions_path = Path('../data'), Path('../submissions')

# Read the train and test CSV files from the data directory.
train_df, test_df = (
    pd.read_csv(data_path.joinpath(csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Display the first rows of the training frame as the cell output.
train_df.head()

In [None]:
# prepare data
target = 'Target'

# Every training column except the row id and the label is used as a model input.
non_feature_columns = {'id', target}
features = [col for col in train_df.columns if col not in non_feature_columns]

# Columns treated as categorical; all remaining features are treated as numeric.
cat_features = [
    'Marital status',
    'Application mode',
    'Course',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]
categorical_lookup = set(cat_features)
num_features = [col for col in features if col not in categorical_lookup]

# Cast the categorical columns to the pandas `category` dtype in both frames.
# NOTE(review): each frame derives its categories from its own values, so the
# category sets of train and test may differ -- confirm this is acceptable for
# any model that later predicts on test_df.
for frame in (train_df, test_df):
    for col in cat_features:
        frame[col] = frame[col].astype('category')

In [None]:
# define the preprocessing pipeline
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode, with rare levels (min_frequency=0.01)
# grouped as "infrequent"; unseen levels are mapped to that group when it exists.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.01)),
])

# Apply each transformer to its own column subset.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

In [None]:
# define the list of classifiers
def _with_preprocessing(classifier):
    """Return a Pipeline that applies the shared `preprocessor` and then `classifier`."""
    return Pipeline(steps=[('preprocessor', preprocessor), ('clf', classifier)])


# Candidate models keyed by a short name; insertion order is the evaluation order.
models = {
    # scikit-learn estimators, each fed through the shared preprocessing step
    'lr': _with_preprocessing(LogisticRegression(class_weight='balanced', random_state=42)),
    'dt': _with_preprocessing(DecisionTreeClassifier(class_weight='balanced', random_state=42)),

    # disabled candidates
    # 'knn': _with_preprocessing(KNeighborsClassifier(n_neighbors=10)),
    # 'svc': _with_preprocessing(SVC(random_state=42)),
    'mlp': _with_preprocessing(MLPClassifier(max_iter=500, hidden_layer_sizes=(16, 16), random_state=42)),

    'rf': _with_preprocessing(RandomForestClassifier(n_estimators=500, class_weight='balanced', random_state=42)),
    'et': _with_preprocessing(ExtraTreesClassifier(n_estimators=500, class_weight='balanced', random_state=42)),

    # gradient-boosting libraries, given the feature frame directly (no shared preprocessor)
    'catb': CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42),
    'lgb': LGBMClassifier(boosting_type='gbdt', verbose=-1, class_weight='balanced', random_state=42),
    'xgb': XGBClassifier(enable_categorical=True, seed=42),
}

In [None]:
# evaluate models via cross validation
# Each model is scored with stratified 5-fold CV; the per-fold test scores are
# averaged into one row per model in `results`.
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn 1.4 renamed cross_validate's `fit_params` argument to `params`
# (the old name was removed in 1.6); use whichever name this installation accepts.
fit_params_kwarg = 'params' if 'params' in inspect.signature(cross_validate).parameters else 'fit_params'

results = []
for name, model in tqdm(models.items()):

    # Extra keyword arguments forwarded to the estimator's `fit`.
    # The original condition tested the string literal 'name' instead of the loop
    # variable, so LightGBM never received its categorical feature list.
    cv_kwargs = {}
    if name == 'lgb':
        cv_kwargs[fit_params_kwarg] = {'categorical_feature': cat_features}

    scores = cross_validate(
        model,
        train_df[features],
        train_df[target],
        cv=cv,
        scoring=metrics,
        n_jobs=-1,
        **cv_kwargs,
    )

    # Keep only the mean of each metric across the folds.
    results.append({
        'model': name,
        'accuracy': scores['test_accuracy'].mean(),
        'precision': scores['test_precision_macro'].mean(),
        'recall': scores['test_recall_macro'].mean(),
        'f1': scores['test_f1_macro'].mean()
    })

In [None]:
# save results
results_df = pd.DataFrame(results)

# Rank the models once by mean CV accuracy (best first); the same frame is
# written to disk and displayed.
ranked_results_df = results_df.sort_values(by='accuracy', ascending=False)

# Create the output directory if it is missing so that to_csv does not fail
# after the cross-validation run has already completed.
output_path = Path('saved/model_selection__cv_metrics.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
ranked_results_df.to_csv(output_path, index=False)

ranked_results_df