# Compare several models

## Setup

In [1]:
# Record the operating system and interpreter version this notebook ran on.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
import os
import numpy as np
import pandas as pd

from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

In [3]:
# Read the prepared train and test tables from ../data/final.
train_path = '../data/final/train.csv'
test_path = '../data/final/test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Preview five randomly drawn training rows.
train.sample(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_count,Cabin_deck,Cabin_side
5369,Earth,True,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,True,1.0,E,P
4651,Mars,True,TRAPPIST-1e,42.0,False,0.0,0.0,0.0,0.0,0.0,True,1.0,F,S
7786,,False,55 Cancri e,29.0,True,0.0,2949.0,0.0,2.0,654.0,True,2.0,D,S
7207,Earth,False,55 Cancri e,0.0,False,0.0,0.0,0.0,0.0,0.0,True,1.0,G,P
6748,Earth,False,TRAPPIST-1e,20.0,False,415.0,479.0,0.0,1019.0,239.0,False,1.0,F,P


In [4]:
# Name of the label column; every other column of `train` is treated as a feature.
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

# Split the feature columns by dtype: numeric vs. everything else.
feature_frame = train[FEATURES]
numerical = feature_frame.select_dtypes(include=np.number).columns
categorical = feature_frame.select_dtypes(exclude=np.number).columns

# Apply the same casts to both tables: numeric features become float,
# all remaining features become str.
for frame in (train, test):
    frame[numerical] = frame[numerical].astype(float)
    frame[categorical] = frame[categorical].astype(str)

# Summarise the resulting setup.
print(f'Target: {TARGET}')
print(f'Fetaures:\n\tnumerical: {numerical.to_list()}\n\tcategorical:{categorical.to_list()}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')

Target: Transported
Fetaures:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group_count']
	categorical:['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_deck', 'Cabin_side']
Shapes:
	train: (8693, 14)
	test: (4277, 13)


## Models

In [5]:
# Hold out part of the training table for validation: 80% goes to fitting,
# the remainder to (x_val, y_val). The split is seeded for repeatability.
# Original author's note: lazypredict should have preprocessing but it doesn't
# work, so the feature columns are passed as prepared above.
features_frame = train[FEATURES]
labels = train[TARGET].astype(int)  # boolean-like target cast to integer labels

x, x_val, y, y_val = train_test_split(
    features_frame,
    labels,
    train_size=0.8,
    random_state=42,
)

In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import RidgeClassifierCV, LogisticRegressionCV, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.svm import LinearSVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Candidate estimators, grouped by family. Order is kept as-is.
classifiers = [
    # baseline
    DummyClassifier,
    # linear models
    RidgeClassifierCV,
    LogisticRegressionCV,
    Perceptron,
    # support vector machines
    LinearSVC,
    NuSVC,
    # discriminant analysis
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
    # trees and ensembles
    DecisionTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    HistGradientBoostingClassifier,
    # gradient boosting libraries
    XGBClassifier,
    LGBMClassifier,
    # neighbours and neural network
    KNeighborsClassifier,
    MLPClassifier,
]

# Many classifiers with default parameters (per the original note,
# preprocessing steps are included by LazyClassifier).
lazy_options = dict(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=True,
    random_state=42,
    classifiers=classifiers,  # alternative: 'all'
)
clf = LazyClassifier(**lazy_options)

# `fit` yields the metrics table plus the per-model predictions;
# `provide_models` yields the models keyed by name (iterated with .items() later).
metrics, predictions = clf.fit(x, x_val, y, y_val)
models = clf.provide_models(x, x_val, y, y_val)

100%|██████████| 17/17 [00:27<00:00,  1.60s/it]


In [7]:
# Restore pandas' default float rendering, then display the benchmark table.
for option_name in ('display.precision', 'display.float_format'):
    pd.reset_option(option_name)

metrics

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HistGradientBoostingClassifier,0.791834,0.791713,0.791713,0.791791,1.835029
XGBClassifier,0.791834,0.791668,0.791668,0.791757,0.725761
MLPClassifier,0.791834,0.791646,0.791646,0.791737,12.663046
LGBMClassifier,0.791259,0.79111,0.79111,0.791196,0.341326
LogisticRegressionCV,0.780909,0.780612,0.780612,0.780674,2.278231
LinearSVC,0.780334,0.780076,0.780076,0.780153,0.651413
RandomForestClassifier,0.779758,0.77999,0.77999,0.779667,1.268831
BaggingClassifier,0.778033,0.778439,0.778439,0.777704,0.459449
NuSVC,0.775733,0.775599,0.775599,0.775678,4.566534
KNeighborsClassifier,0.765382,0.765315,0.765315,0.765367,0.415824


## Model blending

In [8]:
# Gather, for every fitted model that can output probabilities, the probability
# of the second class (column 1 of `predict_proba`) on the validation split and
# on the test table. Each model contributes one column named after it.
val_columns = {}
test_columns = {}

for name, model in models.items():
    # Only estimators exposing `predict_proba` can take part in the blend.
    # This explicit check replaces a bare `except: continue`, which also hid
    # genuine failures (e.g. a model breaking on the test table) by silently
    # dropping the model; such errors now surface instead.
    if not hasattr(model, 'predict_proba'):
        continue

    val_preds = model.predict_proba(x_val)[:, 1]
    test_preds = model.predict_proba(test)[:, 1]

    val_columns[name] = val_preds
    test_columns[name] = test_preds

# Build each frame once from the collected columns (default 0..n-1 row index).
val_preds_df = pd.DataFrame(val_columns)
test_preds_df = pd.DataFrame(test_columns)

In [9]:
# Display the validation-split probabilities, one column per model.
val_preds_df

Unnamed: 0,DummyClassifier,LogisticRegressionCV,LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis,DecisionTreeClassifier,RandomForestClassifier,ExtraTreesClassifier,BaggingClassifier,HistGradientBoostingClassifier,XGBClassifier,LGBMClassifier,KNeighborsClassifier,MLPClassifier
0,0.503307,0.203875,0.418310,3.455360e-02,0.0,0.190000,0.160000,0.100000,0.057604,0.018745,0.047741,0.2,0.027373
1,0.503307,0.570621,0.356394,9.893043e-01,1.0,0.610000,0.540000,0.600000,0.791611,0.708300,0.847433,0.4,0.753093
2,0.503307,0.739721,0.794327,9.990817e-01,0.0,0.400000,0.000000,0.300000,0.779383,0.784395,0.741797,0.8,0.702629
3,0.503307,0.245312,0.186797,9.718111e-01,0.0,0.190000,0.310000,0.300000,0.249054,0.188175,0.218502,0.4,0.350404
4,0.503307,0.908187,0.890509,1.000000e+00,1.0,1.000000,1.000000,1.000000,0.990518,0.989582,0.984848,1.0,0.999030
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1734,0.503307,0.772443,0.836442,9.990911e-01,0.8,0.756988,0.800000,0.646667,0.732703,0.753538,0.738494,0.8,0.767101
1735,0.503307,0.794061,0.864468,9.991380e-01,1.0,0.973294,0.968571,0.966667,0.747084,0.808336,0.772318,1.0,0.857346
1736,0.503307,0.382845,0.307229,8.546844e-01,0.0,0.490000,0.670000,0.200000,0.536209,0.406011,0.269923,0.6,0.626618
1737,0.503307,0.797966,0.834670,1.000000e+00,1.0,1.000000,1.000000,1.000000,0.977519,0.981171,0.977641,1.0,0.990735


In [10]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold

# Meta-learner for blending: an elastic-net logistic regression fitted on the
# base models' validation-split probabilities, with the regularisation settings
# chosen by 12-fold stratified cross-validation.
cv = StratifiedKFold(n_splits=12, shuffle=True, random_state=42)
glm = LogisticRegressionCV(
    cv=cv,
    penalty='elasticnet',
    solver='saga',
    # Eleven evenly spaced mixing ratios from 0 to 1 inclusive.
    l1_ratios=np.linspace(0, 1, 11),
    # `saga` shuffles the data, so seed it like every other step in this
    # notebook to make the fitted blend repeatable across runs.
    random_state=42,
)

_ = glm.fit(val_preds_df, y_val)

## Submission

In [11]:
# Hard class predictions of the blended model for the test table.
test_preds_stacking = glm.predict(test_preds_df)

# Start from the sample submission file and overwrite its target column with
# the predictions cast to booleans.
sub = (
    pd.read_csv('../data/raw/sample_submission.csv')
    .assign(**{TARGET: test_preds_stacking.astype(bool)})
)
sub

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [12]:
# Make sure the output folder exists, then write the submission without the row index.
submission_dir = '../submissions'
os.makedirs(submission_dir, exist_ok=True)

sub.to_csv('../submissions/lazypredict_blending.csv', index=False)