# Compare several models

## Setup

In [9]:
# Record the OS and interpreter this notebook was executed with.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [10]:
import numpy as np
import pandas as pd

from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

In [11]:
# Read the final train and test tables from the project's data directory.
train, test = (
    pd.read_csv('../data/final/train.csv'),
    pd.read_csv('../data/final/test.csv'),
)

# Peek at five random rows (no seed is passed, so the rows differ between runs).
train.sample(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_count,Cabin_deck,Cabin_side
3013,Earth,False,TRAPPIST-1e,26.0,False,0.0,56.0,322.0,6.0,243.0,True,1,G,P
3844,Europa,False,TRAPPIST-1e,45.0,False,0.0,2524.0,0.0,166.0,1.0,True,2,B,P
7925,Earth,False,TRAPPIST-1e,21.0,False,32.0,640.0,0.0,0.0,0.0,True,1,F,S
4948,Earth,False,TRAPPIST-1e,29.0,False,0.0,10.0,0.0,594.0,120.0,False,1,F,S
5934,Mars,True,TRAPPIST-1e,42.0,False,0.0,0.0,0.0,0.0,0.0,True,1,F,P


In [12]:
# Name of the label column; every other column of `train` is a model input.
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

# Split the inputs by dtype so each group can get its own preprocessing later.
numerical = train[FEATURES].select_dtypes(include=np.number).columns
categorical = train[FEATURES].select_dtypes(exclude=np.number).columns

# Normalise dtypes: numeric inputs become floats, everything else becomes strings
# (this gives the boolean-looking columns a single uniform type).
train[numerical] = train[numerical].astype(float)
# NOTE(review): astype(str) would turn any missing value into the literal
# string 'nan', which a later imputer would no longer see as missing --
# confirm the final data has no missing categorical values.
train[categorical] = train[categorical].astype(str)

# Summary of the setup (fixes the "Fetaures" typo and the missing space after "categorical:").
print(f'Target: {TARGET}')
print(f'Features:\n\tnumerical: {numerical.to_list()}\n\tcategorical: {categorical.to_list()}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')

Target: Transported
Fetaures:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group_count']
	categorical:['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_deck', 'Cabin_side']
Shapes:
	train: (8693, 14)
	test: (4277, 13)


## Models

In [13]:
# Hold out part of the labelled data for validation: 80% train / 20% validation.
# The raw feature columns are split here and preprocessed separately, because
# (per the original author's note) lazypredict's own preprocessing does not work.
features_frame = train[FEATURES]
labels = train[TARGET].astype(int)

x, x_val, y, y_val = train_test_split(
    features_frame,
    labels,
    train_size=0.8,
    random_state=42,
)

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_preproc = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one, so the cell runs on both old and new installations.
try:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical columns: fill gaps with the constant "missing", then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_preproc = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", one_hot),
    ]
)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('numerical', numerical_preproc, numerical.to_list()),
    ('categorical', categorical_preproc, categorical.to_list())])

preprocessor

In [15]:
# Fit the preprocessing on the training split only, then apply it to the
# validation split. The rows are re-selected from `train` through the index of
# the split labels instead of reusing `x` / `x_val`, so re-running this cell
# never feeds already-transformed arrays back into the transformer.
# Assumes `train` has a unique index (the default RangeIndex from read_csv).
x = preprocessor.fit_transform(train.loc[y.index, FEATURES])
x_val = preprocessor.transform(train.loc[y_val.index, FEATURES])

In [16]:
# Fit every classifier lazypredict offers, with default parameters, on the
# already-preprocessed training array, and evaluate each on the validation split.
clf = LazyClassifier(
    classifiers='all',
    custom_metric=None,
    predictions=True,
    random_state=42,
    verbose=0,
    ignore_warnings=True,
)

# `models` is the per-model score table; `predictions` holds each model's
# validation-set predictions (returned because predictions=True).
models, predictions = clf.fit(x, x_val, y, y_val)

100%|██████████| 29/29 [00:35<00:00,  1.24s/it]


In [21]:
# Restore pandas' default float formatting, then display the score table.
# NOTE(review): no visible cell sets 'display.float_format', and the execution
# count jumps from 16 to 21 -- this presumably undoes a setting made in a cell
# that was since removed; confirm the notebook survives Restart & Run All.
pd.reset_option('display.float_format')
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.7918,0.7917,0.7917,0.7918,0.7912
SVC,0.7901,0.7901,0.7901,0.7901,4.1312
NuSVC,0.7872,0.7873,0.7873,0.7872,5.2095
LGBMClassifier,0.7867,0.7865,0.7865,0.7866,0.4118
AdaBoostClassifier,0.7849,0.7845,0.7845,0.7845,0.8522
LogisticRegression,0.7815,0.7812,0.7812,0.7813,0.0766
CalibratedClassifierCV,0.7803,0.7801,0.7801,0.7802,5.7559
LinearSVC,0.7803,0.7801,0.7801,0.7802,1.3689
RandomForestClassifier,0.7798,0.78,0.78,0.7797,1.2992
BaggingClassifier,0.778,0.7784,0.7784,0.7777,0.707


In [22]:
# Summarise the predictions frame returned by clf.fit: column names, non-null counts and dtypes.
predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1739 entries, 0 to 1738
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   AdaBoostClassifier             1739 non-null   int32
 1   BaggingClassifier              1739 non-null   int32
 2   BernoulliNB                    1739 non-null   int32
 3   CalibratedClassifierCV         1739 non-null   int32
 4   DecisionTreeClassifier         1739 non-null   int32
 5   DummyClassifier                1739 non-null   int32
 6   ExtraTreeClassifier            1739 non-null   int32
 7   ExtraTreesClassifier           1739 non-null   int32
 8   GaussianNB                     1739 non-null   int32
 9   KNeighborsClassifier           1739 non-null   int32
 10  LabelPropagation               1739 non-null   int32
 11  LabelSpreading                 1739 non-null   int32
 12  LinearDiscriminantAnalysis     1739 non-null   int32
 13  LinearSVC         