# Compare several models

## Setup

In [1]:
import platform
import sys

# Record the OS and interpreter this notebook was executed with.
print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [None]:
import os
import numpy as np
import pandas as pd

from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

: 

In [None]:
# Read the train and test tables from the final data folder.
train, test = map(
    pd.read_csv,
    ('../data/final/train.csv', '../data/final/test.csv'),
)

# Preview five random training rows.
train.sample(5)

: 

In [None]:
# Label column, and every other training column treated as a feature.
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

# Split the feature names by dtype: numeric columns vs. everything else.
numerical = train[FEATURES].select_dtypes(include=np.number).columns
categorical = train[FEATURES].select_dtypes(exclude=np.number).columns

# Quick textual summary of the target, the feature groups and the data sizes.
print(f'Target: {TARGET}')
print(f'Features:\n\tnumerical: {numerical.to_list()}\n\tcategorical: {categorical.to_list()}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')

: 

## Models

In [None]:
# Hold out 20% of the labelled rows for validation, with a fixed seed.
# The feature columns are passed through unchanged; original note:
# "lazypredict should have preprocessing but it doesn't work".
split_features = train[FEATURES]
split_labels = train[TARGET].astype(int)  # cast the target to integer labels

x, x_val, y, y_val = train_test_split(
    split_features, split_labels, train_size=0.8, random_state=42
)

: 

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import RidgeClassifierCV, LogisticRegressionCV, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.svm import LinearSVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Candidate estimator classes, grouped by family. The list is only used if it
# is handed to LazyClassifier through `classifiers=` (currently disabled below).
classifiers = [
    # baseline
    DummyClassifier,
    # linear models
    RidgeClassifierCV,
    LogisticRegressionCV,
    Perceptron,
    # support vector machines
    LinearSVC,
    NuSVC,
    # discriminant analysis
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
    # trees and ensembles
    DecisionTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    HistGradientBoostingClassifier,
    # gradient boosting libraries
    XGBClassifier,
    LGBMClassifier,
    # neighbours and neural network
    KNeighborsClassifier,
    MLPClassifier,
]

# Fit many classifiers with default parameters in one go (per the original
# note, preprocessing steps are included by lazypredict).
lazy_options = dict(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=True,
    random_state=42,
    # classifiers=classifiers,  # left out, so the library default ('all') applies
)
clf = LazyClassifier(**lazy_options)

# `metrics` is the per-model score table, `predictions` the validation
# predictions, and `models` maps each model name to its fitted estimator.
metrics, predictions = clf.fit(x, x_val, y, y_val)
models = clf.provide_models(x, x_val, y, y_val)

: 

In [None]:
# Restore pandas' default number formatting before showing the score table.
for display_option in ('display.precision', 'display.float_format'):
    pd.reset_option(display_option)

metrics

: 

## Model blending

In [None]:
# Collect positive-class probabilities from every fitted model: validation
# predictions will train the blender, test predictions will feed it.
val_columns = {}
test_columns = {}
skipped_models = {}  # model name -> reason it could not be used

for name, model in models.items():
    try:
        val_preds = model.predict_proba(x_val)[:, 1]
        test_preds = model.predict_proba(test)[:, 1]
    except Exception as err:
        # Best effort: models that cannot produce probabilities (e.g. no
        # `predict_proba`) are left out of the blend, but the reason is kept
        # instead of being silently discarded. Catching `Exception` rather
        # than using a bare `except` lets KeyboardInterrupt stop the loop.
        skipped_models[name] = f'{type(err).__name__}: {err}'
        continue
    val_columns[name] = val_preds
    test_columns[name] = test_preds

# Build each frame once instead of inserting columns one at a time.
val_preds_df = pd.DataFrame(val_columns)
test_preds_df = pd.DataFrame(test_columns)

if skipped_models:
    print(f'Skipped {len(skipped_models)} model(s) without usable probabilities:')
    for name, reason in skipped_models.items():
        print(f'\t{name}: {reason}')

: 

In [None]:
# Inspect the validation-set probabilities, one column per blended model.
val_preds_df

: 

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold

# Blend the base models with an elastic-net logistic regression fitted on
# their validation-set probabilities; C and l1_ratio are chosen by 4-fold CV.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
glm = LogisticRegressionCV(
    cv=cv,
    penalty='elasticnet',
    solver='saga',  # solver required for the elastic-net penalty
    # 0.0, 0.1, ..., 1.0 — linspace avoids the float-step pitfalls of arange
    l1_ratios=np.linspace(0, 1, 11),
    # saga shuffles the data, so seed it to make the blend reproducible
    random_state=42,
)

_ = glm.fit(val_preds_df, y_val)

: 

## Submission

In [None]:
# Hard class predictions from the blender for the test set.
test_preds_stacking = glm.predict(test_preds_df)

# Fill the sample submission's target column with the boolean predictions.
# NOTE(review): assumes the sample submission rows are in the same order as
# the test set — confirm.
sub = pd.read_csv('../data/raw/sample_submission.csv').assign(
    **{TARGET: test_preds_stacking.astype(bool)}
)
sub

: 

In [None]:
# Write the blended predictions to disk, creating the output folder if needed.
submission_path = '../submissions/lazypredict_blending.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sub.to_csv(submission_path, index=False)

: 