1. обучить несколько разных моделей на наборе данных ССЗ (train_case2.csv): логрег, бустинг, лес и т.д - на ваш выбор 2-3 варианта
2. при обучении моделей обязательно использовать кроссвалидацию
3. вывести сравнение полученных моделей по основным метрикам классификации: pr/rec/auc/f_score (можно в виде таблицы, где строки - модели, а столбцы - метрики)
4. сделать выводы о том, какая модель справилась с задачей лучше других

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [2]:
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

In [3]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [4]:
pd.set_option('display.float_format', '{:.4f}'.format)
warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('ggplot')
plt.rcParams["font.family"] = "Times New Roman"

In [5]:
df = pd.read_csv('train_case2.csv', ';')
df.head(3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [6]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df.drop('cardio', 1),
                                                    df['cardio'], random_state=13, stratify=df['cardio'])

In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]


class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]


class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(
            X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

In [8]:
continuos_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
cat_cols = ['gender', 'cholesterol']
base_cols = ['gluc', 'smoke', 'alco', 'active']

continuos_transformers = []
cat_transformers = []
base_transformers = []

for cont_col in continuos_cols:
    transfomer = Pipeline([
        ('selector', NumberSelector(key=cont_col)),
        ('standard', StandardScaler())
    ])
    continuos_transformers.append((cont_col, transfomer))

for cat_col in cat_cols:
    cat_transformer = Pipeline([
        ('selector', ColumnSelector(key=cat_col)),
        ('ohe', OHEEncoder(key=cat_col))
    ])
    cat_transformers.append((cat_col, cat_transformer))

for base_col in base_cols:
    base_transformer = Pipeline([
        ('selector', NumberSelector(key=base_col))
    ])
    base_transformers.append((base_col, base_transformer))

In [9]:
feats = FeatureUnion(continuos_transformers+cat_transformers+base_transformers)
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

array([[-2.01572213, -1.01054901, -1.39812656, ...,  0.        ,
         0.        ,  0.        ],
       [-1.70022838, -0.16209898, -0.29084942, ...,  0.        ,
         0.        ,  1.        ],
       [-1.06275258, -2.10141335, -1.88256031, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.36507967,  0.44393676,  0.05517468, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.61975533, -0.52572042,  0.7472229 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.72842466,  0.08031532, -0.70607835, ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
model_catb = CatBoostClassifier(random_state=13)
model_xgb = XGBClassifier(random_state=13)
model_lgbm = LGBMClassifier(random_state=13)
model_rf = RandomForestClassifier(random_state=13)

In [11]:
cv_result = {'metrics': ['test_roc_auc',
                         'test_f1',
                         'test_precision',
                         'test_recall'
                        ]}

for model in [model_catb, model_xgb, model_lgbm, model_rf]:
    model_scores = []
    cv_scores = cross_validate(model, X_train, y_train, cv=5, scoring=[
                               'roc_auc', 'f1', 'precision', 'recall'], n_jobs=-1, verbose=1)
    for metric in cv_result['metrics']:
        model_scores.append(np.mean(cv_scores[metric]))
    cv_result[model.__class__.__name__] = model_scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.4s finished


In [12]:
df_model_scores = pd.DataFrame(cv_result)
df_model_scores

Unnamed: 0,metrics,CatBoostClassifier,XGBClassifier,LGBMClassifier,RandomForestClassifier
0,test_roc_auc,0.8004,0.7939,0.8008,0.7849
1,test_f1,0.7249,0.7192,0.7258,0.7166
2,test_precision,0.7549,0.7491,0.7578,0.7364
3,test_recall,0.6972,0.6915,0.6965,0.6979


In [13]:
# модели примерно одинаково справились с предсказанием (с учетом того, что гиперпараметры еще не подбирались).

5. (опциональный вопрос) какой график (precision_recall_curve или roc_auc_curve) больше подходит в случае сильного дисбаланса классов? (когда объектов одного из классов намного больше чем другого, например, 1 к 1000).
***************

Ответ: В случае сильного дисбаланса класса метрика FPR плохо реагирует на рост FP предсказаний. В результате площадь под кривой ROC получается выской даже у плохих моделей (с очень низким значения precision). Таким образом precision_recall_curve больше подходит для случая дисбаланса классов