# Ноутбук обучения и теста модели

### Импорты

In [44]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline

### Загрузка данных

In [45]:
data_path = "./Хакатон/data/train_dataset_Самолет.csv"
df = pd.read_csv(data_path, low_memory=False)
df

Unnamed: 0,report_date,client_id,target,col1,col2,col3,col4,col5,col6,col7,...,col2654,col2655,col2656,col2657,col2658,col2659,col2660,col2661,col2662,col2663
0,2022-11-01,1,0,,,,,,,,...,,,,,,,,,,0.256261
1,2022-11-01,5,0,,,,,,,,...,7616803.0,7616803.0,7616803.0,,,,7616803.0,7616803.0,7616803.0,0.256261
2,2022-05-01,6,0,,,,,,,,...,,,,,,,,,,0.256261
3,2022-09-01,7,0,,,,,,,,...,,,,,,,,,,0.258682
4,2022-08-01,8,0,,,,,,,,...,,,,,,,,,,0.254164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14451,2022-07-01,1241,0,,,,,,,,...,,,,,,,,,,0.276902
14452,2022-09-01,1969,1,,,,,,,,...,,,,,,,,,,0.260516
14453,2022-02-01,7116,1,,,,,,,,...,,,,,,,,,,0.260005
14454,2021-08-01,7117,0,,,,,,,,...,,,,,,,,,,0.256261


### Предобработка данных

In [46]:
df_copy = df.copy()
df_copy.dtypes.unique()

array([dtype('O'), dtype('int64'), dtype('float64')], dtype=object)

In [47]:
cols_for_drop = df_copy.isna().sum()[df_copy.isna().sum() == df_copy.shape[0]].index
cols_for_drop

Index(['col773', 'col774', 'col775', 'col776', 'col777', 'col778', 'col779',
       'col780', 'col781', 'col782', 'col783', 'col784', 'col785', 'col786',
       'col787', 'col788', 'col789', 'col790', 'col791', 'col792', 'col1069',
       'col1173', 'col1655', 'col1656', 'col1657', 'col1658', 'col1659',
       'col1660', 'col1673', 'col1674', 'col1675', 'col1679', 'col1680',
       'col1681', 'col1703', 'col1704', 'col1705', 'col1727', 'col1728',
       'col1729', 'col1751', 'col1752', 'col1753', 'col1775', 'col1776',
       'col1777', 'col1799', 'col1800', 'col1801', 'col2247', 'col2248',
       'col2249', 'col2250', 'col2251', 'col2252', 'col2253', 'col2254',
       'col2255', 'col2256', 'col2257', 'col2258', 'col2259', 'col2260',
       'col2261', 'col2262', 'col2263', 'col2264', 'col2265', 'col2266',
       'col2267', 'col2268', 'col2269', 'col2270', 'col2535', 'col2536',
       'col2537', 'col2538', 'col2539', 'col2540', 'col2541', 'col2542'],
      dtype='object')

In [48]:
df_copy = df_copy.drop(cols_for_drop, axis=1)

In [49]:
df_obj = df_copy.select_dtypes(object).fillna("empty").astype('category')
le = LabelEncoder()
for col in df_obj:
    df_obj[col] = le.fit_transform(df_obj[col].astype(str))
df_obj.head()

Unnamed: 0,report_date,col49,col50,col51,col52,col53,col54,col55,col56,col57,...,col1653,col1654,col2191,col2192,col2193,col2194,col2195,col2196,col2197,col2198
0,21,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,15,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,19,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,18,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [50]:
df_float = df_copy.select_dtypes(float).fillna(0.0)
df_float.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,...,col2654,col2655,col2656,col2657,col2658,col2659,col2660,col2661,col2662,col2663
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256261
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7616803.0,7616803.0,7616803.0,0.0,0.0,0.0,7616803.0,7616803.0,7616803.0,0.256261
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256261
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258682
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254164


In [51]:
df_int = df_copy.select_dtypes(int)
df_int.head()

Unnamed: 0,client_id,target,col1453
0,1,0,0
1,5,0,1
2,6,0,0
3,7,0,0
4,8,0,0


In [52]:
df_prepared = df_int.join(df_float).join(df_obj)
df_prepared.head()

Unnamed: 0,client_id,target,col1453,col1,col2,col3,col4,col5,col6,col7,...,col1653,col1654,col2191,col2192,col2193,col2194,col2195,col2196,col2197,col2198
0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Разбиение на тестовую и тренировочную выборку

In [53]:
df_copy = df_prepared.copy()

In [54]:
X, y = df_copy.drop("target", axis=1), df_copy.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

### Обучение и выбор модели + гиперпараметры

In [56]:
class ClfSwitcher(BaseEstimator):
    def __init__(self, estimator = SGDClassifier()):
        self.estimator = estimator
    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self
    def predict(self, X, y=None):
        return self.estimator.predict(X)
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
    def score(self, X, y):
        return self.estimator.score(X, y)

In [57]:
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

parameters = [
    #{
    #    'clf__estimator': [LogisticRegression()]
    #},
    {
        'clf__estimator': [XGBClassifier(enable_categorical=True, objective='binary:logistic', tree_method= 'hist')],
        'clf__estimator__learning_rate': [0.01, 0.05],
        'clf__estimator__subsample': [0.3, 0.5],
        'clf__estimator__n_estimators': [100, 200], 
        'clf__estimator__max_depth': [5, 7], 
        'clf__estimator__gamma': [0.3, 0.5], 
        'clf__estimator__colsample_bytree': [0.3, 0.5]
    },
    #{
    #    'clf__estimator': [RandomForestClassifier()]
    #},
    #{
    #    'clf__estimator': [KNeighborsClassifier()]
    #}
]

#gscv = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1, return_train_score=False, verbose=3)
#gscv.fit(X_train, y_train)
#gscv.best_estimator_

In [58]:
#gscv.best_params_

In [59]:
#y_pred_prob = gscv.predict_proba(X_test)[:, 1]
#roc_auc_score(y_test, y_pred_prob)

### Модель для быстрого запуска

In [60]:
clf_params = {
    'enable_categorical': True, 
    'objective': 'binary:logistic', 
    'tree_method': 'hist',
    #'subsample': 0.7, 
    #'n_estimators': 100, 
    #'max_depth': 6, 
    #'learning_rate': 0.1, 
    #'gamma': 0.3, 
    #'colsample_bytree': 0.5
}
clf = XGBClassifier(**clf_params)

In [61]:
clf.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [62]:
y_pred_prob = clf.predict_proba(X_test)[:,1]

In [63]:
roc_auc_score(y_test, y_pred_prob)

0.9472618179046292

### Проверка на тестовых данных

In [64]:
test = pd.read_csv("./Хакатон/data/test.csv", on_bad_lines='skip', sep=';', low_memory=False).drop("id", axis=1)
df_copy = test.copy()
#cols_for_drop = df_copy.isna().sum()[df_copy.isna().sum() == df_copy.shape[0]].index
#df_copy = df_copy.drop(cols_for_drop, axis=1)
df_obj = df_copy.select_dtypes(object).fillna("empty").astype('category')
le = LabelEncoder()
for col in df_obj:
    df_obj[col] = le.fit_transform(df_obj[col].astype(str))
df_float = df_copy.select_dtypes(float).fillna(0.0)
df_int = df_copy.select_dtypes(int)
X_ = df_int.join(df_float).join(df_obj)

In [65]:
X_

Unnamed: 0,client_id,col1453,col1,col2,col3,col4,col5,col6,col7,col8,...,col1652,col1654,col2191,col2192,col2193,col2194,col2195,col2196,col2197,col2198
0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,3,3,4,5,0,5
2,9,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,12,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,14,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,4366,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3638,8298,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3639,835,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3640,8300,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
model_cols = clf.get_booster().feature_names
clf.predict_proba(X_[model_cols])[:, 1]

array([1.7817477e-04, 7.4406811e-03, 2.5270373e-04, ..., 8.7366527e-04,
       8.7417757e-05, 1.7785152e-03], dtype=float32)

In [67]:
submission = pd.read_csv('./Хакатон/data/submission.csv', sep=';')

In [68]:
submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
3637,3637,0
3638,3638,0
3639,3639,0
3640,3640,0


In [69]:
submission['target'] = clf.predict_proba(X_[model_cols])[:, 1]

In [70]:
submission.to_csv('sample_submission.csv', index = False, sep = ';')