<a href="https://colab.research.google.com/github/Yanina-Kutovaya/projects/blob/main/Car_insurance_renewal/notebooks/02_Car_insurance_renewal_AutoML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Прогноз пролонгации полиса автострахования - AutoML.

## [EDA](https://github.com/Yanina-Kutovaya/projects/blob/main/Car_insurance_renewal/notebooks/01_Car_insurance_renewal_EDA.ipynb)

In [1]:
!pip install pycaret -q
!pip3 install jinja2==3.0.0 -q

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# input workbook under MyDrive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import category_encoders as ce
from sklearn.pipeline import Pipeline

from pycaret.classification import ClassificationExperiment

In [4]:
# Folder on the mounted Google Drive that holds the input workbook
# (the trailing slash matters: file names are appended by string concatenation).
PATH = '/content/drive/MyDrive/ML_projects/vsk/data/'
# Random seed constant (judging by the name).
# NOTE(review): no visible cell references it — the PyCaret setup passes session_id=123 instead.
RANDOM_SEED = 25

## 1. Чтение данных

In [5]:
# Load both worksheets of the source workbook. The file is opened once through
# pd.ExcelFile and the handle is shared by both reads, instead of parsing the
# workbook from Drive twice.
data_file = 'Задание (пролонгация).xlsx'
with pd.ExcelFile(PATH + data_file) as workbook:
    # Sheet 0 is read without a header row
    # (assumed to hold the field descriptions, judging by the variable name — TODO confirm).
    field_description = pd.read_excel(workbook, sheet_name=0, header=None)
    # Sheet 1 is the modelling dataset; its first row supplies the column names.
    data = pd.read_excel(workbook, sheet_name=1)

In [6]:
# Name of the label column. Rows where this column equals -1 are split off as
# the test set; all other rows form the training set.
TARGET_NAME = 'target'

## 2. Разделение на обучающую и тестовую выборки

In [7]:
# Training set: every row whose target differs from the -1 placeholder.
train = data.loc[data[TARGET_NAME].ne(-1)]
train.shape

(7598, 30)

In [8]:
# Test set: rows whose target equals the -1 placeholder
# (presumably the rows without a known label — confirm against the task description).
test = data.loc[data[TARGET_NAME].eq(-1)]
test.shape

(3265, 30)

## 3. Подготовка данных

In [9]:
# Names of all object-dtype columns; these are treated as categorical features.
categorical_variables = [
    column for column, dtype in data.dtypes.items() if dtype == 'object'
]
# Number of distinct values per categorical column in the training set.
train[categorical_variables].describe().loc['unique']

Filial              22
Type_ts              2
passport_region     85
VIP                  2
Brand               23
Model              190
GAP                  2
married              2
deduct               3
deduct_amount        3
product              4
Bank                 3
channel - map        2
category             4
kvs_type             2
sex                  3
Name: unique, dtype: object

In [10]:
# Model inputs: every column except the first one and the target.
# NOTE(review): the first column is skipped — presumably a row identifier; confirm.
features = [column for column in train.columns[1:] if column != TARGET_NAME]

# Categoricals with many levels (22 to 190 distinct values in the training set).
cols_1 = ['Filial', 'passport_region', 'Brand', 'Model']
# All remaining categoricals (2 to 4 distinct values each in the training set).
cols_2 = [column for column in categorical_variables if column not in cols_1]

# Stand-alone encoder instances.
# NOTE(review): none of these three objects is used by the pipeline below,
# which builds its own encoder instances; they are unused in the visible cells.
count_encoder = ce.CountEncoder(cols=cols_1)
glmm_encoder = ce.GLMMEncoder(cols=cols_1)
one_hot_encoder = ce.OneHotEncoder(cols=cols_2)

# Two-stage categorical encoding: GLMM encoding for cols_1 first,
# then one-hot encoding for cols_2.
categorical_transformer = Pipeline([
    ("glmm_encoder", ce.GLMMEncoder(cols=cols_1)),
    ("one_hot_encoder", ce.OneHotEncoder(cols=cols_2)),
])

In [11]:
# Numeric inputs are the features that are not in the categorical list.
numeric_variables = [
    column for column in features if column not in categorical_variables
]

y_train = train[TARGET_NAME]

# Fit the encoders on the training rows (the target is supplied to the fit),
# then append the numeric columns together with the target column.
# NOTE(review): the encoders are fitted on all training rows before PyCaret
# makes its own train/test split — possible target leakage; confirm.
df_cat = categorical_transformer.fit_transform(
    train[categorical_variables], y_train
)
X_train = pd.concat(
    [df_cat, train[[*numeric_variables, TARGET_NAME]]], axis=1
)

# Encode the test rows with the already-fitted transformer; no target is appended.
df_cat_ = categorical_transformer.transform(test[categorical_variables])
X_test = pd.concat([df_cat_, test[numeric_variables]], axis=1)

In [12]:
# Sanity check of the encoded frames: X_train also carries the target column,
# so it is expected to have exactly one more column than X_test.
X_train.shape, X_test.shape

((7598, 50), (3265, 49))

## 4. Модели

In [13]:
# Create a PyCaret classification experiment and initialise it on the encoded
# training frame, which contains the target column.
# NOTE(review): session_id is hard-coded to 123 while the RANDOM_SEED constant
# from the config cell is not used here — consider unifying them.
s = ClassificationExperiment()
s.setup(data=X_train, target=TARGET_NAME, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(7598, 50)"
4,Transformed data shape,"(7598, 50)"
5,Transformed train set shape,"(5318, 50)"
6,Transformed test set shape,"(2280, 50)"
7,Numeric features,49
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7d387b3e2aa0>

In [14]:
# Run PyCaret's model comparison on the experiment; the resulting table below
# is ordered by Accuracy.
# `best` receives whatever compare_models() returns — presumably the top-ranked
# model of that table; confirm against the PyCaret documentation.
best = s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7414,0.7754,0.4229,0.6277,0.5045,0.3388,0.3512,0.096
gbc,Gradient Boosting Classifier,0.7401,0.7766,0.3813,0.644,0.4774,0.3195,0.3398,0.182
ada,Ada Boost Classifier,0.7379,0.7633,0.4139,0.6207,0.4944,0.3278,0.3409,0.134
ridge,Ridge Classifier,0.7366,0.0,0.3717,0.6346,0.4666,0.308,0.3283,0.065
lda,Linear Discriminant Analysis,0.7366,0.7582,0.397,0.6234,0.483,0.3186,0.334,0.146
lightgbm,Light Gradient Boosting Machine,0.7277,0.7635,0.4084,0.5926,0.4827,0.3067,0.3168,0.181
rf,Random Forest Classifier,0.7245,0.7535,0.391,0.589,0.4687,0.2934,0.3051,0.214
xgboost,Extreme Gradient Boosting,0.724,0.7496,0.4512,0.5735,0.5046,0.3172,0.3218,0.081
et,Extra Trees Classifier,0.7211,0.7387,0.4217,0.5713,0.4844,0.2997,0.3064,0.306
nb,Naive Bayes,0.6935,0.612,0.0699,0.5896,0.1234,0.0609,0.1139,0.076


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

* Лучший AUC дает Gradient Boosting Classifier
* При необходимости максимизировать Recall, можно попробовать Quadratic Discriminant Analysis