<a href="https://colab.research.google.com/github/YaninaK/churn-prediction/blob/main/notebooks/04_Inference_results_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Прогнозирование оттока клиентов
## Анализ результатов инференса нейросетевой модели.

## [EDA и отбор признаков](https://github.com/YaninaK/churn-prediction/blob/main/notebooks/01_EDA_and_Feature_selection.ipynb)

## [Базовая нейросетевая модель](https://github.com/YaninaK/churn-prediction/blob/main/notebooks/02_Baseline_model_NN.ipynb)

## [Конвейер для инференса нейросетевой модели](https://github.com/YaninaK/churn-prediction/blob/main/notebooks/03_Inference_pipeline_nn.ipynb)

[Ссылка на данные](https://drive.google.com/file/d/1TAVECAfnel9lPfcpfel6qXhZSW2yNqdX/view?usp=sharing)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!git clone https://github.com/YaninaK/churn-prediction.git -q

In [3]:
%cd churn-prediction

/content/churn-prediction


In [4]:
import sys
import os

sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), "src", "churn_prediction"))

In [5]:
import logging
import pandas as pd
import numpy as np
import easydict

from data.make_dataset import load_data
from data.validation import train_test_data_split

from models.inference_tools import preprocessing_pipeline
from models.serialize import load

from models.utilities import (
    get_initial_bias_and_class_weight,
    plot_loss,
    plot_metrics,
    plot_cm,
    plot_roc,
    plot_prc,
    plot_lift,
)

import matplotlib.pyplot as plt

In [6]:
PATH = '/content/drive/MyDrive/ML_projects/06_Churn_prediction/'

In [7]:
ID = 'customerid'
TARGET_NAME = 'churn'

## 1. Загрузка модели

In [8]:
lstm_model = load('LSTM_emb_model_v1', PATH + 'models/')

## 2. Загрузка и подготовка данных

In [9]:
data = load_data(PATH)
_, data_test = train_test_data_split(data)
test_features, test_labels = preprocessing_pipeline(data_test, PATH)

## 3. Инференс

In [10]:
predictions = lstm_model.predict(test_features)



In [11]:
results = data_test.groupby(ID, as_index=False)[TARGET_NAME ].max()
results['predictions'] = predictions
results = results.sort_values(by='predictions', ascending=False).set_index(ID)
results['y_pred'] = (
    (results['predictions'] >= 0.5)
    .where(results['predictions'] < 0.5, True)
    .astype(int)
)
results

Unnamed: 0_level_0,churn,predictions,y_pred
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3466,0,9.964615e-01,1
4052,1,9.913965e-01,1
3494,1,9.887042e-01,1
3601,0,9.867486e-01,1
7149,1,9.844918e-01,1
...,...,...,...
4678,0,3.952400e-11,0
4140,0,1.064870e-11,0
2407,0,5.623193e-12,0
7387,0,2.812500e-13,0


## 4. False Positive

In [12]:
fp = results[(results['y_pred'] == 1) & (results[TARGET_NAME ] == 0)].index.tolist()
df_fp = (
    data_test[data_test[ID].isin(fp)].groupby(ID).first()
    .join(results[['predictions']], how='left')
    .sort_values(by='predictions', ascending=False)
)
df_fp.head(2)

Unnamed: 0_level_0,age,annualincome,calldroprate,callfailurerate,callingnum,customersuspended,education,gender,homeowner,maritalstatus,...,unpaidbalance,usesinternetservice,usesvoiceservice,percentagecalloutsidenetwork,totalcallduration,avgcallduration,churn,year,month,predictions
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3466,48,64938,0.0,0.01,4251038772,Yes,Bachelor or equivalent,Female,Yes,Single,...,228,No,No,0.92,1047,261,0,2015,1,0.996462
3601,75,55291,0.01,0.03,4251036245,Yes,Bachelor or equivalent,Female,No,Married,...,231,No,No,0.9,2646,661,0,2015,1,0.986749


In [13]:
df_fp['predictions_bins'] = pd.cut(df_fp['predictions'], 10)

df_fp.groupby('predictions_bins').agg(
      id_count = ("callingnum", "count"),
      age_mean=("age", "mean"),
      numberofcomplaints_mean=("numberofcomplaints", "mean"),
      callfailurerate_mean=("callfailurerate", "mean"),

      female_fraction=("gender", lambda x: x.value_counts(normalize=True)["Female"]),
      education_hs=("education", lambda x: x.value_counts(normalize=True)["High School or below"]),
      single_frac=("maritalstatus", lambda x: x.value_counts(normalize=True)["Single"]),
      homeowner_frac=("homeowner", lambda x: x.value_counts(normalize=True)["Yes"]),
      customersuspended_frac=("customersuspended", lambda x: x.value_counts(normalize=True)["Yes"]),
      not_usesinternetservice_frac=("usesinternetservice", lambda x: x.value_counts(normalize=True)["No"]),
      not_usesvoiceservice_frac=("usesvoiceservice", lambda x: x.value_counts(normalize=True)["No"]),

      unpaidbalance_mean=("unpaidbalance", "mean"),

).sort_index(ascending=False)

Unnamed: 0_level_0,id_count,age_mean,numberofcomplaints_mean,callfailurerate_mean,female_fraction,education_hs,single_frac,homeowner_frac,customersuspended_frac,not_usesinternetservice_frac,not_usesvoiceservice_frac,unpaidbalance_mean
predictions_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"(0.947, 0.996]",11,54.272727,2.636364,0.020909,0.818182,0.363636,0.818182,0.454545,1.0,0.818182,0.909091,153.909091
"(0.897, 0.947]",12,43.833333,1.583333,0.013333,0.833333,0.25,0.5,0.583333,1.0,0.916667,0.833333,103.583333
"(0.848, 0.897]",28,42.5,1.964286,0.016429,0.357143,0.535714,0.464286,0.642857,0.964286,0.964286,0.892857,122.178571
"(0.798, 0.848]",44,36.477273,1.704545,0.016591,0.522727,0.568182,0.409091,0.681818,0.954545,1.0,0.909091,143.772727
"(0.748, 0.798]",56,39.267857,1.732143,0.014643,0.607143,0.5,0.5,0.803571,1.0,0.964286,0.839286,142.821429
"(0.699, 0.748]",84,36.142857,1.833333,0.015238,0.52381,0.47619,0.511905,0.833333,0.97619,0.964286,0.916667,120.583333
"(0.649, 0.699]",75,33.72,1.426667,0.014933,0.573333,0.586667,0.573333,0.866667,0.973333,0.96,0.946667,128.12
"(0.6, 0.649]",84,38.25,1.857143,0.015952,0.511905,0.535714,0.571429,0.821429,0.988095,0.952381,0.940476,144.035714
"(0.55, 0.6]",85,36.294118,1.8,0.015176,0.517647,0.517647,0.552941,0.694118,1.0,0.988235,0.964706,150.694118
"(0.5, 0.55]",84,38.595238,1.761905,0.01631,0.416667,0.559524,0.47619,0.845238,0.97619,0.97619,0.892857,134.904762


* Чаще всего модель ошибочно классифицирует как отток женщин старше 43 лет, с numberofcomplaints больше 2, callfailurerate 0.02 и выше.

## 5. False Negative

In [14]:
fn = results[(results['y_pred'] == 0) & (results[TARGET_NAME ] == 1)].index.tolist()

df_fn = (
    data_test[data_test[ID].isin(fn)].groupby(ID).first()
    .join(results[['predictions']], how='left')
    .sort_values(by='predictions')
)
df_fp.head(2)

Unnamed: 0_level_0,age,annualincome,calldroprate,callfailurerate,callingnum,customersuspended,education,gender,homeowner,maritalstatus,...,usesinternetservice,usesvoiceservice,percentagecalloutsidenetwork,totalcallduration,avgcallduration,churn,year,month,predictions,predictions_bins
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3466,48,64938,0.0,0.01,4251038772,Yes,Bachelor or equivalent,Female,Yes,Single,...,No,No,0.92,1047,261,0,2015,1,0.996462,"(0.947, 0.996]"
3601,75,55291,0.01,0.03,4251036245,Yes,Bachelor or equivalent,Female,No,Married,...,No,No,0.9,2646,661,0,2015,1,0.986749,"(0.947, 0.996]"


In [15]:
df_fn['predictions_bins'] = pd.cut(df_fn['predictions'], 4)

df_fn.groupby('predictions_bins').agg(
      id_count = ("callingnum", "count"),
      age_mean=("age", "mean"),
      numberofcomplaints_mean=("numberofcomplaints", "mean"),
      callfailurerate_mean=("callfailurerate", "mean"),

      female_fraction=("gender", lambda x: x.value_counts(normalize=True)["Female"]),
      education_hs=("education", lambda x: x.value_counts(normalize=True)["High School or below"]),
      single_frac=("maritalstatus", lambda x: x.value_counts(normalize=True)["Single"]),
      homeowner_frac=("homeowner", lambda x: x.value_counts(normalize=True)["Yes"]),
      customersuspended_frac=("customersuspended", lambda x: x.value_counts(normalize=True)["Yes"]),
      not_usesinternetservice_frac=("usesinternetservice", lambda x: x.value_counts(normalize=True)["No"]),
      not_usesvoiceservice_frac=("usesvoiceservice", lambda x: x.value_counts(normalize=True)["No"]),

      unpaidbalance_mean=("unpaidbalance", "mean"),

).sort_index()

Unnamed: 0_level_0,id_count,age_mean,numberofcomplaints_mean,callfailurerate_mean,female_fraction,education_hs,single_frac,homeowner_frac,customersuspended_frac,not_usesinternetservice_frac,not_usesvoiceservice_frac,unpaidbalance_mean
predictions_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"(0.222, 0.291]",7,36.857143,2.142857,0.02,0.714286,0.571429,0.714286,1.0,1.0,1.0,0.857143,135.571429
"(0.291, 0.361]",12,40.5,1.166667,0.016667,0.583333,0.666667,0.333333,0.833333,1.0,1.0,1.0,145.416667
"(0.361, 0.43]",7,28.285714,2.142857,0.01,0.285714,0.714286,0.285714,0.714286,1.0,1.0,0.857143,93.285714
"(0.43, 0.499]",13,41.230769,1.615385,0.018462,0.615385,0.384615,0.461538,0.692308,1.0,0.923077,1.0,162.769231


* Модель затрудняется правильно определить отток у клентов моложе 40 лет, даже если numberofcomplaints больше 2 или callfailurerate_mean равна 0.02.