In [2]:
# Подключение необходимых библиотек
import pandas as pd
import numpy as np
import datetime as dtime
import matplotlib.pyplot as plt
import scipy.stats as stats
from ydata_profiling import ProfileReport
%matplotlib inline

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler

### Получение исходных данных

In [None]:
# Load the gym-churn data set; the path is resolved relative to the working directory.
df = pd.read_csv('../datasets/gym_churn.csv')

### Исследовательский анализ данных (EDA)

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
ProfileReport(df, minimal=True) # Build the profiling report used to explore the data

In [None]:
# Column-wise means over all clients.
df.mean()

In [None]:
# Column-wise means computed separately for each value of the Churn flag.
df.groupby(by='Churn').mean()

In [13]:
# Split the clients by outcome: Churn == 1 goes to `left`, Churn == 0 to `stay`.
left = df[df['Churn'] == 1]
stay = df[df['Churn'] == 0]

In [14]:
def col(i):
    """Show a grouped bar chart of client counts per churn outcome for one feature.

    The feature is selected by its position ``i`` in the notebook-level ``df``.
    One bar per feature level is drawn at each ``Churn`` value, so the chart
    compares how the levels are represented among retained and churned clients.
    Any number of levels is supported; with two levels the bars sit at the
    usual -0.1 / +0.1 offsets around each tick.

    Parameters
    ----------
    i : int
        Positional index of the feature column in ``df``.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    feature = df.columns[i]
    # Count non-null 'Age' values per (Churn, level) cell; fill_value=0 turns
    # combinations with no clients into zero-height bars instead of NaN.
    pvt = pd.pivot_table(data=df,
                         index="Churn",
                         columns=df.iloc[:, i],
                         values='Age',
                         aggfunc='count',
                         fill_value=0)
    levels = list(pvt.columns)
    # Each group of bars spans 0.4 on the x axis, split evenly between levels.
    width = 0.4 / max(len(levels), 1)
    for k, level in enumerate(levels):
        offset = (k - (len(levels) - 1) / 2) * width
        bars = plt.bar(x=pvt.index.to_numpy() + offset,
                       height=pvt[level],
                       width=width,
                       label=f'{feature}_{level}')
        plt.bar_label(bars)
    plt.xticks(ticks=[0,1], labels=['остались', 'отток'])
    plt.legend()
    plt.title(feature)
    plt.show()

In [15]:
# Positional indices of the df columns that are plotted as grouped bars with col()
# (presumably the two-level categorical features — TODO confirm against df.columns).
category_cols = pd.Series([0,1,2,3,4,6])

In [None]:
# One grouped bar chart per column position listed in category_cols.
for i in category_cols:
    col(i)

### Клиенты, работающие в компаниях-партнерах, более лояльны.

In [None]:
# Positional indices of the features compared with 20-bin histograms
# (presumably the continuous ones — TODO confirm against df.columns).
category_cols_2 = pd.Series([7,8,11,12])

for i in category_cols_2:
    # Take the bin edges from ALL clients. Edges derived from the churned group
    # alone silently drop every retained client whose value lies outside the
    # churned range, which distorts the 'Stayed' histogram.
    bins = np.histogram_bin_edges(df.iloc[:, i], bins=20)
    plt.hist(left.iloc[:,i], bins=bins, alpha=0.5, label='Left')
    plt.hist(stay.iloc[:,i], bins=bins, alpha=0.5, label='Stayed')
    plt.legend()
    plt.title(left.columns[i])
    plt.show()

In [None]:
# Positional indices of the features compared with default (10-bin) histograms
# (presumably discrete month/period counts — TODO confirm against df.columns).
category_cols_3 = pd.Series([5,9,10])

for i in category_cols_3:
    # Shared edges over all clients keep the two overlaid histograms comparable.
    bins = np.histogram_bin_edges(df.iloc[:, i], bins=10)
    # The 'Left' series must come from the churned clients; it was previously
    # drawn from `stay`, so both series showed the same data.
    plt.hist(left.iloc[:,i], bins=bins, alpha=0.5, label='Left')
    plt.hist(stay.iloc[:,i], bins=bins, alpha=0.5, label='Stayed')
    plt.legend()
    plt.title(left.columns[i])
    plt.show()

In [None]:
# Pairwise correlations of all columns, rendered as an annotated heat map.
correl = df.corr()
fig, ax = plt.subplots(figsize=(15,5))
sns.heatmap(data=correl, annot=True, fmt=".2f", ax=ax)

### Формирование прогнозной модели на основе логистической регрессии

In [None]:
# Features are every column except the target; the target is the Churn flag.
X = df.drop(columns='Churn')
y = df['Churn']

# Hold out 20 % of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

predictions = model.predict(X_test)
# Second probability column (presumably P(Churn == 1) — confirm via model.classes_).
probabilities = model.predict_proba(X_test)[:,1]

# Accuracy, precision and recall on the held-out rows, in that order.
acc, precision, recall = (
    metric(y_test, predictions)
    for metric in (accuracy_score, precision_score, recall_score)
)
print(acc, precision, recall)

### Формирование прогнозной модели на основе случайного леса

In [None]:
# random_state pins the bootstrap samples and the per-split feature sampling,
# so the metrics printed below are the same on every run (the train/test
# split this model is fitted on is seeded as well).
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Accuracy, precision and recall on the held-out rows, in that order.
acc_rf = accuracy_score(y_test, y_pred)
precision_rf = precision_score(y_test, y_pred)
recall_rf = recall_score(y_test, y_pred)
print(acc_rf, precision_rf, recall_rf)

### Категоризация клиентов

In [None]:
# Standardise every feature (zero mean, unit variance) before measuring distances.
sc = StandardScaler()
X_sc = sc.fit_transform(X)

# Ward linkage over the standardised rows, drawn as a top-down dendrogram.
plt.figure(figsize=(15,10))
linked = linkage(X_sc, method='ward')
dendrogram(linked)
plt.show()

### Кластеризация

In [25]:
# Number of clusters.
# NOTE(review): presumably read off the dendrogram — confirm the choice.
cl = 4
# random_state keeps the cluster numbering stable between runs, which the
# per-cluster tables and plots rely on. n_init is spelled out because its
# default differs between scikit-learn releases.
km = KMeans(n_clusters=cl, n_init=10, random_state=0)
labels = km.fit_predict(X_sc)

In [26]:
# Store each row's cluster label as a new column (this mutates df in place).
df['cluster'] = labels

In [None]:
# Client counts per cluster and churn outcome ('Age' is only the column being counted).
pvtt = pd.pivot_table(df, index='cluster', columns='Churn',
                      values='Age', aggfunc='count').reset_index()
pvtt.columns = ['cluster', 'Churn_0', 'Churn_1']

# perc: share of churned clients in the cluster; number: cluster size.
pvtt = pvtt.assign(
    perc=lambda t: t['Churn_1'] / (t['Churn_0'] + t['Churn_1']),
    number=lambda t: t['Churn_1'] + t['Churn_0'],
)

print(pvtt.head())

Доля оттока существенно изменяется по кластерам

In [28]:
# Re-split by outcome so that both subsets carry the 'cluster' column added to df.
left = df[df['Churn'] == 1]
stay = df[df['Churn'] == 0]

In [None]:
# Cluster ids 0 .. cl-1, iterated over by the per-cluster plots.
category_cat = pd.Series(np.arange(cl))
print(category_cat, category_cols_2, sep='\n')

In [None]:
# Per-cluster overlay of churned vs. retained clients for each feature in category_cols_2.
for i in category_cols_2:
    for j in category_cat:
        # Take the bin edges from every client of the cluster. Edges derived
        # from its churned clients alone drop retained clients outside that range.
        in_cluster = df[df['cluster'] == j]
        bins = np.histogram_bin_edges(in_cluster.iloc[:, i], bins=10)

        plt.hist(left[left['cluster'] == j].iloc[:,i],
                 bins=bins,
                 alpha=0.5,
                 label='Left')

        plt.hist(stay[stay['cluster'] == j].iloc[:,i],
                 bins=bins,
                 alpha=0.5,
                 label='Stayed')

        plt.legend()
        # Title is "<feature>_<cluster id>".
        plt.title(left.columns[i]+'_%i' %j)
        plt.show()

In [None]:
# Per-cluster overlay of churned vs. retained clients for each feature in category_cols_3.
for i in category_cols_3:
    for j in category_cat:
        # Shared edges over the whole cluster: with independent default binning
        # the two series get different bar widths and cannot be compared.
        in_cluster = df[df['cluster'] == j]
        bins = np.histogram_bin_edges(in_cluster.iloc[:, i], bins=10)

        plt.hist(left[left['cluster'] == j].iloc[:,i], bins=bins, alpha=0.5, label='Left')
        plt.hist(stay[stay['cluster'] == j].iloc[:,i], bins=bins, alpha=0.5, label='Stayed')

        plt.legend()
        # Title is "<feature>_<cluster id>".
        plt.title(left.columns[i]+'_%i' %j)
        plt.show()

In [None]:
# Per-cluster grouped bar charts: client counts by churn outcome for each
# level of every feature listed in category_cols.
for i in category_cols:
    feature = df.columns[i]
    # Use the levels seen in the whole data set so every cluster's chart has
    # the same bars, even when a level does not occur inside that cluster.
    levels = sorted(df[feature].dropna().unique())
    # Each group of bars spans 0.4 on the x axis (two levels -> -0.1 / +0.1).
    width = 0.4 / max(len(levels), 1)

    for j in category_cat:
        # Count non-null 'Age' values per (Churn, level) cell within the cluster.
        pvt = pd.pivot_table(data=df[df['cluster'] == j],
                             index='Churn',
                             columns=feature,
                             values='Age',
                             aggfunc='count',
                             fill_value=0)
        # Guarantee both outcomes and all levels are present; missing cells become 0.
        pvt = pvt.reindex(index=[0, 1], columns=levels, fill_value=0)

        for k, level in enumerate(levels):
            offset = (k - (len(levels) - 1) / 2) * width
            bars = plt.bar(x=pvt.index.to_numpy() + offset,
                           height=pvt[level],
                           width=width,
                           label=f'{feature}_{level}')
            plt.bar_label(bars)

        plt.xticks(ticks=[0,1], labels=['остались', 'отток'])
        plt.legend()
        # Title is "<feature>_<cluster id>".
        plt.title(df.columns[i]+'_%i' %j)
        plt.show()

In [None]:
# Correlation heat map of df as it stands here, i.e. including the 'cluster' column.
# NOTE(review): the cluster id is a nominal label, so its correlation row is
# hard to interpret — confirm it is meant to be included.
cor = df.corr()
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(data=cor, annot=True, fmt='.2f', ax=ax)

In [None]:
# Column-wise means per cluster, to compare the clusters' profiles.
df.groupby(by='cluster').mean()