# Импорт библиотек

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

%matplotlib inline

# Подготовка исходных данных

In [3]:
# Load the raw churn table from a CSV file located in the working directory.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
# Number of rows in the loaded table.
len(df)

In [5]:
# Preview the first rows of the table.
df.head()

In [6]:
# Same preview, transposed: each column of the table is shown as a row.
df.head().T

In [7]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [8]:
# Parse TotalCharges as numbers; with errors='coerce' any entry that cannot
# be parsed becomes NaN instead of raising.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
total_charges

In [9]:
# Show the rows whose TotalCharges could not be parsed, next to the customer id.
df.loc[total_charges.isnull(), ['customerID', 'TotalCharges']]

In [10]:
# Store TotalCharges as a numeric column; entries that fail to parse
# (NaN after coercion) are replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
df['TotalCharges']

In [11]:
# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Collect every text column. Besides the classic 'object' dtype this also
# matches pandas' dedicated string dtype; checking only for 'object' would
# find nothing when text columns are stored as StringDtype, and the values
# would then silently stay un-normalised.
string_columns = [
    col
    for col, dtype in df.dtypes.items()
    if dtype == 'object' or isinstance(dtype, pd.StringDtype)
]

# Apply the same normalisation to the values of the text columns.
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [12]:
# Encode the target as integers: 1 where churn == 'yes', 0 otherwise.
# The guard keeps the cell idempotent: once the column is numeric, comparing
# it with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as df_test; random_state is fixed so the split
# is the same on every run.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [14]:
# Preview the first rows of the non-test part.
df_train_full.head()

In [15]:
# Split the non-test rows once more: 33% of them become the validation set.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Target vectors as plain arrays.
y_train = df_train['churn'].values
y_val = df_val['churn'].values

# Remove the target from the feature frames.
df_train = df_train.drop(columns=['churn'])
df_val = df_val.drop(columns=['churn'])

# Исследовательский анализ данных

In [16]:
# Count missing values per column.
df_train_full.isnull().sum()

In [17]:
# How many rows fall into each value of the target.
df_train_full.churn.value_counts()

In [18]:
# The mean of the churn column is the overall churn rate
# (the column is encoded as 0/1 earlier in the notebook).
global_mean = df_train_full['churn'].mean()
round(global_mean, 3)

In [19]:
# Columns used as categorical features.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Columns used as numerical features.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [20]:
# Number of distinct values in each categorical column.
df_train_full[categorical].nunique()

In [23]:
# Churn rate among rows with gender == 'female'.
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()
print('gender == female:', round(female_mean, 3))

# Churn rate among rows with gender == 'male'.
male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()
print('gender == male:', round(male_mean, 3))

In [24]:
# Churn rate among rows with partner == 'yes'.
partner_yes = df_train_full.loc[df_train_full['partner'] == 'yes', 'churn'].mean()
print('partner == yes:', round(partner_yes, 3))

# Churn rate among rows with partner == 'no'.
partner_no = df_train_full.loc[df_train_full['partner'] == 'no', 'churn'].mean()
print('partner == no:', round(partner_no, 3))

In [27]:
# Overall churn rate, used as the baseline for every group.
global_mean = df_train_full['churn'].mean()

# Per-gender churn rate, with its difference from and ratio to the baseline.
df_group = (
    df_train_full
    .groupby('gender')['churn']
    .agg(['mean'])
    .assign(
        diff=lambda g: g['mean'] - global_mean,
        risk=lambda g: g['mean'] / global_mean,
    )
)

df_group

In [28]:
from IPython.display import display

# One table per categorical column: churn rate of every value, plus its
# difference from and ratio to the overall rate (global_mean).
for col in categorical:
    df_group = df_train_full.groupby(col)['churn'].agg(['mean'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_mean,
        risk=df_group['mean'] / global_mean,
    )

    display(df_group)

# Взаимная информация

In [30]:
from sklearn.metrics import mutual_info_score


def calculate_mi(series):
    """Return the mutual information score between ``series`` and the
    ``churn`` column of ``df_train_full`` (read from the notebook namespace)."""
    return mutual_info_score(series, df_train_full.churn)


# Score every categorical column and list them from highest to lowest.
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)

df_mi

# Коэффициент корреляции

In [32]:
# Correlation of each numerical column with the churn column.
df_train_full[numerical].corrwith(df_train_full.churn)

# Конструирование признаков

In [40]:
# Turn the training rows into a list of {column: value} dicts, one per row,
# restricted to the selected feature columns.
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [42]:
# Inspect the dict built for the first training row.
train_dict[0]

In [43]:
from sklearn.feature_extraction import DictVectorizer

# sparse=False asks the vectorizer for dense array output.
dv = DictVectorizer(sparse=False)
# Fit on the training records only.
dv.fit(train_dict)

In [44]:
# Feature matrix for the training records.
X_train = dv.transform(train_dict)

In [47]:
# Encoded feature vector of the first training row.
X_train[0]

In [49]:
# Names of the columns of the encoded feature matrix.
dv.get_feature_names_out()

# Машинное обучение для классификации

## Обучение логистической регрессии

In [50]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver; random_state is fixed for
# repeatable results.
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [52]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no re-fitting).
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)


In [54]:
# Keep the second column of predict_proba: with the 0/1 target used here it
# is the predicted probability of class 1 (churn).
y_pred = model.predict_proba(X_val)[:, 1]

In [55]:
# Hard decision: predict churn when the probability is at least 0.5.
churn = y_pred >= 0.5

In [56]:
# Share of validation rows where the decision matches the label (accuracy).
(y_val == churn).mean()

## Интерпретация модели

In [57]:
# Intercept (bias term) of the fitted model.
bias = model.intercept_[0]
bias

In [59]:
# Pair every encoded feature name with its coefficient, rounded to 3 decimals.
w = dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))
w

In [71]:
# A reduced feature set, encoded with its own vectorizer.
small_subset = ['contract', 'tenure', 'totalcharges']
train_dict_small = df_train[small_subset].to_dict(orient='records')

# Fit the vectorizer and encode the training records in a single call.
dv_small = DictVectorizer(sparse=False)
X_small_train = dv_small.fit_transform(train_dict_small)

In [72]:
# Names of the encoded columns for the reduced feature set.
dv_small.get_feature_names_out()

In [73]:
# Same model configuration as the full model, trained on the reduced feature matrix.
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y_train)

In [74]:
# Intercept (bias term) of the small model.
model_small.intercept_[0]

In [75]:
# Map every encoded feature name of the small model to its coefficient,
# rounded to 3 decimals.
dict(
    zip(
        dv_small.get_feature_names_out(),
        model_small.coef_[0].round(3),
    )
)

## Использование модели

In [76]:
# A single customer record in the same {column: value} form as the training
# dicts. It also carries 'customerid', which is not one of the columns listed
# in categorical/numerical.
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [77]:
# Encode the single customer; transform expects a list of records, hence the
# one-element list.
X_test = dv.transform([customer])

In [78]:
# Predicted probability of class 1 (churn) for this customer:
# row 0, second column of predict_proba.
model.predict_proba(X_test)[0, 1]

In [80]:
# A second customer record, replacing the previous `customer`.
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7,
}

In [81]:
# Encode the current `customer` record and read the predicted probability of
# class 1 (churn): row 0, second column of predict_proba.
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]