In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [27]:
## dataSet setting
churn = pd.read_csv('C:/Users/KIHyuk/Documents/GitHub/Jupyter_Repo/Data/churn.csv',sep=',',header=0)

In [28]:
churn.columns = [heading.lower() for heading in churn.columns.str.replace(' ','_').str.replace("\'","").str.strip('?')]

## np.where(조건문, true변경값, false변경값)
churn['churn01'] = np.where(churn['churn'] == 'True.', 1., 0.) 

churn.head()

Unnamed: 0,state,account_length,area_code,phone,intl_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,...,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,churn,churn01
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,16.78,244.7,91,11.01,10.0,3,2.7,1,False.,0.0
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,16.62,254.4,103,11.45,13.7,3,3.7,1,False.,0.0
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,10.3,162.6,104,7.32,12.2,5,3.29,0,False.,0.0
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,5.26,196.9,89,8.86,6.6,7,1.78,2,False.,0.0
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,12.61,186.9,121,8.41,10.1,3,2.73,3,False.,0.0


In [29]:
## churn 에 따른 그룹화

## 그룹별 기술통계
print(churn.groupby(['churn'])[['day_charge','eve_charge','night_charge','intl_charge','account_length','custserv_calls']].agg(['count','mean','std']))

       day_charge                       eve_charge                       \
            count       mean        std      count       mean       std   
churn                                                                     
False.       2850  29.780421   8.530835       2850  16.918909  4.274863   
True.         483  35.175921  11.729710        483  18.054969  4.396762   

       night_charge                     intl_charge                      \
              count      mean       std       count      mean       std   
churn                                                                     
False.         2850  9.006074  2.299768        2850  2.743404  0.751784   
True.           483  9.235528  2.121081         483  2.889545  0.754152   

       account_length                       custserv_calls                      
                count        mean       std          count      mean       std  
churn                                                                           
False

In [30]:
## 변수별 통계량
print(churn.groupby(['churn']).agg({'day_charge' : ['mean', 'std'],
                             'eve_charge' : ['mean','std'],
                             'night_charge' : ['mean','std'],
                             'intl_charge' : ['mean','std'],
                             'account_length' : ['count','min','max'],
                             'custserv_calls' : ['count','min','max']}))

       day_charge            eve_charge           night_charge            \
             mean        std       mean       std         mean       std   
churn                                                                      
False.  29.780421   8.530835  16.918909  4.274863     9.006074  2.299768   
True.   35.175921  11.729710  18.054969  4.396762     9.235528  2.121081   

       intl_charge           account_length          custserv_calls          
              mean       std          count min  max          count min max  
churn                                                                        
False.    2.743404  0.751784           2850   1  243           2850   0   8  
True.     2.889545  0.754152            483   1  225            483   0   9  


In [31]:
## total_charges(새로운 변수) 기준 그룹화 + 그룹별 통계량

churn['total_charges'] = churn['day_charge'] + churn['eve_charge'] + churn['night_charge'] + churn['intl_charge']

factor_cut = pd.cut(churn.total_charges,5,precision=2) ## cut, total_charges 변수의 데이터를 폭이 같은 5개 구간으로 나눔 

def get_stats(group): ## 그룹 통계량 반환 함수
    return {'min' : group.min(), 'max' : group.max(),
           'count' : group.count(), 'mean' : group.mean(),
           'std' : group.std()}

grouped = churn.groupby(factor_cut)['custserv_calls']

print(grouped.apply(get_stats).unstack())
# print(grouped.apply(get_stats))

                min  max   count      mean       std
total_charges                                       
(22.86, 37.57]  0.0  5.0    70.0  1.528571  1.348337
(37.57, 52.22]  0.0  7.0   742.0  1.564690  1.305234
(52.22, 66.86]  0.0  9.0  1726.0  1.581692  1.326646
(66.86, 81.51]  0.0  9.0   735.0  1.523810  1.295209
(81.51, 96.15]  0.0  5.0    60.0  1.516667  1.359108


In [32]:
factor_qcut = pd.qcut(churn.account_length, 4)
grouped = churn.custserv_calls.groupby(factor_qcut)
print(grouped.apply(get_stats).unstack())

                min  max  count      mean       std
account_length                                     
(0.999, 74.0]   0.0  9.0  857.0  1.506418  1.251268
(74.0, 101.0]   0.0  7.0  847.0  1.604486  1.359888
(101.0, 127.0]  0.0  8.0  803.0  1.652553  1.358479
(127.0, 243.0]  0.0  9.0  826.0  1.491525  1.286970


In [33]:
# intl_plan와 vmail_plan 열에 대한 이진형 지시변수를 만들고,
# churn열과 병합하여 새로운 데이터 프레임을 생성하기

intl_dummies = pd.get_dummies(churn['intl_plan'],prefix='intl_plan')
vmail_dummies = pd.get_dummies(churn['vmail_plan'],prefix='vmail_plan')

churn_with_dummies = churn[['churn']].join([intl_dummies,vmail_dummies])
print(churn_with_dummies.head())

    churn  intl_plan_no  intl_plan_yes  vmail_plan_no  vmail_plan_yes
0  False.             1              0              0               1
1  False.             1              0              0               1
2  False.             1              0              1               0
3  False.             0              1              1               0
4  False.             0              1              1               0


In [34]:
# total_charges를 사분위수로 분할하고, 이진형 지시변수를 만들고,
# 새로운 더미변수를 churn dataFrame에 추가하기

qcut_names = ['1st_quartile', '2nd_quartile', '3rd_quartile', '4th_quartile']
total_charges_quartiles = pd.qcut(churn['total_charges'] ,4 ,labels=qcut_names)

dummies = pd.get_dummies(total_charges_quartiles, prefix='total_charges')
churn_with_dummies = churn.join(dummies)
churn_with_dummies.head()

Unnamed: 0,state,account_length,area_code,phone,intl_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,...,intl_calls,intl_charge,custserv_calls,churn,churn01,total_charges,total_charges_1st_quartile,total_charges_2nd_quartile,total_charges_3rd_quartile,total_charges_4th_quartile
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,3,2.7,1,False.,0.0,75.56,0,0,0,1
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,3,3.7,1,False.,0.0,59.24,0,1,0,0
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,5,3.29,0,False.,0.0,62.29,0,0,1,0
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,7,1.78,2,False.,0.0,66.8,0,0,0,1
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,3,2.73,3,False.,0.0,52.09,1,0,0,0


In [35]:
# pivot table
churn.pivot_table(['total_charges'],index=['churn', 'custserv_calls'])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_charges
churn,custserv_calls,Unnamed: 2_level_1
False.,0,58.429752
False.,1,58.164391
False.,2,57.534241
False.,3,58.797195
False.,4,64.318
False.,5,66.303077
False.,6,62.15
False.,7,64.6775
False.,8,64.67
True.,0,69.601087


In [36]:
churn.pivot_table(['total_charges'],index=['churn'],columns=['custserv_calls'])

Unnamed: 0_level_0,total_charges,total_charges,total_charges,total_charges,total_charges,total_charges,total_charges,total_charges,total_charges,total_charges
custserv_calls,0,1,2,3,4,5,6,7,8,9
churn,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
False.,58.429752,58.164391,57.534241,58.797195,64.318,66.303077,62.15,64.6775,64.67,
True.,69.601087,70.723443,69.39908,68.931136,55.374474,52.8455,49.714286,50.578,52.73,70.39


In [37]:
churn.pivot_table(['total_charges'],index=['custserv_calls'],columns=['churn'],aggfunc='mean',fill_value='NaN',margins=True)

Unnamed: 0_level_0,total_charges,total_charges,total_charges
churn,False.,True.,All
custserv_calls,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,58.4298,69.601087,59.904304
1,58.1644,70.723443,59.46177
2,57.5342,69.39908,58.894242
3,58.7972,68.931136,59.836573
4,64.318,55.374474,60.223373
5,66.3031,52.8455,58.14697
6,62.15,49.714286,54.236364
7,64.6775,50.578,56.844444
8,64.67,52.73,58.7
9,,70.39,70.39


In [43]:
# modeling

dependent_variable = churn['churn01']
independent_variables = churn[['account_length','custserv_calls','total_charges']] 
independent_variables_with_constant = sm.add_constant(independent_variables,prepend=True) ##

logit_model = sm.Logit(dependent_variable, independent_variables_with_constant).fit()

Optimization terminated successfully.
         Current function value: 0.363480
         Iterations 7


In [14]:
print(logit_model.summary())

                           Logit Regression Results                           
Dep. Variable:                churn01   No. Observations:                 3333
Model:                          Logit   Df Residuals:                     3329
Method:                           MLE   Df Model:                            3
Date:                Wed, 18 Dec 2019   Pseudo R-squ.:                  0.1216
Time:                        02:07:10   Log-Likelihood:                -1211.5
converged:                       True   LL-Null:                       -1379.1
Covariance Type:            nonrobust   LLR p-value:                 2.234e-72
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -7.2205      0.394    -18.309      0.000      -7.993      -6.448
account_length     0.0012      0.001      0.927      0.354      -0.001       0.004
custserv_calls     0.4443      0.037

In [15]:
logit_model.params

const            -7.220520
account_length    0.001222
custserv_calls    0.444323
total_charges     0.072914
dtype: float64

In [16]:
logit_model.bse

const             0.394363
account_length    0.001317
custserv_calls    0.036633
total_charges     0.005422
dtype: float64

In [17]:
def inverse_logit(model_formula):
    from math import exp
    return (1.0 / (1.0 + exp(-model_formula)))

at_means = float(logit_model.params[0]) + \
    float(logit_model.params[1])*float(churn['account_length'].mean()) + \
    float(logit_model.params[2])*float(churn['custserv_calls'].mean()) + \
    float(logit_model.params[3])*float(churn['total_charges'].mean())

In [18]:
print(churn['account_length'].mean())
print(churn['custserv_calls'].mean())
print(churn['total_charges'].mean())
print(at_means)
print('P of churn when ind. vars are their mean: %.3f' % inverse_logit(at_means))

101.06480648064806
1.5628562856285628
59.44975397539747
-2.0679167809476997
P of churn when ind. vars are their mean: 0.112


In [19]:
cust_serv_mean = float(logit_model.params[0]) + \
    float(logit_model.params[1])*float(churn['account_length'].mean()) + \
    float(logit_model.params[2])*float(churn['custserv_calls'].mean()) + \
    float(logit_model.params[3])*float(churn['total_charges'].mean())

cust_serv_mean_minus_one = float(logit_model.params[0]) + \
    float(logit_model.params[1])*float(churn['account_length'].mean()) + \
    float(logit_model.params[2])*float(churn['custserv_calls'].mean()-1.0) + \
    float(logit_model.params[3])*float(churn['total_charges'].mean())

In [20]:
print(cust_serv_mean)
print(churn['custserv_calls'].mean()-1.0)
print(cust_serv_mean_minus_one)
print('Probability of churn when account length changes by 1: %.3f' %(inverse_logit(cust_serv_mean)- inverse_logit(cust_serv_mean_minus_one)))

-2.0679167809476997
0.5628562856285628
-2.512239499484104
Probability of churn when account length changes by 1: 0.037


In [21]:
# 기존 데이터셋의 첫 10개 값을 가지고 '새로운' 관측값 데이터셋을 만듦

new_observations = churn.ix[churn.index.isin(range(10)),independent_variables.columns]
new_observations_with_constant = sm.add_constant(new_observations, prepend=True)
y_predicted = logit_model.predict(new_observations_with_constant)
y_predicted_rounded = [round(score,2) for score in y_predicted]

print(y_predicted_rounded)

[0.25, 0.09, 0.08, 0.2, 0.12, 0.1, 0.49, 0.03, 0.22, 0.24]


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
