In [1]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the churn data set from the comma-separated file; row 0 of the file
# is used as the header.
churn = pd.read_csv(
    'churn.csv',
    sep=',',
    header=0,
)

# Normalise the column labels: spaces become underscores, apostrophes are
# removed, leading/trailing question marks are stripped, and finally each
# label is lower-cased.
normalised_labels = (
    churn.columns
    .str.replace(' ', '_')
    .str.replace("'", "")
    .str.strip('?')
)
churn.columns = [label.lower() for label in normalised_labels]

# The target and the two plan flags are stored as text, so recode each one as
# an integer indicator usable by a numerical model: 1 where the cell equals
# the listed label, 0 for anything else.
for binary_column, positive_label in (
    ('churn', 'True.'),
    ('intl_plan', 'yes'),
    ('vmail_plan', 'yes'),
):
    churn[binary_column] = np.where(churn[binary_column] == positive_label, 1, 0)
# Response variable for the logistic regression: the churn column.
dependent_variable = churn['churn']

# Predictor columns, in the order they will appear in the design matrix.
# NOTE(review): the recorded fit summaries further down report very large
# standard errors for day_mins and day_charge, which suggests those two
# columns are nearly collinear; presumably the other *_mins / *_charge pairs
# behave the same way -- confirm, and consider keeping only one of each pair.
predictor_columns = [
    'account_length', 'area_code', 'intl_plan', 'vmail_plan', 'vmail_message',
    'day_mins', 'day_calls', 'day_charge',
    'eve_mins', 'eve_calls', 'eve_charge',
    'night_mins', 'night_calls', 'night_charge',
    'intl_mins', 'intl_calls', 'intl_charge',
    'custserv_calls',
]
independent_variables = churn[predictor_columns]

In [2]:
# Show the first row of the predictor frame to check the selected columns.
independent_variables.head(n=1)

Unnamed: 0,account_length,area_code,intl_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,eve_mins,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls
0,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1


In [7]:
# Put the intercept column in front of the predictors. In the recorded output
# it shows up as "const" with the value 1.0 in every displayed row.
# (The original note stated that add_constant expects a 2-D input such as
# this DataFrame -- TODO confirm against the statsmodels documentation.)
independent_variables_with_constant = sm.add_constant(
    independent_variables,
    prepend=True,
)

# Preview the first three rows of the resulting design matrix.
independent_variables_with_constant.head(n=3)

Unnamed: 0,const,account_length,area_code,intl_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,eve_mins,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls
0,1.0,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,1.0,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,1.0,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0


In [8]:
# Specify the logit model of churn on the intercept plus predictors, then
# estimate it. fit() prints the optimiser's termination report.
logit_spec = sm.Logit(
    dependent_variable,
    independent_variables_with_constant,
)
logit_model = logit_spec.fit()

# Display the fit statistics and the coefficient table.
logit_model.summary()

Optimization terminated successfully.
         Current function value: 0.323821
         Iterations 7


0,1,2,3
Dep. Variable:,churn,No. Observations:,3333.0
Model:,Logit,Df Residuals:,3314.0
Method:,MLE,Df Model:,18.0
Date:,"Mon, 12 Aug 2024",Pseudo R-squ.:,0.2174
Time:,14:34:18,Log-Likelihood:,-1079.3
converged:,True,LL-Null:,-1379.1
Covariance Type:,nonrobust,LLR p-value:,9.95e-116

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-8.4417,0.926,-9.118,0.000,-10.256,-6.627
account_length,0.0008,0.001,0.597,0.550,-0.002,0.004
area_code,-0.0005,0.001,-0.363,0.716,-0.003,0.002
intl_plan,2.0456,0.146,14.038,0.000,1.760,2.331
vmail_plan,-2.0251,0.574,-3.527,0.000,-3.150,-0.900
vmail_message,0.0359,0.018,1.992,0.046,0.001,0.071
day_mins,-0.2567,3.275,-0.078,0.938,-6.675,6.162
day_calls,0.0032,0.003,1.159,0.246,-0.002,0.009
day_charge,1.5861,19.263,0.082,0.934,-36.169,39.342


# 표준화

In [10]:
# Standardise every predictor by hand: subtract the column mean, then divide
# by the column standard deviation as computed by pandas' .std().
feature_means = independent_variables.mean()
feature_stds = independent_variables.std()
independent_variables_standardized = (
    independent_variables.sub(feature_means).div(feature_stds)
)

# Prepend the intercept column to the standardised predictors.
independent_variables_with_constant_standardized = sm.add_constant(
    independent_variables_standardized,
    prepend=True,
)

# Fit the logistic regression on the standardised design matrix.
standardized_logit_spec = sm.Logit(
    dependent_variable,
    independent_variables_with_constant_standardized,
)
logit_model_manual_standardized = standardized_logit_spec.fit()

# Display the summary. The recorded output reports the same log-likelihood
# (-1079.3) as the unstandardised fit; the coefficients are on a different scale.
logit_model_manual_standardized.summary()

Optimization terminated successfully.
         Current function value: 0.323821
         Iterations 10


0,1,2,3
Dep. Variable:,churn,No. Observations:,3333.0
Model:,Logit,Df Residuals:,3314.0
Method:,MLE,Df Model:,18.0
Date:,"Mon, 12 Aug 2024",Pseudo R-squ.:,0.2174
Time:,10:00:53,Log-Likelihood:,-1079.3
converged:,True,LL-Null:,-1379.1
Covariance Type:,nonrobust,LLR p-value:,9.95e-116

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.3216,0.072,-32.101,0.000,-2.463,-2.180
account_length,0.0331,0.055,0.597,0.550,-0.076,0.142
area_code,-0.0202,0.056,-0.363,0.716,-0.129,0.089
intl_plan,0.6053,0.043,14.038,0.000,0.521,0.690
vmail_plan,-0.9060,0.257,-3.527,0.000,-1.409,-0.403
vmail_message,0.4911,0.247,1.992,0.046,0.008,0.974
day_mins,-13.9812,178.368,-0.078,0.938,-363.576,335.614
day_calls,0.0643,0.055,1.159,0.246,-0.044,0.173
day_charge,14.6868,178.368,0.082,0.934,-334.909,364.282
