# 기준데이터 테스트 (A테스트)

In [24]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the churn data set; the first CSV row supplies the column names.
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces become underscores, apostrophes are removed,
# leading/trailing '?' characters are stripped and everything is lower-cased.
churn.columns = (
    churn.columns
    .str.replace(' ', '_')
    .str.replace("'", "")
    .str.strip('?')
    .str.lower()
)

# Encode the target as 1 where the raw value is the string 'True.', else 0.
churn['churn'] = np.where(churn['churn'] == 'True.', 1, 0)

# Derived column: sum of the four charge columns.
churn['total_charges'] = (
    churn['day_charge']
    + churn['eve_charge']
    + churn['night_charge']
    + churn['intl_charge']
)

# Encode the two plan columns as 1 where the raw value is 'yes', else 0.
for plan_column in ('intl_plan', 'vmail_plan'):
    churn[plan_column] = np.where(churn[plan_column] == 'yes', 1, 0)

# Target of the logistic regression.
dependent_variable = churn['churn']

# Baseline data set A: the R-recommended predictor columns.
independent_variables = churn[[
    'intl_plan',
    'vmail_plan',
    'vmail_message',
    'day_charge',
    'eve_mins',
    'night_charge',
    'intl_calls',
    'intl_charge',
    'custserv_calls',
]]

In [25]:
# Preview the first rows of the selected predictor columns.
independent_variables.head()

Unnamed: 0,intl_plan,vmail_plan,vmail_message,day_charge,eve_mins,night_charge,intl_calls,intl_charge,custserv_calls
0,0,1,25,45.07,197.4,11.01,3,2.7,1
1,0,1,26,27.47,195.5,11.45,3,3.7,1
2,0,0,0,41.38,121.2,7.32,5,3.29,0
3,1,0,0,50.9,61.9,8.86,7,1.78,2
4,1,0,0,28.34,148.3,8.41,3,2.73,3


In [26]:
# Add the constant (intercept) column.
# Unused alternative column selection, kept commented out:
# independent_variables = churn[['intl_plan', 'vmail_plan', 'vmail_message', 'day', 'eve', 'night','intl','custserv_calls','churn']]
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
# Preview the result; the table below shows 'const' as the first column.
independent_variables_with_constant.head()

Unnamed: 0,const,intl_plan,vmail_plan,vmail_message,day_charge,eve_mins,night_charge,intl_calls,intl_charge,custserv_calls
0,1.0,0,1,25,45.07,197.4,11.01,3,2.7,1
1,1.0,0,1,26,27.47,195.5,11.45,3,3.7,1
2,1.0,0,0,0,41.38,121.2,7.32,5,3.29,0
3,1.0,1,0,0,50.9,61.9,8.86,7,1.78,2
4,1.0,1,0,0,28.34,148.3,8.41,3,2.73,3


In [27]:
# Fit the logistic regression of the churn flag on the predictors plus the
# constant column; fitting prints the optimiser summary shown below.
logit_model = sm.Logit(
    dependent_variable,
    independent_variables_with_constant,
).fit()

# Re-select the same predictor columns from the full frame as the rows to score.
new_observatios = churn[independent_variables.columns]

Optimization terminated successfully.
         Current function value: 0.324276
         Iterations 7


In [28]:
# Dimensions (rows, columns) of the frame selected for prediction.
new_observatios.shape

(3333, 9)

In [29]:
# Add the constant column to the prediction inputs, placed first (prepend=True).
new_observatios_with_constant = sm.add_constant(new_observatios, prepend=True)

In [30]:
# Check which type add_constant returned for the prediction inputs.
type(new_observatios_with_constant)

pandas.core.frame.DataFrame

In [31]:
# Dimensions after adding the constant column.
new_observatios_with_constant.shape

(3333, 10)

In [32]:
# Score the fitted model. NOTE: new_observatios_with_constant is built from the
# same churn rows the model was fitted on, so this is in-sample accuracy.
y_predicted = logit_model.predict(new_observatios_with_constant)

# Round every predicted probability to its nearest class label. The result is
# kept as a list of plain floats so the preview still prints "0.0" / "1.0".
y_predicted_rounded = np.round(np.asarray(y_predicted, dtype=float)).tolist()

# Take the actual labels by position so the comparison does not rely on the
# Series index labels being 0..n-1.
actual_labels = dependent_variable.tolist()

total_number = len(y_predicted_rounded)
total_correct = int(
    (np.asarray(y_predicted_rounded) == np.asarray(actual_labels)).sum()
)

# Preview of the first 20 rows: row number | predicted label | actual label.
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:20], actual_labels), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')

1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|1.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|1.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2877
정답률: 86.31863186318633 %


* 기준데이터 테스트 결과: 86.31863186318633 %

# 비교 테스트1 (테스트 B)
* 가설: 요금 관련 열을 합산한 신규 열(`total_charges`)을 추가하면 일반적으로 성능이 향상된다고 하므로, 실제로 향상되는지 확인한다.

In [33]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Read the data set into a pandas DataFrame (first CSV row = column names).
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces become underscores, apostrophes are removed,
# leading/trailing '?' characters are stripped and everything is lower-cased.
churn.columns = [
    heading.lower()
    for heading in churn.columns.str.replace(' ', '_').str.replace("'", "").str.strip('?')
]
# Encode the target as 1 where the raw value is the string 'True.', else 0.
churn['churn'] = np.where(churn['churn'] == 'True.', 1, 0)
# Derived column: sum of the four charge columns.
churn['total_charges'] = (
    churn['day_charge'] + churn['eve_charge']
    + churn['night_charge'] + churn['intl_charge']
)
# Encode the two plan columns as 1 where the raw value is 'yes', else 0.
churn['intl_plan'] = np.where(churn['intl_plan'] == 'yes', 1, 0)
churn['vmail_plan'] = np.where(churn['vmail_plan'] == 'yes', 1, 0)

# Fit a logistic regression model
dependent_variable = churn['churn']

# R-recommended predictors plus the derived total_charges column.
independent_variables = churn[['intl_plan', 'vmail_plan', 'vmail_message', 'day_charge', 'eve_mins', 'night_charge', 'intl_calls', 'intl_charge', 'custserv_calls','total_charges']]
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
logit_model = sm.Logit(dependent_variable, independent_variables_with_constant).fit()

# Score the model on the same rows it was fitted on (in-sample evaluation).
new_observatios = churn.loc[:, independent_variables.columns]
new_observatios_with_constant = sm.add_constant(new_observatios, prepend=True)
y_predicted = logit_model.predict(new_observatios_with_constant)

# Round every predicted probability to its nearest class label. The result is
# kept as a list of plain floats so the preview still prints "0.0" / "1.0".
y_predicted_rounded = np.round(np.asarray(y_predicted, dtype=float)).tolist()

# Take the actual labels by position so the comparison does not rely on the
# Series index labels being 0..n-1.
actual_labels = dependent_variable.tolist()

total_number = len(y_predicted_rounded)
total_correct = int(
    (np.asarray(y_predicted_rounded) == np.asarray(actual_labels)).sum()
)

# Preview of the first 20 rows: row number | predicted label | actual label.
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:20], actual_labels), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')


Optimization terminated successfully.
         Current function value: 0.324244
         Iterations 8
1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|1.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|1.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2876
정답률: 86.28862886288628 %


* AB테스트  
`total_charges`를 추가하자 정답률이 86.3186 %(정답수 2877)에서 86.2886 %(정답수 2876)로 오히려 소폭 낮아졌으므로, `total_charges`는 성능 향상에 기여하지 않은 것으로 판단된다.

---

# 비교 테스트2 (테스트 B)
* 가설: 추천 열 중에서 회귀계수가 가장 낮은 'vmail_message' 독립변수를 제거하면 성능이 향상될 것이다.

In [34]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Read the data set into a pandas DataFrame (first CSV row = column names).
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces become underscores, apostrophes are removed,
# leading/trailing '?' characters are stripped and everything is lower-cased.
churn.columns = [
    heading.lower()
    for heading in churn.columns.str.replace(' ', '_').str.replace("'", "").str.strip('?')
]
# Encode the target as 1 where the raw value is the string 'True.', else 0.
churn['churn'] = np.where(churn['churn'] == 'True.', 1, 0)
# Derived column: sum of the four charge columns (not used as a predictor here).
churn['total_charges'] = (
    churn['day_charge'] + churn['eve_charge']
    + churn['night_charge'] + churn['intl_charge']
)
# Encode the two plan columns as 1 where the raw value is 'yes', else 0.
churn['intl_plan'] = np.where(churn['intl_plan'] == 'yes', 1, 0)
churn['vmail_plan'] = np.where(churn['vmail_plan'] == 'yes', 1, 0)

# Fit a logistic regression model
dependent_variable = churn['churn']

# R-recommended predictors with 'vmail_message' removed.
independent_variables = churn[['intl_plan', 'vmail_plan', 'day_charge', 'eve_mins', 'night_charge', 'intl_calls', 'intl_charge', 'custserv_calls']]
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
logit_model = sm.Logit(dependent_variable, independent_variables_with_constant).fit()

# Score the model on the same rows it was fitted on (in-sample evaluation).
new_observatios = churn.loc[:, independent_variables.columns]
new_observatios_with_constant = sm.add_constant(new_observatios, prepend=True)
y_predicted = logit_model.predict(new_observatios_with_constant)

# Round every predicted probability to its nearest class label. The result is
# kept as a list of plain floats so the preview still prints "0.0" / "1.0".
y_predicted_rounded = np.round(np.asarray(y_predicted, dtype=float)).tolist()

# Take the actual labels by position so the comparison does not rely on the
# Series index labels being 0..n-1.
actual_labels = dependent_variable.tolist()

total_number = len(y_predicted_rounded)
total_correct = int(
    (np.asarray(y_predicted_rounded) == np.asarray(actual_labels)).sum()
)

# Preview of the first 20 rows: row number | predicted label | actual label.
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:20], actual_labels), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')

Optimization terminated successfully.
         Current function value: 0.324860
         Iterations 7
1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|1.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|1.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2877
정답률: 86.31863186318633 %


# 미션

* 기준데이터 성능: 86.31863186318633 % 보다 더 높은 성능에 도달할 수 있는 독립변수 열을 AB테스트로 찾아보세요.

In [4]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Read the data set into a pandas DataFrame (first CSV row = column names).
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces become underscores, apostrophes are removed,
# leading/trailing '?' characters are stripped and everything is lower-cased.
churn.columns = [
    heading.lower()
    for heading in churn.columns.str.replace(' ', '_').str.replace("'", "").str.strip('?')
]
# Encode the target as 1 where the raw value is the string 'True.', else 0.
churn['churn'] = np.where(churn['churn'] == 'True.', 1, 0)
# Derived column: sum of the four charge columns (not used as a predictor here).
churn['total_charges'] = (
    churn['day_charge'] + churn['eve_charge']
    + churn['night_charge'] + churn['intl_charge']
)
# Encode the two plan columns as 1 where the raw value is 'yes', else 0.
churn['intl_plan'] = np.where(churn['intl_plan'] == 'yes', 1, 0)
churn['vmail_plan'] = np.where(churn['vmail_plan'] == 'yes', 1, 0)

# Fit a logistic regression model
dependent_variable = churn['churn']

# Candidate predictor set for the mission (total_charges is not included).
independent_variables = churn[ [
    "account_length",  # account length
    "intl_plan",       # international plan flag (1 = 'yes')
    "vmail_message",   # number of voice-mail messages
    "day_calls",       # number of daytime calls
    "eve_calls",       # number of evening calls
    "night_calls",     # number of night calls
    "intl_mins",       # international call minutes
    "intl_calls",      # number of international calls
    "intl_charge"      # international call charge
]]
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
logit_model = sm.Logit(dependent_variable, independent_variables_with_constant).fit()

# Score the model on the same rows it was fitted on (in-sample evaluation).
new_observatios = churn.loc[:, independent_variables.columns]
new_observatios_with_constant = sm.add_constant(new_observatios, prepend=True)
y_predicted = logit_model.predict(new_observatios_with_constant)

# Round every predicted probability to its nearest class label. The result is
# kept as a list of plain floats so the preview still prints "0.0" / "1.0".
y_predicted_rounded = np.round(np.asarray(y_predicted, dtype=float)).tolist()

# Take the actual labels by position so the comparison does not rely on the
# Series index labels being 0..n-1.
actual_labels = dependent_variable.tolist()

total_number = len(y_predicted_rounded)
total_correct = int(
    (np.asarray(y_predicted_rounded) == np.asarray(actual_labels)).sum()
)

# Preview of the first 20 rows: row number | predicted label | actual label.
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:20], actual_labels), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')

Optimization terminated successfully.
         Current function value: 0.379256
         Iterations 7
1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|0.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|0.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2891
정답률: 86.73867386738674 %
