# Library

In [1]:
# 기본 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 모델 라이브러리
from xgboost import XGBClassifier 
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Silence every warning for the rest of the session (blanket "ignore" filter).
import warnings
warnings.filterwarnings("ignore")

# Data: features and labels are both indexed by acc_id and sorted by it,
# so row i of `train` and row i of `train_label` share the same index order.
train = pd.read_csv("../preprocess/train_preprocessing.csv", index_col="acc_id")
train = train.sort_index()

train_label = pd.read_csv("../raw/train_label.csv", index_col="acc_id")
train_label = train_label.sort_index()

# train : 30,000 / val : 10,000 / test1 : 20,000

In [2]:
# Hold out 25% of the labelled accounts for validation.
# The seed is fixed so the same rows land in each split on every run.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train,
    train_label,
    test_size=0.25,
    random_state=42,
)

# model1 : 결제 vs 미결제

In [3]:
# Model 1: binary classifier for "paid" vs "did not pay".
# Label encoding: 0 = no payment (amount_spent == 0), 1 = anything else.
import xgboost as xgb
from sklearn.metrics import confusion_matrix

y_train_1th = (y_train["amount_spent"] != 0).astype("int64").values
y_val_1th = (y_val["amount_spent"] != 0).astype("int64").values

# Build the estimator first, then fit it on the training split only.
model1 = xgb.XGBRFClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=3,
    max_delta_step=3,
    gamma=0.5,
)
model1.fit(X_train, y_train_1th)

# Accuracy on both splits.
print("[결제 vs 미결제]")
acc_train_1th = model1.score(X_train, y_train_1th)
print("train of Accuracy :", acc_train_1th)
acc_val_1th = model1.score(X_val, y_val_1th)
print("val of Accuracy :", acc_val_1th)

# Confusion matrices (rows = true class, columns = predicted class).
print("\n[train counfusion matrix]")
cm_train_1th = confusion_matrix(y_train_1th, model1.predict(X_train))
print(cm_train_1th)
print("\n[val counfusion matrix]")
cm_val_1th = confusion_matrix(y_val_1th, model1.predict(X_val))
print(cm_val_1th)

[결제 vs 미결제]
train of Accuracy : 0.7261333333333333
val of Accuracy : 0.7126

[train counfusion matrix]
[[ 5050  7219]
 [  997 16734]]

[val counfusion matrix]
[[1641 2528]
 [ 346 5485]]


In [4]:
# Partition every account in `train` by model1's predicted payment class.
# `train` is the full frame that was split into X_train / X_val, so this
# covers both the training and the validation rows.
# Predict once and reuse the result; the model output is identical for both
# masks, so running inference twice only costs time.
pred_paid_1th = model1.predict(train)
train_non_money = train[pred_paid_1th == 0]  # predicted: no payment
train_money = train[pred_paid_1th == 1]      # predicted: payment

# model2 : 생존 vs 이탈

In [5]:
# Model 2: binary classifier for "survived" vs "churned".
# Label encoding: 0 = churned (survival_time < 64), 1 = anything else.
import xgboost as xgb
from sklearn.metrics import confusion_matrix

y_train_2th = (~y_train["survival_time"].lt(64)).astype("int64").values
y_val_2th = (~y_val["survival_time"].lt(64)).astype("int64").values

# Same hyper-parameters as model1; fitted on the training split only.
model2 = xgb.XGBRFClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=3,
    max_delta_step=3,
    gamma=0.5,
)
model2.fit(X_train, y_train_2th)

# Accuracy on both splits.
print("[생존 vs 이탈]")
acc_train_2th = model2.score(X_train, y_train_2th)
print("train of Accuracy :", acc_train_2th)
acc_val_2th = model2.score(X_val, y_val_2th)
print("val of Accuracy :", acc_val_2th)

# Confusion matrices (rows = true class, columns = predicted class).
print("\n[train counfusion matrix]")
cm_train_2th = confusion_matrix(y_train_2th, model2.predict(X_train))
print(cm_train_2th)
print("\n[val counfusion matrix]")
cm_val_2th = confusion_matrix(y_val_2th, model2.predict(X_val))
print(cm_val_2th)

[생존 vs 이탈]
train of Accuracy : 0.6831666666666667
val of Accuracy : 0.6837

[train counfusion matrix]
[[ 7112  6315]
 [ 3190 13383]]

[val counfusion matrix]
[[2439 2138]
 [1025 4398]]


In [6]:
# Among accounts predicted to pay, separate predicted survivors from
# predicted churners using model2.
# Predict once and reuse the result for both masks instead of running
# inference twice on the same frame.
pred_survive_2th = model2.predict(train_money)
train_money_survival = train_money[pred_survive_2th == 1]  # predicted: survives
train_money_exit = train_money[pred_survive_2th == 0]      # predicted: churns

# 유저군 분리

In [7]:
# Report the size of each predicted segment.
# The total is taken from `train` itself rather than a hard-coded "40,000";
# the three segments below partition `train`, so their sizes sum to it.
print("Train User : {:,}".format(len(train)))
print("결제이력이 없는 유저 : {:,}".format(len(train_non_money)))
print("결제이력이 있는 생존 유저 : {:,}".format(len(train_money_survival)))
print("결제이력이 있는 이탈 유저  : {:,}".format(len(train_money_exit)))

Train User : 40,000
결제이력이 없는 유저 : 8,034
결제이력이 있는 생존 유저 : 19,661
결제이력이 있는 이탈 유저  : 12,305


# 결제이력이 있고 이탈한 유저

In [8]:
# Models 3 and 4: regressors trained only on accounts whose true labels say
# they both paid (amount_spent != 0) and churned (survival_time < 64).
import xgboost as xgb

# One combined boolean mask per split. The original chained two .loc calls,
# where the second mask was indexed on the unfiltered frame and only worked
# through implicit index alignment.
y_train_3th = y_train.loc[(y_train["survival_time"] < 64) & (y_train["amount_spent"] != 0)]
y_val_3th = y_val.loc[(y_val["survival_time"] < 64) & (y_val["amount_spent"] != 0)]
# Features are passed as plain arrays; later predictions must do the same.
X_train_3th = X_train.loc[y_train_3th.index].values
X_val_3th = X_val.loc[y_val_3th.index].values

# Payment-amount model: trained on log(amount_spent). The log is defined
# because rows with amount_spent == 0 were filtered out above
# (assumes amounts are positive; confirm against the label file).
log_amount_train_3th = np.log(y_train_3th["amount_spent"])
log_amount_val_3th = np.log(y_val_3th["amount_spent"])
model3 = xgb.XGBRegressor(random_state=42,
                          objective="reg:squarederror").fit(X_train_3th, log_amount_train_3th)

# Score of the amount model on the log-scale target.
print("[결제 금액 결정계수]")
print("train of Accuracy :", model3.score(X_train_3th, log_amount_train_3th))
print("val of Accuracy :", model3.score(X_val_3th, log_amount_val_3th))

# Churn-timing model: trained on the raw survival_time.
model4 = xgb.XGBRegressor(random_state=42,
                          objective="reg:squarederror").fit(X_train_3th, y_train_3th["survival_time"])

# Score of the churn-timing model. The header used to repeat the
# payment-amount title (copy-paste slip), which mislabelled these numbers.
print("[이탈 시점 결정계수]")
print("train of Accuracy :", model4.score(X_train_3th, y_train_3th["survival_time"]))
print("val of Accuracy :", model4.score(X_val_3th, y_val_3th["survival_time"]))

[결제 금액 결정계수]
train of Accuracy : 0.1531524443777713
val of Accuracy : 0.06849031977922204
[결제 금액 결정계수]
train of Accuracy : 0.1873940776085552
val of Accuracy : 0.10414882140069726


# 최종 예측 결과

In [9]:
def _build_result(acc_ids, amount_spent, survival_time):
    """Return a prediction frame indexed by acc_id and sorted by that index.

    acc_ids       -- account ids, one per output row
    amount_spent  -- scalar, or array aligned positionally with acc_ids
    survival_time -- scalar, or array aligned positionally with acc_ids
    """
    frame = pd.DataFrame()
    frame["acc_id"] = acc_ids
    frame["amount_spent"] = amount_spent
    frame["survival_time"] = survival_time
    return frame.set_index("acc_id").sort_index()


# Predicted non-payers: zero spend, survival_time fixed at 64.
result_0 = _build_result(train_non_money.index, 0, 64)

# Predicted payers who survive: constant spend of 0.05, survival_time 64.
# NOTE(review): the rationale for 0.05 is not shown in this notebook.
result_1 = _build_result(train_money_survival.index, 0.05, 64)

# Predicted payers who churn: both targets come from the regressors.
# model3 was trained on log(amount_spent), so its output is exponentiated.
# model4's survival_time is rounded to a whole number.
exit_features = train_money_exit.values
exit_amount = np.exp(model3.predict(exit_features))
exit_survival = np.round(model4.predict(exit_features))
result_2 = _build_result(train_money_exit.index, exit_amount, exit_survival)

# Final result: stack the three segments, order by acc_id, and save.
train_predict = pd.concat([result_0, result_1, result_2]).sort_index()
train_predict.to_csv("../predict/train_predict.csv", header=True, index=True)