# Customer Exit (Churn) Data for a Company

### The dataset contains the details of a company's customers.

https://www.kaggle.com/datasets/shubh0799/churn-modelling

![image.png](../Images/Service.png)

고객의 신상정보 데이터를 통한 회사 서비스 이탈 예측

### Library & Data Import

In [90]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [73]:
# Read the pre-split feature tables and the training labels, in that order.
X_train, X_test, y_train = map(
    pd.read_csv,
    (
        '../Datasets/Service_X_train.csv',
        '../Datasets/Service_X_test.csv',
        '../Datasets/Service_y_train.csv',
    ),
)

In [74]:
# Preview the training feature table.
X_train

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15799217,Zetticci,791,Germany,Female,35,7,52436.20,1,1,0,161051.75
1,15748986,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,15722004,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,15780966,Pritchard,709,France,Female,32,2,0.00,2,0,0,109681.29
4,15636731,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73
...,...,...,...,...,...,...,...,...,...,...,...,...
6494,15702806,Martin,696,Spain,Male,24,9,0.00,1,0,0,10883.52
6495,15674179,Vorobyova,513,Germany,Male,34,7,60515.13,1,0,0,124571.09
6496,15790204,Myers,663,Spain,Female,22,9,0.00,1,1,0,29135.89
6497,15690772,Hughes,635,Spain,Female,48,2,0.00,2,1,1,136551.25


In [75]:
# Preview the test feature table (same columns as the training features).
X_test

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15601012,Abdullah,802,France,Female,60,3,92887.06,1,1,0,39473.63
1,15734762,Ignatiev,602,France,Female,56,3,115895.22,3,1,0,4176.17
2,15586757,Anenechukwu,801,France,Female,32,4,75170.54,1,1,1,37898.50
3,15590888,Wade,693,Spain,Female,34,10,107556.06,2,0,0,154631.35
4,15726087,Ch'in,592,France,Female,62,5,0.00,1,1,1,100941.57
...,...,...,...,...,...,...,...,...,...,...,...,...
3496,15733966,Johnstone,496,Germany,Female,55,4,125292.53,1,1,1,31532.96
3497,15669994,Greece,556,Germany,Female,31,1,128663.81,2,1,0,125083.29
3498,15712403,McMillan,589,France,Female,61,1,0.00,1,1,0,61108.56
3499,15643819,Dawson,714,France,Female,25,4,0.00,2,0,0,82500.84


In [76]:
# Preview the training labels: one Exited flag per CustomerId.
y_train

Unnamed: 0,CustomerId,Exited
0,15799217,0
1,15748986,0
2,15722004,0
3,15780966,0
4,15636731,0
...,...,...
6494,15702806,0
6495,15674179,0
6496,15790204,1
6497,15690772,0


In [77]:
# Count the missing values in each training-feature column.
print(pd.isna(X_train).sum())

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64


In [78]:
# Count the missing values in each test-feature column.
print(pd.isna(X_test).sum())

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64


In [79]:
# CustomerId is a unique per-customer key, so it is not used as a model feature.
# The X_test ids are still needed when building the submission, so keep a copy first.
CustomerId = X_test['CustomerId'].copy()

# CustomerId and Surname are treated as uninformative for predicting Exited,
# so both are removed from the features; the key is removed from the labels too.
X_train = X_train.drop(['CustomerId', 'Surname'], axis=1)
X_test = X_test.drop(['CustomerId', 'Surname'], axis=1)
y_train = y_train.drop('CustomerId', axis=1)

In [80]:
# Categorical features: the object-dtype columns of each feature table.
X_train_cat = X_train.select_dtypes('object').copy()
X_test_cat = X_test.select_dtypes('object').copy()

# Fit the encoder on the training categories only, then reuse it for the test set
# so both matrices share the same column layout.
# The encoder keeps its default (sparse) output and is densified with .toarray():
# this avoids the `sparse=` keyword, which newer scikit-learn releases renamed to
# `sparse_output=`, and still yields the dense arrays np.concatenate needs later.
ohe = OneHotEncoder()
ohe.fit(X_train_cat)  # was `X_tran_cat`, an undefined name (NameError on a fresh kernel)

X_train_ohe = ohe.transform(X_train_cat).toarray()
X_test_ohe = ohe.transform(X_test_cat).toarray()

In [81]:
# Inspect the categorical (object-dtype) training columns that were one-hot encoded.
X_train_cat

Unnamed: 0,Geography,Gender
0,Germany,Female
1,Germany,Male
2,France,Female
3,France,Female
4,Germany,Female
...,...,...
6494,Spain,Male
6495,Germany,Male
6496,Spain,Female
6497,Spain,Female


In [82]:
# Numeric features: every column that is not of object dtype.
X_train_num = X_train.select_dtypes(exclude=['object']).copy()
X_test_num = X_test.select_dtypes(exclude=['object']).copy()

# Learn the scaling from the training data only, then apply the same
# fitted scaler to both the training and the test features.
scaler = MinMaxScaler().fit(X_train_num)

X_train_sca, X_test_sca = (
    scaler.transform(numeric_part) for numeric_part in (X_train_num, X_test_num)
)

In [83]:
# Inspect the numeric training columns (before scaling).
X_train_num

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,791,35,7,52436.20,1,1,0,161051.75
1,705,42,8,166685.92,2,1,1,55313.51
2,543,31,4,138317.94,1,0,0,61843.73
3,709,32,2,0.00,2,0,0,109681.29
4,714,36,1,101609.01,2,1,1,447.73
...,...,...,...,...,...,...,...,...
6494,696,24,9,0.00,1,0,0,10883.52
6495,513,34,7,60515.13,1,0,0,124571.09
6496,663,22,9,0.00,1,1,0,29135.89
6497,635,48,2,0.00,2,1,1,136551.25


In [84]:
# Place the one-hot columns and the scaled numeric columns side by side.
X_TRAIN = np.hstack((X_train_ohe, X_train_sca))
X_TEST = np.hstack((X_test_ohe, X_test_sca))

# Target vector: the Exited column of the label table.
y_TRAIN = y_train.loc[:, 'Exited']

# Sanity check: container types first, then shapes.
print(*(type(part) for part in (X_TRAIN, X_TEST, y_TRAIN)))
print(*(part.shape for part in (X_TRAIN, X_TEST, y_TRAIN)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
(6499, 15) (3501, 15) (6499,)


In [85]:
# Hold out 25% of the labelled rows for validation, stratified on the target
# and with a fixed seed so the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_TRAIN,
    y_TRAIN,
    test_size=0.25,
    stratify=y_TRAIN,
    random_state=2022,
)

print(*(part.shape for part in (xtrain, xtest, ytrain, ytest)))

(4874, 15) (1625, 15) (4874,) (1625,)


In [86]:
def make_models(xtrain, xtest, ytrain, ytest):
    """Fit a set of candidate classifiers and print one score line for each.

    Candidates, in the order they are fitted and reported:
      * ``model1`` - LogisticRegression with default settings
      * ``model2`` - DecisionTreeClassifier without a depth limit, then with
        ``max_depth`` 3 through 7
      * ``model3`` - RandomForestClassifier with 500 trees without a depth
        limit, then with ``max_depth`` 3 through 7

    Every tree-based model uses ``random_state=2022``. Each printed line is
    the label, the depth (when one is set) and the string from ``get_score``.

    Parameters
    ----------
    xtrain, ytrain : features and labels every candidate is fitted on.
    xtest, ytest : held-out features and labels passed on to ``get_score``.

    Returns
    -------
    None - the results are only printed.
    """
    depths = range(3, 8)

    def report(*label, estimator):
        # Fit one candidate and print its label followed by its score string.
        fitted = estimator.fit(xtrain, ytrain)
        print(*label, get_score(fitted, xtrain, xtest, ytrain, ytest))

    report('model1', estimator=LogisticRegression())

    report('model2', estimator=DecisionTreeClassifier(random_state=2022))
    for depth in depths:
        report('model2', depth,
               estimator=DecisionTreeClassifier(max_depth=depth, random_state=2022))

    report('model3', estimator=RandomForestClassifier(500, random_state=2022))
    for depth in depths:
        report('model3', depth,
               estimator=RandomForestClassifier(500, max_depth=depth, random_state=2022))
        

def get_score(model, xtrain, xtest, ytrain, ytest):
    """Summarise a fitted classifier as ``'<train score> <validation ROC AUC>'``.

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict_proba``.
    xtrain, ytrain : data passed to ``model.score``.
    xtest, ytest : held-out data; ``roc_auc_score`` is computed from ``ytest``
        and column index 1 of ``model.predict_proba(xtest)``.

    Returns
    -------
    str - the two numbers, each rendered with the ``.4`` format spec and
    separated by a single space.
    """
    train_score = model.score(xtrain, ytrain)

    # Column 1 is the second class in the model's class ordering
    # (presumably Exited == 1 for 0/1 labels).
    positive_proba = model.predict_proba(xtest)[:, 1]
    valid_auc = roc_auc_score(ytest, positive_proba)

    return ' '.join(format(value, '.4') for value in (train_score, valid_auc))

In [91]:
# Fit every candidate model on the training split and print its score line.
make_models(xtrain, xtest, ytrain, ytest)

model1 0.8135 0.7424
model2 1.0 0.6694
model2 3 0.8492 0.7768
model2 4 0.8541 0.8071
model2 5 0.8607 0.8076
model2 6 0.8716 0.8161
model2 7 0.8802 0.7891
model3 1.0 0.839
model3 3 0.8127 0.8086
model3 4 0.842 0.8187
model3 5 0.8451 0.8251
model3 6 0.8619 0.833
model3 7 0.8794 0.8381


In [92]:
# Chosen configuration: a 500-tree random forest limited to depth 7,
# fitted on the training split with the same seed used in the comparison.
final_model = RandomForestClassifier(n_estimators=500, max_depth=7, random_state=2022)
final_model.fit(xtrain, ytrain)

print('final model', get_score(final_model, xtrain, xtest, ytrain, ytest))

final model 0.8794 0.8381


In [112]:
# Predict the Exited label for every row of the test feature matrix.
# NOTE(review): the models above were compared on ROC AUC; if the submission is
# scored the same way, final_model.predict_proba(X_TEST)[:, 1] may be the value
# to submit instead of hard labels - confirm the required format.
y_pred = final_model.predict(X_TEST)

obj = {
    'CustomerId' : CustomerId,
    'Exited' : y_pred
}

# reset_index(..., inplace=True) returns None, which left `result` bound to None.
# Use the returned frame so `result` is the actual CustomerId/Exited table.
result = pd.DataFrame(obj).reset_index(drop=True)

In [113]:
# Display `result`, the CustomerId / Exited table assembled in the previous cell.
result