In [52]:
import numpy as np
import pandas as pd

In [53]:
train_df = pd.read_csv('./data/train.csv')
train_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [54]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


In [55]:
train_df.drop(['id', 'CustomerId', 'Surname'], axis = 1, inplace = True)

float_cols = train_df.select_dtypes(include = ['float64', 'int64']).columns
object_cols = train_df.select_dtypes(include = ['object']).columns

print(float_cols)
print(object_cols)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
Index(['Geography', 'Gender'], dtype='object')


In [56]:
train_df[float_cols] = train_df[float_cols].astype('float32')
train_df[object_cols] = train_df[object_cols].astype('category')

In [57]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   CreditScore      165034 non-null  float32 
 1   Geography        165034 non-null  category
 2   Gender           165034 non-null  category
 3   Age              165034 non-null  float32 
 4   Tenure           165034 non-null  float32 
 5   Balance          165034 non-null  float32 
 6   NumOfProducts    165034 non-null  float32 
 7   HasCrCard        165034 non-null  float32 
 8   IsActiveMember   165034 non-null  float32 
 9   EstimatedSalary  165034 non-null  float32 
 10  Exited           165034 non-null  float32 
dtypes: category(2), float32(9)
memory usage: 6.0 MB


### Model

In [58]:
import sacred
from sacred.observers import FileStorageObserver
from catboost import CatBoostClassifier, Pool

ex = sacred.Experiment('config', interactive=True)
ex.observers.append(FileStorageObserver.create('runs'))

In [59]:
@ex.config
def cfg():
    iterations = 1000
    learning_rate = 0.1
    depth = 6
    loss_function = 'Logloss'
    verbose = True

In [60]:
from sklearn.model_selection import train_test_split


In [61]:
@ex.capture
def get_model(iterations, learning_rate, depth, loss_function, verbose):
    clf = CatBoostClassifier(iterations = iterations,
                             learning_rate = learning_rate,
                             depth = depth,
                             loss_function = loss_function,
                             verbose = verbose)
    return clf

In [62]:
@ex.main
def run():
    X = train_df.drop('Exited', axis = 1)
    y = train_df['Exited']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    
    train_pool = Pool(X_train, y_train, cat_features=list(object_cols))
    
    model = get_model()
    model.fit(train_pool)
    
    return model

In [63]:
run = ex.run()
model = run.result

INFO - config - Running command 'run'
INFO - config - Started run with ID "6"


(132027, 10) (33007, 10) (132027,) (33007,)
0:	learn: 0.6053044	total: 64.7ms	remaining: 1m 4s
1:	learn: 0.5408520	total: 129ms	remaining: 1m 4s
2:	learn: 0.4932528	total: 195ms	remaining: 1m 4s
3:	learn: 0.4538326	total: 259ms	remaining: 1m 4s
4:	learn: 0.4253255	total: 318ms	remaining: 1m 3s
5:	learn: 0.4038672	total: 372ms	remaining: 1m 1s
6:	learn: 0.3880723	total: 432ms	remaining: 1m 1s
7:	learn: 0.3757558	total: 498ms	remaining: 1m 1s
8:	learn: 0.3663654	total: 559ms	remaining: 1m 1s
9:	learn: 0.3588934	total: 619ms	remaining: 1m 1s
10:	learn: 0.3530627	total: 682ms	remaining: 1m 1s
11:	learn: 0.3484368	total: 752ms	remaining: 1m 1s
12:	learn: 0.3438778	total: 817ms	remaining: 1m 2s
13:	learn: 0.3405457	total: 881ms	remaining: 1m 2s
14:	learn: 0.3378850	total: 943ms	remaining: 1m 1s
15:	learn: 0.3353638	total: 1.01s	remaining: 1m 2s
16:	learn: 0.3338604	total: 1.07s	remaining: 1m 1s
17:	learn: 0.3322397	total: 1.14s	remaining: 1m 1s
18:	learn: 0.3310814	total: 1.21s	remaining: 1m

INFO - config - Result: <catboost.core.CatBoostClassifier object at 0x0000024FD1713BD0>
INFO - config - Completed after 0:01:10


In [65]:
test_df = pd.read_csv('./data/test.csv')
test_df.drop(['id', 'CustomerId', 'Surname'], axis = 1, inplace = True)

In [68]:
float_cols = float_cols.drop('Exited')
test_df[float_cols] = test_df[float_cols].astype('float32')
test_df[object_cols] = test_df[object_cols].astype('category')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110023 entries, 0 to 110022
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   CreditScore      110023 non-null  float32 
 1   Geography        110023 non-null  category
 2   Gender           110023 non-null  category
 3   Age              110023 non-null  float32 
 4   Tenure           110023 non-null  float32 
 5   Balance          110023 non-null  float32 
 6   NumOfProducts    110023 non-null  float32 
 7   HasCrCard        110023 non-null  float32 
 8   IsActiveMember   110023 non-null  float32 
 9   EstimatedSalary  110023 non-null  float32 
dtypes: category(2), float32(8)
memory usage: 3.6 MB


In [72]:
pred = model.predict_proba(test_df)[:, 1]
pred

array([0.0274501 , 0.82998775, 0.02129126, ..., 0.02211826, 0.14210517,
       0.19504515])

In [73]:
print(pred.shape)

(110023,)


### Submission

In [74]:
sub_df = pd.read_csv('./data/sample_submission.csv')
sub_df['Exited'] = pred
sub_df.head()

Unnamed: 0,id,Exited
0,165034,0.02745
1,165035,0.829988
2,165036,0.021291
3,165037,0.213794
4,165038,0.352986


In [76]:
sub_df.to_csv('./data/output_main.csv', index = False)