In [66]:
import numpy as np
import pandas as pd

In [67]:
# Load the training data; the path is relative to the notebook's working directory.
train_df = pd.read_csv('./data/train.csv')
# Preview the first rows (rendered as the cell output).
train_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [68]:
# Summarize column dtypes, non-null counts and memory usage of the raw frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


In [69]:
# Drop the 'id', 'CustomerId' and 'Surname' columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns are
# already gone.
train_df = train_df.drop(columns=['id', 'CustomerId', 'Surname'], errors='ignore')

# NOTE: despite its name, float_cols holds every numeric column (integer and
# float alike), including the 'Exited' target.
# Selecting by 'number' / ['object', 'category'] (rather than only the raw
# float64/int64/object dtypes) returns the same columns on a fresh frame and
# still returns them if this cell is re-run after the dtype conversion cell.
float_cols = train_df.select_dtypes(include='number').columns
object_cols = train_df.select_dtypes(include=['object', 'category']).columns

print(float_cols)
print(object_cols)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
Index(['Geography', 'Gender'], dtype='object')


In [70]:
# Downcast every column listed in float_cols to 32-bit floats and convert the
# columns listed in object_cols to pandas categoricals.
train_df[float_cols] = train_df.loc[:, float_cols].astype(np.float32)
train_df[object_cols] = train_df.loc[:, object_cols].astype('category')

In [71]:
# Re-check dtypes and memory usage after the dtype conversion.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   CreditScore      165034 non-null  float32 
 1   Geography        165034 non-null  category
 2   Gender           165034 non-null  category
 3   Age              165034 non-null  float32 
 4   Tenure           165034 non-null  float32 
 5   Balance          165034 non-null  float32 
 6   NumOfProducts    165034 non-null  float32 
 7   HasCrCard        165034 non-null  float32 
 8   IsActiveMember   165034 non-null  float32 
 9   EstimatedSalary  165034 non-null  float32 
 10  Exited           165034 non-null  float32 
dtypes: category(2), float32(9)
memory usage: 6.0 MB


### Model

In [73]:
import sacred
from sacred.observers import FileStorageObserver
from catboost import CatBoostClassifier, Pool

# Sacred experiment named 'config'.
# NOTE(review): interactive=True is presumably needed because the experiment is
# defined from notebook cells rather than a script — confirm against the Sacred
# docs for the installed version.
ex = sacred.Experiment('config', interactive=True)
# Attach a file-storage observer created with 'runs' (presumably the directory
# that receives the per-run records — confirm).
ex.observers.append(FileStorageObserver.create('runs'))

In [74]:
# Sacred configuration function: the locals assigned below are the experiment's
# config entries. Their names match the parameters of get_model(), which passes
# them on to CatBoostClassifier.
@ex.config
def cfg():
    iterations = 1000  # passed as CatBoostClassifier(iterations=...)
    learning_rate = 0.1  # passed as CatBoostClassifier(learning_rate=...)
    depth = 6  # passed as CatBoostClassifier(depth=...)
    loss_function = 'Logloss'  # passed as CatBoostClassifier(loss_function=...)
    verbose = True  # passed as CatBoostClassifier(verbose=...)

In [None]:
from sklearn.model_selection import train_test_split


@ex.capture
def split(test_size=0.2, random_state=42):
    """Split the global ``train_df`` into features/target and train/test parts.

    The function is registered with ``@ex.capture`` (the original had a bare
    ``ex.capture`` expression, which is a no-op), matching ``get_model``.

    Args:
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.2.
        random_state: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 42.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)`` where ``X`` is
        ``train_df`` without the 'Exited' column and ``y`` is the 'Exited'
        column.
    """
    X = train_df.drop(columns='Exited')
    y = train_df['Exited']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Shapes are printed so the split sizes are visible in the cell/run output.
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    return X, y, X_train, X_test, y_train, y_test

In [None]:
@ex.capture
def get_model(iterations, learning_rate, depth, loss_function, verbose):
    """Build a CatBoostClassifier from the given hyperparameters.

    Every argument is passed to ``CatBoostClassifier`` under the keyword of the
    same name. The classifier is only constructed here, not fitted.

    Returns:
        The newly constructed ``CatBoostClassifier`` instance.
    """
    hyperparams = {
        'iterations': iterations,
        'learning_rate': learning_rate,
        'depth': depth,
        'loss_function': loss_function,
        'verbose': verbose,
    }
    return CatBoostClassifier(**hyperparams)

In [77]:
@ex.main
def run():
    """Main command of the experiment; it currently only prints a greeting."""
    greeting = 'Hello world!'
    print(greeting)

In [78]:
# Execute the experiment. NOTE: the result is bound to `run`, which rebinds the
# name of the @ex.main function `run` defined above; after this cell `run` no
# longer refers to that function.
run = ex.run()

INFO - config - Running command 'run'
INFO - config - Started run with ID "7"
INFO - run - [INFO] Training model
INFO - config - Completed after 0:00:00


Hello world!
