In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from category_encoders import TargetEncoder, one_hot
from catboost import *
%matplotlib inline

In [3]:
# Read the training set from the working directory and preview its first rows.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X13,X14,X15,X16,X17,X18,X19,X20,X21,target
0,0,2,908749,954,480270,935,71,79,53,89,...,51,100,87,3,61,48,43,79,329,0
1,1,1,438444,2162,486685,2154,32,39,48,49,...,39,83,91,6,57,94,15,89,581,1
2,2,1,596915,2066,711059,2081,43,21,58,94,...,7,52,37,50,69,60,89,10,548,0
3,3,1,625198,1508,66810,1474,7,1,34,73,...,39,27,55,97,83,1,88,87,555,1
4,4,2,228654,1202,542816,1196,53,35,16,80,...,35,95,50,55,10,49,12,68,651,0


# Data Preparation

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
data.describe()

Unnamed: 0,id,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X13,X14,X15,X16,X17,X18,X19,X20,X21,target
count,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,...,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0,3633005.0
mean,1816502.0,1.650469,563803.5,1449.992,563757.4,1450.097,50.16512,50.16514,50.18091,50.15996,...,50.17506,50.17908,50.18681,50.1889,50.14741,50.18218,50.16238,50.15347,491.3625,0.3800829
std,1048758.0,0.7179981,324881.4,484.8098,324785.9,484.8208,29.64439,29.66132,29.63772,29.64519,...,29.65711,29.65469,29.64957,29.65893,29.65769,29.6573,29.63982,29.66294,146.4528,0.485407
min,0.0,1.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0
25%,908251.0,1.0,282992.0,1074.0,282937.0,1074.0,24.0,24.0,24.0,24.0,...,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,384.0,0.0
50%,1816502.0,2.0,563746.0,1370.0,563918.0,1371.0,51.0,51.0,51.0,51.0,...,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,535.0,0.0
75%,2724753.0,2.0,845418.0,1747.0,845459.0,1747.0,79.0,79.0,79.0,79.0,...,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,615.0,1.0
max,3633004.0,8.0,1126077.0,4186.0,1126076.0,4192.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,680.0,1.0


In [5]:
# Feature matrix: everything except the label and the row identifier.
X = data.drop(columns=['target', 'id'])
# Label vector.
y = data['target']

In [6]:
# Positional indices of the columns handed to CatBoost as categorical:
# every column of X except the last one.
# NOTE(review): the last column (X21) is left out and is therefore treated as
# numeric — confirm this is intentional rather than an off-by-one.
cat_features = [*range(X.shape[1] - 1)]
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [7]:
# Report the distinct labels and the class balance.
# Uses the vectorised pandas reductions instead of the Python builtins
# `set(y)` / `sum(y)`, which walk the whole Series element by element
# (and `sum(y)` was evaluated twice). `.tolist()` / `int()` keep the printed
# values as plain Python ints so the output text is unchanged.
labels = set(y.unique().tolist())
one_count = int(y.sum())
zero_count = len(y) - one_count

print('Labels: {}'.format(labels))
print('Zero count = {}, One count = {}'.format(zero_count, one_count))

Labels: {0, 1}
Zero count = 2252162, One count = 1380843


In [8]:
# Hold out 20% of the rows for validation; shuffled with a fixed seed so the
# split is reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Model training

In [10]:
from catboost import CatBoostClassifier

# Presumably clears a snapshot left by an earlier run so that training starts
# from scratch instead of resuming — uncomment if needed (TODO confirm path).
# !rm 'catboost_info/snapshot.bkp'

# All hyperparameters in one place so they are easy to read and tweak.
catboost_params = dict(
    # iteration budget, step size and logging
    iterations=1000,
    learning_rate=0.3,
    verbose=10,
    random_seed=42,
    # snapshotting of training progress
    save_snapshot=True,
    snapshot_file='snapshot.bkp',
    snapshot_interval=1,
    # regularisation and sampling
    l2_leaf_reg=5.0,
    bootstrap_type='Bernoulli',
    subsample=0.5,
    rsm=0.5,
    random_strength=2,
    leaf_estimation_iterations=5,
    # categorical handling
    one_hot_max_size=5,
    # early stopping / best-iteration selection on the eval set
    early_stopping_rounds=20,
    use_best_model=True,
    best_model_min_trees=30,
)

model = CatBoostClassifier(**catboost_params)

# Fit on the training split and monitor the validation split; the call is the
# last expression so the fitted estimator is echoed as the cell output.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    plot=False,
)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.650908725
bestIteration = 224

Shrink model to first 225 iterations.


<catboost.core.CatBoostClassifier at 0x1e555bced00>

In [11]:
# Number of trees kept in the fitted model.
model.tree_count_

225

# Feature Importance

In [12]:
# Per-feature importances of the fitted model, returned as a table
# (prettified=True) of feature ids and importance values.
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,X1,30.616309
1,X3,29.53941
2,X2,6.444459
3,X4,2.453371
4,X18,2.429361
5,X6,2.065132
6,X9,1.984663
7,X12,1.947628
8,X5,1.930384
9,X8,1.904168


# Hyperparameter tuning

# Saving Model

# Calculate predictions for the contest

In [13]:
# Read the contest test set from the working directory and preview its first rows.
data_test = pd.read_csv('test.csv')
data_test.head()

Unnamed: 0,id,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21
0,0,1,1032217,1187,1011523,1172,57,100,1,80,...,79,45,18,35,10,80,81,16,0,547
1,1,2,1059033,2128,505263,2104,82,27,87,79,...,74,96,55,54,48,3,92,5,51,656
2,2,1,207787,1017,822639,1040,82,48,43,62,...,16,64,35,70,99,60,30,37,45,437
3,3,1,1095582,1449,32841,1424,71,43,11,98,...,1,7,41,94,26,52,45,81,1,589
4,4,2,963764,1202,489871,1182,11,62,16,55,...,26,60,37,7,1,80,48,50,52,340


In [14]:
# Keep only the feature columns, selected by label slice from X0 through X21;
# this leaves out the id column.
X_test = data_test.loc[:, 'X0':'X21']
X_test.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21
0,1,1032217,1187,1011523,1172,57,100,1,80,25,...,79,45,18,35,10,80,81,16,0,547
1,2,1059033,2128,505263,2104,82,27,87,79,80,...,74,96,55,54,48,3,92,5,51,656
2,1,207787,1017,822639,1040,82,48,43,62,18,...,16,64,35,70,99,60,30,37,45,437
3,1,1095582,1449,32841,1424,71,43,11,98,80,...,1,7,41,94,26,52,45,81,1,589
4,2,963764,1202,489871,1182,11,62,16,55,88,...,26,60,37,7,1,80,48,50,52,340


In [15]:
# Score the test rows and keep only column index 1 of the probability matrix,
# i.e. the score for the second class (label 1 here).
y_test = model.predict_proba(X_test)[:, 1]

# Prepare the submission

In [17]:
# Build the submission table: one row per test sample with its id and the
# predicted score.
# The ids are taken from the test file's own `id` column rather than from row
# positions, so each prediction stays paired with the right id even if the
# test ids are not exactly 0..n-1. Building the frame in a single expression
# (no in-place reset_index/rename) also makes the cell safe to re-run.
ans = pd.DataFrame({
    'id': data_test['id'].to_numpy(),
    'target': y_test,
})
ans.head()

Unnamed: 0,id,target
0,0,0.395078
1,1,0.364833
2,2,0.318545
3,3,0.375087
4,4,0.318014


In [18]:
# Write the submission file to the working directory without the DataFrame index.
ans.to_csv('submission.csv',index=False)