In [1]:
import pandas as pd
import numpy as np

# Directory holding the input CSVs; the trailing slash lets it be prefixed
# directly onto a file name.
path = "../data/"
train, test = (
    pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [2]:
from utils.models import CatBoostCV, LGBMCV
from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

In [3]:
# Pass each frame through the project helper and keep what it returns; the
# helper prints the per-column dtype changes shown in the cell output.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

Memory usage of properties dataframe is : 0.081695556640625  MB
******************************
Column:  PassengerId
dtype before:  int64
min for this col:  1
max for this col:  891
dtype after:  uint16
******************************
******************************
Column:  Survived
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
******************************
Column:  Pclass
dtype before:  int64
min for this col:  1
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  Age
dtype before:  float64
min for this col:  0.42
max for this col:  80.0
dtype after:  float32
******************************
******************************
Column:  SibSp
dtype before:  int64
min for this col:  0
max for this col:  8
dtype after:  uint8
******************************
******************************
Column:  Parch
dtype before:  int64
min for this col:  0
max for this col:  6
dtype 

In [4]:
# Normalise the column labels of both frames to lower case.
train.columns = list(map(str.lower, train.columns))
test.columns = list(map(str.lower, test.columns))

In [5]:
# Preview the first rows of the training frame after the column rename.
train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283302,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.099998,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Show the current (lower-cased) training column labels.
train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [7]:
# Columns removed from both the train and test frames before modelling.
drop_cols = [
    'name',
    'ticket',
    'cabin',
]

In [8]:
# Reassign instead of mutating in place, and ignore labels that are already
# gone, so this cell can be re-run without raising KeyError.
train = train.drop(columns=drop_cols, errors="ignore")
test = test.drop(columns=drop_cols, errors="ignore")

In [9]:
# Display the training frame after the unused columns were dropped.
train

Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,0,3,male,22.0,1,0,7.250000,S
1,2,1,1,female,38.0,1,0,71.283302,C
2,3,1,3,female,26.0,0,0,7.925000,S
3,4,1,1,female,35.0,1,0,53.099998,S
4,5,0,3,male,35.0,0,0,8.050000,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.000000,S
887,888,1,1,female,19.0,0,0,30.000000,S
888,889,0,3,female,,1,2,23.450001,S
889,890,1,1,male,26.0,0,0,30.000000,C


In [10]:
# Per-column missing-value summary (Total / Percent) for the training frame,
# produced by the project helper.
missing_data(train)

Unnamed: 0,Total,Percent
age,177,19.86532
embarked,2,0.224467
fare,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
survived,0,0.0
passengerid,0,0.0


In [11]:
# Per-column missing-value summary (Total / Percent) for the test frame,
# produced by the project helper.
missing_data(test)

Unnamed: 0,Total,Percent
age,86,20.574163
fare,1,0.239234
embarked,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
passengerid,0,0.0


In [12]:
# Author's note: FeaturesData is expected to receive no null values, so
# missing ages are blindly replaced with the sentinel -1.
train = train.assign(age=train['age'].fillna(-1))
test = test.assign(age=test['age'].fillna(-1))

In [13]:
# Replace missing fares with the sentinel -1 in both frames.
train = train.assign(fare=train['fare'].fillna(-1))
test = test.assign(fare=test['fare'].fillna(-1))

In [14]:
# Most frequent 'embarked' value in the training frame; computed once and
# used to fill the gaps in both frames.
embarked_mode = train['embarked'].mode()[0]
train['embarked'] = train['embarked'].fillna(embarked_mode)
test['embarked'] = test['embarked'].fillna(embarked_mode)

In [15]:
# Project helper from utils.eda; presumably returns the categorical and the
# numeric column names of `train` as two lists — verify against the helper.
cats, nums = get_cats_nums(train)

In [16]:
# The row identifier and the target must not be used as model inputs.
non_features = ['passengerid', 'survived']
nums = [col for col in nums if col not in non_features]

In [17]:
# Full feature list: categorical names first, then numeric names.
feats = [*cats, *nums]

In [18]:
### MODEL
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
# Keyword arguments forwarded to CatBoostCV via **params in a later cell.
params = {
    'eval_metric':'AUC',
   # 'task_type': 'GPU',  # uncomment to train on GPU
    'iterations': 1000,
    'learning_rate': 0.01,
    'early_stopping_rounds':50,
    'cat_features': cats,
   # 'one_hot_max_size': 5,  # NOTE(review): in CatBoost this one-hot encodes categoricals with at most this many distinct values (i.e. low cardinality) — confirm
    'bootstrap_type': 'Bayesian', # author's note: Bernoulli is a good alternative
    'depth': 6, # author's note: depth is useful in the range 4~12
    'l2_leaf_reg': 0.3,  # reg_lambda is an alias of l2_leaf_reg
   # 'random_strength': 1,  # NOTE(review): controls randomness added when scoring splits; the original "like gamma in XGBoost" analogy looks loose — confirm
 #   'subsample': 0.8  # NOTE(review): row-sampling rate (closer to LightGBM's bagging_fraction than feature_fraction); presumably not accepted with bootstrap_type='Bayesian' — confirm
}

In [19]:
# 3-fold stratified splitter with shuffling; the fixed random_state keeps the
# fold assignment reproducible across runs.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

In [20]:
# StratifiedKFold.split returns a one-shot generator. Materialise the
# (train_idx, valid_idx) pairs into a list so the folds survive more than one
# pass, e.g. when the fit cell is re-run without re-running this one.
skf_splitted = list(skf.split(train[feats], train['survived']))

In [21]:
# Cross-validated CatBoost wrapper (project class): receives the prepared
# folds, the categorical/numeric feature names, a seed, and the CatBoost
# settings from `params`.
cb_model = CatBoostCV(
    cv=skf_splitted,
    cats=cats,
    nums=nums,
    random_state=21,
    **params,
)

In [22]:
# Fit on the training frame with 'survived' as the target; the log below shows
# one training run per fold. The result of fit() is kept as the model handle.
cb_model = cb_model.fit(
    train,
    train['survived'],
    verbose_eval=1000,
    use_best_model=True,
)

0:	test: 0.7959448	best: 0.7959448 (0)	total: 56ms	remaining: 56s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8511647972
bestIteration = 11

Shrink model to first 12 iterations.
0:	test: 0.7823555	best: 0.7823555 (0)	total: 4.26ms	remaining: 4.26s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8797814208
bestIteration = 41

Shrink model to first 42 iterations.
0:	test: 0.7850877	best: 0.7850877 (0)	total: 3.29ms	remaining: 3.29s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8346515195
bestIteration = 35

Shrink model to first 36 iterations.


In [23]:
# Mean of the per-fold scores (the values match each fold's bestTest in the
# training log, i.e. the 'AUC' eval metric).
np.mean(cb_model.model_scores_)

0.8551992458377272

In [24]:
# Feature importances, one importance_<k> column per fold.
cb_model.feature_importances_

Unnamed: 0,feature_names,importance_0,importance_1,importance_2
0,pclass,25.967284,12.513074,19.600143
1,age,5.214174,3.741861,3.656392
2,sibsp,0.759679,3.017841,2.622744
3,parch,1.565418,4.13248,1.330674
4,fare,1.724192,8.113338,4.209917
5,sex,63.393671,66.664217,65.544897
6,embarked,1.375582,1.81719,3.035233


In [25]:
# Individual validation score of each fold.
cb_model.model_scores_

[0.8511647972389991, 0.8797814207650273, 0.8346515195091554]

In [26]:
# Peek at the sample submission to confirm the expected output columns.
# Reuses `path` from the first cell instead of repeating the data directory.
pd.read_csv(path + "gender_submission.csv")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [27]:
#test[cats] = test[cats].astype('category')

In [28]:
from catboost import Pool, FeaturesData

# Build the inference Pool from the test frame. Per the CatBoost docs,
# FeaturesData requires float32 numeric data and object-dtype categorical data.
# Cast explicitly so this does not depend on the dtypes reduce_mem_usage
# happened to leave behind.
test_set = Pool(
    data=FeaturesData(
        num_feature_data=test[nums].values.astype(np.float32),
        cat_feature_data=test[cats].values.astype(object),
        num_feature_names=nums,
        cat_feature_names=cats,
    )
)

In [29]:
# Store the model's predictions for the test Pool as the submission column.
# NOTE(review): confirm CatBoostCV.predict returns hard 0/1 labels rather than
# fold-averaged probabilities — the sample submission above uses integer labels.
test['Survived'] = cb_model.predict(test_set)

In [30]:
# Restore the identifier's original capitalisation for the submission file.
test = test.rename(columns={'passengerid': 'PassengerId'})

In [31]:
# Write the two submission columns to disk without the index.
submission = test[['PassengerId', 'Survived']]
submission.to_csv("catboost_basic.csv", index=False)