In [1]:
import pandas as pd
import numpy as np

# Directory holding the Titanic CSV files, relative to this notebook.
# Kept as a plain string with a trailing slash because later cells build
# file names by concatenating onto it.
path = "../data/titanic/"
# Load the training and test splits in one pass.
train, test = (pd.read_csv(path + filename) for filename in ("train.csv", "test.csv"))

In [2]:
from utils.models import CatBoostCV, LGBMCV
from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

In [3]:
# Run the project memory-reduction helper on both frames (train first, then
# test); per the log it prints below, it reports each column's dtype before
# and after.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

Memory usage of properties dataframe is : 0.081695556640625  MB
******************************
Column:  PassengerId
dtype before:  int64
min for this col:  1
max for this col:  891
dtype after:  uint16
******************************
******************************
Column:  Survived
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
******************************
Column:  Pclass
dtype before:  int64
min for this col:  1
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  Age
dtype before:  float64
min for this col:  0.42
max for this col:  80.0
dtype after:  float32
******************************
******************************
Column:  SibSp
dtype before:  int64
min for this col:  0
max for this col:  8
dtype after:  uint8
******************************
******************************
Column:  Parch
dtype before:  int64
min for this col:  0
max for this col:  6
dtype 

In [4]:
# Normalise column names to lower case on both frames so later cells can
# refer to them uniformly (e.g. 'survived', 'passengerid').
for frame in (train, test):
    frame.columns = [col.lower() for col in frame.columns]

In [5]:
# Preview the first rows to confirm the lower-cased column names.
train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283302,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.099998,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# List the available columns before deciding which ones to drop.
train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [7]:
# Columns removed from both train and test before modelling.
drop_cols = [
    'name',
    'ticket',
    'cabin',
]

In [8]:
# Remove the unused columns from both frames.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
train = train.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')

In [9]:
# Display the frame after the column drop (the rendered view is truncated).
train

Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,0,3,male,22.0,1,0,7.250000,S
1,2,1,1,female,38.0,1,0,71.283302,C
2,3,1,3,female,26.0,0,0,7.925000,S
3,4,1,1,female,35.0,1,0,53.099998,S
4,5,0,3,male,35.0,0,0,8.050000,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.000000,S
887,888,1,1,female,19.0,0,0,30.000000,S
888,889,0,3,female,,1,2,23.450001,S
889,890,1,1,male,26.0,0,0,30.000000,C


In [10]:
# Per-column missing-value summary (Total / Percent) for the training set.
missing_data(train)

Unnamed: 0,Total,Percent
age,177,19.86532
embarked,2,0.224467
fare,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
survived,0,0.0
passengerid,0,0.0


In [11]:
# Per-column missing-value summary (Total / Percent) for the test set.
missing_data(test)

Unnamed: 0,Total,Percent
age,86,20.574163
fare,1,0.239234
embarked,0,0.0
parch,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
passengerid,0,0.0


In [12]:
# Split the column names into two lists using the project helper.
# NOTE(review): presumably the split is by dtype (object vs numeric) —
# confirm against utils.eda.get_cats_nums.
cats, nums = get_cats_nums(train)

In [13]:
# Inspect the columns classified as categorical.
cats

['sex', 'embarked']

In [14]:
# Inspect the columns classified as numeric (still includes the id and target).
nums

['passengerid', 'survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']

In [15]:
# Model inputs: every categorical column, followed by the numeric columns
# minus the row identifier and the target.
non_feature_cols = ['passengerid', 'survived']
feats = cats + [col for col in nums if col not in non_feature_cols]

In [16]:
# Final feature list passed to the model.
feats

['sex', 'embarked', 'pclass', 'age', 'sibsp', 'parch', 'fare']

In [17]:
### MODEL
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
# Hyper-parameters forwarded as keyword arguments to LGBMCV.
# An explicit 'eta' (0.001) entry is not set.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_error',
    bagging_fraction=0.9,
    bagging_freq=5,
    random_state=13,
    num_leaves=40,
    max_depth=-1,  # default
    n_jobs=-1,  # all cores
    feature_fraction=0.8,
    reg_alpha=0.3,
    reg_lambda=0.1,
    # NOTE(review): the fit output warns "Please use categorical_feature
    # argument of the Dataset constructor to pass this parameter", and the
    # same list is also passed to fit(); this entry looks redundant —
    # confirm LGBMCV does not read it before removing.
    categorical_feature=cats,
)

In [18]:
# Check the class balance of the target before choosing a CV scheme.
train.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [19]:
# 3-fold stratified CV, shuffled with a fixed seed so the folds are
# reproducible; the seed matches 'random_state' in the model params.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

In [20]:
# Materialise the folds as a list of (train_idx, valid_idx) pairs.
# skf.split() returns a one-shot generator: once a model has iterated over it,
# any later fit (re-running the fit cell, or a second CV model reusing the
# same folds) would silently receive zero folds. A list can be iterated
# repeatedly and yields the same splits.
skf_splitted = list(skf.split(train[feats], train['survived']))
# Cast the categorical features to pandas 'category' dtype for the model.
train[cats] = train[cats].astype('category')

In [21]:
# Build the cross-validated LightGBM wrapper from the precomputed folds and
# the hyper-parameters defined above.
lgb_model = LGBMCV(cv=skf_splitted, **params)

In [22]:
# Train across the CV folds: at most 1000 boosting rounds, stopping early
# after 100 rounds without validation improvement, logging every 100 rounds.
lgb_model.fit(
    train[feats],
    train.survived,
    num_boost_round=1000,
    early_stopping_rounds=100,
    verbose_eval=100,
    categorical_feature=cats,
)

New categorical_feature is ['embarked', 'sex']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 100 rounds.
[100]	train's binary_error: 0.0808081	eval's binary_error: 0.178451
Early stopping, best iteration is:
[25]	train's binary_error: 0.127946	eval's binary_error: 0.154882
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_error: 0.0959596	eval's binary_error: 0.154882
Early stopping, best iteration is:
[22]	train's binary_error: 0.13468	eval's binary_error: 0.13468
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_error: 0.0707071	eval's binary_error: 0.181818
Early stopping, best iteration is:
[29]	train's binary_error: 0.109428	eval's binary_error: 0.175084


<utils.models.LGBMCV at 0x7f16f6ee5c50>

In [23]:
# Mean of the per-fold scores; the values match the validation binary_error
# at each fold's best iteration in the fit log, so lower is better.
np.mean(lgb_model.model_scores_)

0.15488215488215487

In [24]:
# Per-fold feature importances (one gain_* and one split_* column per fold).
lgb_model.feature_importances_

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2
sex,850.509148,19,750.085623,16,904.323574,22
embarked,52.705821,17,30.150036,21,31.280233,16
pclass,271.508001,25,184.091776,21,299.71198,32
age,273.232822,131,283.095307,121,328.744187,154
sibsp,36.54688,15,46.171503,16,52.60841,19
parch,9.324376,6,14.69963,6,9.08042,6
fare,305.936251,150,259.895895,130,283.553108,175


In [25]:
# Individual fold scores, to see the spread behind the mean above.
lgb_model.model_scores_

[0.15488215488215487, 0.13468013468013468, 0.1750841750841751]

In [26]:
# Look at the sample submission to see the expected output columns
# (PassengerId, Survived).
pd.read_csv(path+"gender_submission.csv")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [27]:
# Cast the test categoricals using the *training* dtypes rather than
# inferring categories from the test data alone. Independent inference gives
# different category codes whenever test is missing (or adds) a level, which
# would feed mis-encoded values to the model. Levels unseen in train become
# NaN. Requires train[cats] to already be 'category' dtype (done before
# fitting).
for col in cats:
    test[col] = test[col].astype(train[col].dtype)

In [28]:
# Store predictions under the capitalised name used by the submission format.
# NOTE(review): with objective 'binary' the underlying model may produce
# probabilities; confirm LGBMCV.predict returns 0/1 labels, as the sample
# submission contains integer classes.
test['Survived'] = lgb_model.predict(test[feats])

In [29]:
# Restore the original capitalisation of the id column for the submission file.
submission_column_names = {'passengerid': 'PassengerId'}
test.rename(columns=submission_column_names, inplace=True)

In [30]:
# Output directory for submission files, relative to the notebook's working
# directory; the trailing slash is required because the file name is appended
# by string concatenation.
submission_path = "submission/titanic_submission/"

In [31]:
# Write the two submission columns without the index.
# The output directory is created first: to_csv does not create missing
# parent directories, so the write would fail on a fresh checkout.
import os

os.makedirs(submission_path, exist_ok=True)
test[['PassengerId', 'Survived']].to_csv(submission_path+"lightgbm_basic.csv", index=False)