In [11]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool, cv

In [13]:
# Load the two splits; both files are expected in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Row counts are kept so the combined frame can be split back by position.
train_size = train.shape[0]
test_size = test.shape[0]

# Stack train on top of test so preprocessing is applied to both at once.
# ignore_index=True gives every row a unique label; without it the labels of
# `test` repeat those of `train`, which makes label-aligned operations fragile.
# sort=True orders the columns alphabetically.
data = pd.concat([train, test], sort=True, ignore_index=True)

In [14]:
# Take the run of letters immediately before the first '.' in each name
# (e.g. "Mr" in "Braund, Mr. Owen Harris"). The pattern is a raw string so
# that '\.' reaches the regex engine without an invalid-escape warning.
data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [15]:
# Fill missing ages with the mean age of passengers sharing the same Title.
# transform('mean') yields one value per row, so the fill is vectorised
# instead of looking the mean up row by row in Python.
# np.where combines the columns by position, so it does not depend on the
# frame having unique index labels.
data['Age'] = np.where(
    data['Age'].isna(),
    data.groupby('Title')['Age'].transform('mean'),
    data['Age'],
)

In [16]:
# Overwrite the Fare of passenger 1044 with a fixed value.
# NOTE(review): presumably this row has no Fare recorded and 14.43 is a
# hand-picked replacement - confirm where the number comes from.
fare_patch_rows = data['PassengerId'] == 1044
data.loc[fare_patch_rows, 'Fare'] = 14.43

In [17]:
# Replace missing values in the two text columns with fixed placeholders:
# 'S' for Embarked and the literal 'Undefined' for Cabin.
# NOTE(review): 'S' is presumably the most frequent port - confirm.
data = data.assign(
    Embarked=data['Embarked'].fillna('S'),
    Cabin=data['Cabin'].fillna('Undefined'),
)

In [18]:
# Feature columns passed to the model, in the order the model will see them.
cols = [
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked'
]
# Subset of `cols` that is declared as categorical to CatBoost.
categorical_cols = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# The first `train_size` rows of `data` are the training split and the rest
# are the test split; .iloc makes the positional slicing explicit.
train_rows = data.iloc[:train_size]
X_train = train_rows[cols]
Y_train = train_rows['Survived'].astype(int)
X_test = data.iloc[train_size:][cols]

# Derive the positions from the names so they stay correct if `cols` is
# reordered or extended (evaluates to [0, 1, 2, 6, 8, 9] for the list above).
categorical_features_indices = [cols.index(name) for name in categorical_cols]
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Undefined,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Undefined,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Undefined,S


In [19]:
# Bundle training features, labels and the categorical column positions into
# a single CatBoost Pool that is reused for cross-validation and fitting.
train_pool = Pool(
    data=X_train,
    label=Y_train,
    cat_features=categorical_features_indices,
)

In [21]:
# Small, shallow model with a fixed seed; logging and file output are disabled.
model = CatBoostClassifier(
    learning_rate=0.1,
    depth=3,
    iterations=300,
    random_seed=42,
    logging_level='Silent',
    allow_writing_files=False
)

# 5-fold cross-validation with the same hyper-parameters as the model.
# get_params() only returns explicitly set parameters, so the loss is named
# here for cv(); setdefault keeps any loss that was set on the model.
# NOTE(review): 'Logloss' is assumed to be the loss CatBoostClassifier uses
# for this binary target - confirm against the CatBoost documentation.
cv_params = model.get_params()
cv_params.setdefault('loss_function', 'Logloss')
cv_data = cv(
    train_pool,
    cv_params,
    fold_count=5
)

# Fit on the full training pool (trailing ';' hides the model repr).
model.fit(train_pool);
# This score is computed on the rows the model was just fitted on, so it is a
# training score; use `cv_data` for held-out metrics.
model.score(X_train, Y_train)

0.9225589225589226

In [22]:
# Predict labels for the test rows (everything after the training rows).
test_rows = data.iloc[train_size:]
Y_pred = model.predict(X_test)

# Pair each test passenger id with its integer prediction and write the
# result to disk without the index column.
submission = pd.DataFrame({
    "PassengerId": test_rows["PassengerId"],
    "Survived": Y_pred.astype(int),
})
submission.to_csv('submission.csv', index=False)