In [1]:
import re
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
class Model:

    def __init__(self):
        
        self.median_age_by_title = {}
        self.mode_cabin_by_pclass = {}
        self.embarked_mode = None
        self.fare_stand_scal = None
        
    def fit(self, df):
        
        df_temp = df.copy()
        
        # median age by title
        df_temp['Title'] = df_temp['Name'].str.split().apply(lambda x: self.get_title(x))
        for title in df_temp['Title'].unique():
            self.median_age_by_title[title] = df_temp[df_temp['Title'] == title]['Age'].median()
            
        # mode cabin_type by pclass
        df_temp['Cabin_type'] = df_temp['Cabin'].str.split().apply(lambda x: self.get_cabin_type(x))
        for pclass in df_temp['Pclass'].unique():
            self.mode_cabin_by_pclass[pclass] = df_temp[df_temp['Pclass'] == pclass]['Cabin_type'].mode()[0]
        
        self.embarked_mode = df_temp['Embarked'].mode()[0]
        
        # fit scaler
        self.fare_stand_scal = StandardScaler()
        self.fare_stand_scal.fit(df_temp[['Fare']].to_numpy())
        
        del df_temp
        
    def transform(self, df):

        # titles mr|ms
        df['Title'] = df['Name'].str.split().apply(lambda x: self.get_title(x))
        for title in df['Title'].unique():
            if self.median_age_by_title[title]:
                df.loc[(df['Title'] == title) & (df['Age'].isna()), 'Age'] = self.median_age_by_title[title]
        df.loc[(df['Title'].isna()) & (df['Sex'] == 'male'), 'Title'] = 'mr'
        df.loc[(df['Title'].isna()) & (df['Sex'] == 'female'), 'Title'] = 'ms'
        
        df['Cabin_type'] = df['Cabin'].str.split().apply(lambda x: self.get_cabin_type(x))
        for pclass in df['Pclass'].unique():
            if self.mode_cabin_by_pclass[pclass]:
                df.loc[(df['Pclass'] == pclass) & (df['Cabin_type'].isna()), 'Cabin_type'] = self.mode_cabin_by_pclass[pclass]
        
        df['Ticket_type'] = df['Ticket'].str.split().apply(lambda x: re.sub('[^\w\s]','',x[0]).lower() if len(x) > 1 else 'None')
        
        df['Family_size'] = df['SibSp'] + df['Parch'] + 1
        df['Family_size'] = df['Family_size'].apply(lambda x: self.get_family_size(x))
        
        df['Age_group'] = df['Age'].apply(lambda x: self.get_age_group(x))
        df['Fare'] = self.fare_stand_scal.transform(df[['Fare']].to_numpy())
        
        df['Cabin'].fillna('None', inplace = True)
        df['Fare'].fillna(0, inplace = True)
        df['Embarked'].fillna(self.embarked_mode, inplace = True)
        
        df.drop(['Name', 'Cabin', 'Ticket', 'Sex', 'Age', 'Parch'], axis=1, inplace=True)
        
        return df

    @staticmethod
    def get_age_group(age):
        if (age < 1): 
            return 'infant'
        elif (age >= 1 and age <= 11):
            return 'child'
        elif (age >= 12 and age <= 17):
            return 'teen'
        elif (age >= 18 and age <= 65):
            return 'adult'
        else:
            return 'pensioner'
    
    @staticmethod
    def get_family_size(size):
        if (size == 1): 
            return 'alone'
        elif (size <= 4):
            return 'medium'
        else:
            return 'large'
    
    @staticmethod
    def get_cabin_type(cabin):
        return cabin[-1][0].strip().lower() if cabin is not np.nan else cabin
    
    @staticmethod
    def get_title(name):
        correct_titles = ['Mr.', 'Ms.']
        title = None
        for i in name:
            if i in correct_titles:
                title = i
                break
            elif i in ['Sir.', 'Don.', 'Master.', 'Dr.', 'Rev.', 'Major.', 'Col.', 'Capt.', 'Jonkheer.']:
                title = 'Mr.'
                break
            elif i in ['Lady.', 'Dona.', 'Countess.', 'Mlle.', 'Mme.']:
                title = 'Ms.'
                break
        if title is None:
            return np.nan
        else:
            return title.lower()[0:-1]

In [3]:
df_train = pd.read_csv('../data/train.csv')
df_final = pd.read_csv('../data/test.csv')
df_train_test = pd.concat([df_train.drop('Survived', axis=1), df_final], axis=0)

model = Model()
model.fit(df_train_test)

df_train = model.transform(df_train)
df_final = model.transform(df_final)

target = 'Survived'
features_all = df_train.drop([target, 'PassengerId'], axis=1).columns.tolist()
features_cat = df_train.select_dtypes(include=['object']).columns.tolist()

In [4]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Fare,Embarked,Title,Cabin_type,Ticket_type,Family_size,Age_group
0,1,0,3,1,-0.503402,S,mr,g,a5,medium,adult
1,2,1,1,1,0.734222,C,ms,c,pc,medium,adult
2,3,1,3,0,-0.490356,S,ms,g,stono2,alone,adult
3,4,1,1,1,0.382778,S,ms,c,,medium,adult
4,5,0,3,0,-0.48794,S,mr,g,,alone,adult


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   SibSp        891 non-null    int64  
 4   Fare         891 non-null    float64
 5   Embarked     891 non-null    object 
 6   Title        891 non-null    object 
 7   Cabin_type   891 non-null    object 
 8   Ticket_type  891 non-null    object 
 9   Family_size  891 non-null    object 
 10  Age_group    891 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 76.7+ KB


In [6]:
X = df_train[features_all]
y = df_train[target]

train_pool = Pool(
    X,
    y,
    cat_features=features_cat
)

In [7]:
cls_model = CatBoostClassifier(
    silent=True,
    eval_metric='Accuracy',
    auto_class_weights='Balanced',
    early_stopping_rounds=50,
)

params_grid = {
    'depth': [2, 4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.1, 0.3, 0.5],
    'iterations': [800, 1000, 1050, 1100, 1200],
    'l2_leaf_reg': [2, 2.5, 3, 3.5, 4],
    'bagging_temperature': [1, 1.5, 2, 2.5],
}

gs = cls_model.randomized_search(
    params_grid,
    train_pool,
    shuffle=True,
    n_iter=20,
    stratified=True,
    partition_random_seed=42,
    cv=StratifiedKFold(
        n_splits=10,
        shuffle=True,
        random_state=42,
    ),
    calc_cv_statistics=True,
    refit=True,
)

Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7771331903
bestIteration = 0

0:	loss: 0.7771332	best: 0.7771332 (0)	total: 121ms	remaining: 2.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8297075184
bestIteration = 131

1:	loss: 0.8297075	best: 0.8297075 (1)	total: 325ms	remaining: 2.93s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8490452025
bestIteration = 6

2:	loss: 0.8490452	best: 0.8490452 (2)	total: 692ms	remaining: 3.92s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8462654106
bestIteration = 18

3:	loss: 0.8462654	best: 0.8490452 (2)	total: 1.14s	remaining: 4.58s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8462654106
bestIteration = 5

4:	loss: 0.8462654	best: 0.8490452 (2)	total: 1.78s	remaining: 5.34s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8693497705
bestIteration = 41

5:	loss: 0.8693498	best: 0.8693498 (5)	total: 1.89s	remaining: 4

In [8]:
feature_importances = pd.DataFrame(
    zip(X.columns, cls_model.get_feature_importance()),
    columns=['feature_name', 'importance']
)

feature_importances.sort_values(by='importance', ascending=False, inplace=True)
feature_importances.head(20)

Unnamed: 0,feature_name,importance
2,Fare,27.338385
4,Title,22.101522
5,Cabin_type,11.498611
7,Family_size,8.924772
8,Age_group,7.819569
6,Ticket_type,7.701429
0,Pclass,6.831192
3,Embarked,6.1793
1,SibSp,1.60522


In [9]:
preds_final = pd.DataFrame()
preds_final['PassengerId'] = df_final['PassengerId']
preds_final['Survived'] = cls_model.predict(df_final[features_all])

preds_final.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [10]:
preds_final.to_csv('./predictions.csv', index=False, encoding='utf-8', sep=',')