In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


In [2]:
data = pd.read_csv('train.csv', index_col='PassengerId')
data = pd.DataFrame(data)

SOME FEATURE ENGINEERING

In [3]:

data['Deck']=data['Cabin'].apply(lambda x: str(x)[0])
data['Side']=data['Cabin'].apply(lambda x: str(x)[-1])
data['Flight'] = data.HomePlanet+' - '+data.Destination
data.Transported.replace([True, False],[1,0], inplace=True)
data.dropna(axis=0, subset=['Transported'], inplace=True)
X_full=data.drop(['Transported','Name', 'Cabin', 'Destination', 'HomePlanet'], axis=1)
y = data.Transported.copy()

MODEL TRAINING VIA PIPELINES

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2,random_state=1)

cat_cols = [col for col in X_full.columns if X_full[col].dtype == 'object']
num_cols = [col for col in X_full.columns if X_full[col].dtype in ['float64', 'int64']]

age_transformer = SimpleImputer(strategy='median')
num_transformer = SimpleImputer(strategy='median')

cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('ordinal', OrdinalEncoder(handle_unknown='error'))
                                         ])

preprocessor = ColumnTransformer(transformers=[('age', age_transformer, ['Age']),
                                               ('num', num_transformer, num_cols),
                                               ('cat', cat_transformer, cat_cols),
                                              ])

# Hyperparameters optimization

cv_res={}
for n in np.arange(100,160,10):
    for l in np.arange(0.04, 0.08, 0.01):
        for d in np.arange(6,8,1):
            
            model = XGBClassifier(n_estimators=n, learning_rate = l, max_depth=d,  random_state=1)

            clf = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)
                                 ])


            cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
            print(f'n={n}, l={l}, d={d} cross_val_score', cross_val_scores.mean())
            
            cv_res[f'n={n}, l={l}, d={d}']=cross_val_scores.mean()


n=100, l=0.04, d=6 cross_val_score 0.8006888062519071
n=100, l=0.04, d=7 cross_val_score 0.8022707125457075
n=100, l=0.05, d=6 cross_val_score 0.8029901370061392
n=100, l=0.05, d=7 cross_val_score 0.8004017605469901
n=100, l=0.060000000000000005, d=6 cross_val_score 0.8034212744829299
n=100, l=0.060000000000000005, d=7 cross_val_score 0.8008338289828238
n=100, l=0.07, d=6 cross_val_score 0.8038524119597206
n=100, l=0.07, d=7 cross_val_score 0.8016965176959797
n=110, l=0.04, d=6 cross_val_score 0.7999695886712629
n=110, l=0.04, d=7 cross_val_score 0.8014076100729769
n=110, l=0.05, d=6 cross_val_score 0.8025585857697738
n=110, l=0.05, d=7 cross_val_score 0.8009768863557609
n=110, l=0.060000000000000005, d=6 cross_val_score 0.8037089408272088
n=110, l=0.060000000000000005, d=7 cross_val_score 0.8018404025880661
n=110, l=0.07, d=6 cross_val_score 0.804427848088172
n=110, l=0.07, d=7 cross_val_score 0.8014085410320199
n=120, l=0.04, d=6 cross_val_score 0.8002572550155419
n=120, l=0.04, d=7 

In [5]:
print('best model for cross val', max(cv_res, key=cv_res.get), '\n Cross_val_Score', cv_res[max(cv_res, key=cv_res.get)])

best model for cross val n=130, l=0.060000000000000005, d=6 
 Cross_val_Score 0.8052909505609028


In [6]:
final_model = XGBClassifier(n_estimators=130, learning_rate = 0.06, max_depth=6,  random_state=1)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', final_model)
                                 ])

clf.fit(X_train, y_train)

preds = clf.predict(X_valid)

print(f'n={n}, l={l}, d={d} Score', accuracy_score(y_valid, preds))

cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print(f'n={n}, l={l}, d={d} cross_val_score', cross_val_scores.mean())

cv_res[f'n={n}, l={l}, d={d}']=cross_val_scores.mean()

n=150, l=0.07, d=7 Score 0.8039102932719954
n=150, l=0.07, d=7 cross_val_score 0.8052909505609028
