In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Load the three competition CSV files into DataFrames.
train_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/titanic/train.csv')
test_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/titanic/test.csv')
gender_submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/titanic/gender_submission.csv')

In [None]:
# Discard rows whose target is missing, then split the target column off
# so that `train_data` holds features only and `y` holds the labels.
train_data = train_data.dropna(axis=0, subset=['Survived'])
y = train_data['Survived']
train_data = train_data.drop(columns=['Survived'])

In [None]:
# Categorical features: object-typed columns with fewer than 10 distinct
# values (keeps the one-hot encoding small).
categorical_columns = [
    name for name, column_dtype in train_data.dtypes.items()
    if column_dtype == 'object' and train_data[name].nunique() < 10
]

# Numerical features: every int64 / float64 column.
numerical_columns = [
    name for name, column_dtype in train_data.dtypes.items()
    if column_dtype in ['int64', 'float64']
]

# Restrict both datasets to the selected columns; copies keep the loaded
# frames untouched.
# NOTE(review): PassengerId is among the selected columns (it is read back
# from X_test_full when the submission is built), so it is also handed to
# the model as a feature -- confirm this is intended.
my_cols = categorical_columns + numerical_columns

X_train_full = train_data[my_cols].copy()
X_test_full = test_data[my_cols].copy()

In [None]:
import xgboost as xgb 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Numerical features: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros
# instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer.
# sparse_threshold=0 forces a dense array: with the default threshold (0.3)
# the stacked output becomes a SciPy sparse matrix whenever its density is
# low enough, and pd.DataFrame() below cannot wrap that into a feature table.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    sparse_threshold=0,
)

# Preprocessing-only pipeline (no model step).
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training features only and reuse the fitted
# transformers for the test features. Columns of the resulting frames are
# positional (0..n-1): numerical columns first, then the one-hot columns.
X_train = pd.DataFrame(my_pipeline.fit_transform(X_train_full))
X_test = pd.DataFrame(my_pipeline.transform(X_test_full))


# Bundle preprocessing and modelling code in pipeline
#clf = Pipeline(steps=[('preprocessor', preprocessor),
#                      ('model', model)
#                     ])

# Preprocessing of training data, fit model
#scores = cross_val_score(clf, X_train, y,
#                        cv=5)
#print('Roc Auc score:\n', scores)

In [None]:
# Choose the number of boosting rounds by cross-validation, refit the
# classifier on the full training set, then score the test set.
xgtrain = xgb.DMatrix(X_train, label=y)
clf = xgb.XGBClassifier(n_estimators=1000,
                        learning_rate=0.05,
                        n_jobs=-1)
xgb_param = clf.get_xgb_params()

# Start cross validation.
# num_boost_round must be passed explicitly: xgb.cv defaults to only 10
# rounds, which would cap the search far below the n_estimators budget.
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=clf.get_params()['n_estimators'],
                  nfold=5, metrics=['auc'],
                  early_stopping_rounds=5)

# With early stopping, cvresult holds one row per retained boosting round.
best_n_trees = cvresult.shape[0]
print('Best number of trees = {}'.format(best_n_trees))
clf.set_params(n_estimators=best_n_trees)

# Fit on the training data.
# No eval_metric is passed to fit(): without an eval_set it was never
# evaluated, and the argument is deprecated/removed in recent XGBoost.
clf.fit(X_train, y)
# This AUC is computed on the same rows the model was fitted on, so it is an
# in-sample figure, not an estimate of generalisation performance.
print('Overall AUC:', roc_auc_score(y, clf.predict_proba(X_train)[:,1]))

# Predict the class probabilities for the test set. The model already holds
# exactly best_n_trees trees, so no ntree_limit is needed (that argument is
# deprecated/removed in recent XGBoost).
pred = clf.predict_proba(X_test)

In [None]:
def prob(x):
    """Turn a probability into a binary label.

    Returns 1 when ``x`` is strictly greater than 0.5, otherwise 0
    (so exactly 0.5 maps to 0).
    """
    return 1 if x > 0.5 else 0

In [None]:
# Threshold the second probability column of `pred` into 0/1 labels with
# `prob`, pair them with the passenger ids and write the submission file.
survived_labels = [prob(probability) for probability in pred[:, 1]]
output = pd.DataFrame({'PassengerId': X_test_full.PassengerId,
                       'Survived': survived_labels})
output.to_csv('submission.csv', index=False)