In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from pactools.grid_search import GridSearchCVProgressBar

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
X_train, Y_train = train.drop("Survived", axis = 1), train["Survived"]

categorical_features = X_train.select_dtypes(include=['object']).columns

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

## Hyper Parameter Optimization
n_estimators = [1100, 1500]
max_depth = [10, 15]
learning_rate=[0.05,0.20]
min_child_weight=[1,4]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth':max_depth,
    'regressor__learning_rate':learning_rate,
    'regressor__min_child_weight':min_child_weight
    }

# xgboost pipeline
classifier = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("regressor", XGBClassifier())]
    )

# grid search object
classifier_cv = GridSearchCV(
    estimator = classifier, 
    param_grid = hyperparameter_grid,
    cv = 3, 
    verbose = 1
)

In [3]:
classifier_cv.fit(X_train, Y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object'))])),
                                       ('regressor',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=No...
                                              

In [5]:
pd.DataFrame(classifier_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__learning_rate,param_regressor__max_depth,param_regressor__min_child_weight,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.915532,0.038842,0.010174,0.001007,0.05,10,1,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.777778,0.794613,0.771044,0.781145,0.009912,1
1,1.205946,0.019107,0.010568,0.000894,0.05,10,1,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.777778,0.794613,0.771044,0.781145,0.009912,1
2,0.884409,0.041001,0.009407,0.000291,0.05,10,4,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.774411,0.791246,0.771044,0.7789,0.008837,9
3,1.156336,0.008155,0.010336,0.001246,0.05,10,4,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.774411,0.791246,0.771044,0.7789,0.008837,9
4,0.882333,0.018912,0.010999,0.000815,0.05,15,1,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.777778,0.794613,0.771044,0.781145,0.009912,1
5,1.202237,0.016871,0.010332,0.00047,0.05,15,1,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.777778,0.794613,0.771044,0.781145,0.009912,1
6,0.861576,0.014185,0.010503,0.001079,0.05,15,4,1100,"{'regressor__learning_rate': 0.05, 'regressor_...",0.774411,0.791246,0.771044,0.7789,0.008837,9
7,1.156387,0.011859,0.010882,0.000835,0.05,15,4,1500,"{'regressor__learning_rate': 0.05, 'regressor_...",0.774411,0.791246,0.771044,0.7789,0.008837,9
8,0.897563,0.013643,0.009332,0.000473,0.2,10,1,1100,"{'regressor__learning_rate': 0.2, 'regressor__...",0.777778,0.794613,0.771044,0.781145,0.009912,1
9,1.225106,0.018264,0.009834,0.000235,0.2,10,1,1500,"{'regressor__learning_rate': 0.2, 'regressor__...",0.777778,0.794613,0.771044,0.781145,0.009912,1


In [6]:
test["Survived"] = classifier_cv.predict(test)

submission = test[["PassengerId","Survived"]]

submission.to_csv("submission.csv", index=False)