<a href="https://colab.research.google.com/github/addicted-ai/kaggle_practice/blob/main/titanic_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


import warnings
warnings.filterwarnings('ignore')

In [40]:
# Load the Titanic CSVs from the GitHub mirror:
#   df   -> training rows (includes the `Survived` target)
#   test -> rows to predict for the submission
df = pd.read_csv(
    'https://raw.githubusercontent.com/addicted-ai/kaggle_practice/main/dataset/titanic/train.csv'
)
test = pd.read_csv(
    'https://raw.githubusercontent.com/addicted-ai/kaggle_practice/main/dataset/titanic/test.csv'
)

In [41]:
# Preview the first four rows of the training data, which carries the
# labelled dependent variable (`Survived`).
df.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [42]:
# Preview the first four rows of the test data, for which predictions
# have to be produced and submitted.
test.head(4)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S


In [43]:
# Column dtypes and non-null counts of the training data (reveals missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


- Categorical columns are `Name`, `Sex`, `Ticket`, `Cabin`, `Embarked`.
- The `Name` column is free text, so it can't be used for the model as-is.

In [44]:
# Column dtypes and non-null counts of the test data (reveals missing values).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [45]:
# For every categorical column, show how often each level occurs.
# dropna=False makes missing values (NaN) appear as a level of their own.
cat_col = ['Sex', 'Ticket', 'Cabin', 'Embarked']
for i in cat_col:
  print(i,':')
  display(df[i].value_counts(dropna=False))
  print('________\n')

Sex :


male      577
female    314
Name: Sex, dtype: int64

________

Ticket :


CA. 2343    7
1601        7
347082      7
347088      6
CA 2144     6
           ..
3101267     1
3101277     1
29108       1
17464       1
365222      1
Name: Ticket, Length: 681, dtype: int64

________

Cabin :


NaN            687
C23 C25 C27      4
B96 B98          4
G6               4
F33              3
              ... 
B73              1
C118             1
C45              1
D56              1
B50              1
Name: Cabin, Length: 148, dtype: int64

________

Embarked :


S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

________



In [46]:
# Number of distinct values in the two columns that showed the most levels above.
print('No of Unique values in Cabin:')
display(df['Cabin'].nunique())
print('No of Unique values in Ticket:')
display(df['Ticket'].nunique())

No of Unique values in Cabin:


147

No of Unique values in Ticket:


681

- Both `Ticket` and `Cabin` have a very high number of levels (681 and 147 unique values), so we can't use them for training.
- The `Age` column has ~20% NaN values (177 of 891 rows), so we drop it.
- The `Embarked` column has 2 NaN values. We can impute them with the mode of the column.

In [47]:
# Fill the missing `Embarked` entries with the column's most frequent value (mode).
df['Embarked'] = df['Embarked'].replace(np.nan, df['Embarked'].mode()[0])

In [48]:
# Distribution summary of `Fare` with additional upper-tail percentiles
# (the 100% entry is the same value as `max`).
df['Fare'].describe(percentiles=[0.1, 0.25, 0.5, 0.75, .8, 0.9, 0.95, 0.97, 0.99, 1])

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
10%        7.550000
25%        7.910400
50%       14.454200
75%       31.000000
80%       39.687500
90%       77.958300
95%      112.079150
97%      151.550000
99%      249.006220
100%     512.329200
max      512.329200
Name: Fare, dtype: float64

In [49]:
# Median training fare; used below to fill the missing test-set fare.
df['Fare'].median()

14.4542

In [50]:
# The test set has a missing `Fare`; fill it with the median fare of the
# training data (`df`), not a statistic computed on the test set.
test['Fare'] = test['Fare'].replace(np.nan, df['Fare'].median())

In [51]:
# List the available columns before choosing the model features.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [52]:
# Columns fed to the model. Name, Ticket, Cabin and Age are not included.
features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']

# One-hot encode the object-typed columns (Sex, Embarked); numeric columns
# pass through unchanged.
X = pd.get_dummies(df[features])

# Encode the test set the same way, then force it onto the training column
# layout. Encoding the two frames independently yields different columns
# whenever a category level is present in only one of them, which would make
# the fitted model and X_test disagree. Levels unseen in training are dropped
# and levels missing from the test set are added as all-zero columns.
X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

# Target: the `Survived` column of the training data.
y = df["Survived"]

In [53]:
# Baseline: an XGBoost classifier with library-default hyperparameters,
# fitted on the full training set.
xgb = XGBClassifier()
xgb.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [54]:
# Single-candidate grid: a cross-validated reference score for one fixed
# parameter combination before the wider searches below.
param1 = {
    'max_depth': [3],
    'min_child_weight': [1],
    'learning_rate': [0.1]
}
# scoring='roc_auc' is used rather than the `scorer` object because that
# object is only created in a later cell; referencing it here raises a
# NameError when the notebook is run top to bottom on a fresh kernel.
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gs = GridSearchCV(estimator=XGBClassifier(),
                  param_grid=param1,
                  scoring='roc_auc',
                  cv=4, verbose=1,
                  refit=True)
gs.fit(X, y)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'learning_rate': [0.1], 'max_depth': [3],
                         'min_child_weight': [1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_sco

In [55]:
# Report the winning parameter combination, its cross-validated score and
# the refitted estimator, one per line.
print(gs.best_params_, gs.best_score_, gs.best_estimator_, sep='\n')

{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1}
0.8565203643397828
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [56]:
# Coarse grid: 2 x 3 x 3 = 18 candidate combinations.
param2 = {
    'max_depth': [3,5],
    'min_child_weight': [1,3,5],
    'learning_rate': [0.001,0.01, 0.1]
}
# Multi-metric scoring (ROC AUC and accuracy), used by the `gs2` search.
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
# Single-metric ROC AUC scorer, used by the `gs` searches.
# The built-in scorer is fetched by name instead of being assembled with
# make_scorer(..., needs_proba=True, needs_threshold=False): those two
# arguments were deprecated in scikit-learn 1.4 and later removed.
# For a binary classifier that only exposes predict_proba (as XGBClassifier
# does) the built-in scorer evaluates the positive-class probability,
# i.e. the same quantity the hand-built scorer used.
scorer = metrics.get_scorer('roc_auc')

In [57]:
# Coarse search over `param2`, ranked by 4-fold cross-validated ROC AUC;
# the best candidate is refitted on the full training set.
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gs = GridSearchCV(estimator=XGBClassifier(),
                  param_grid=param2,
                  scoring=scorer,
                  cv=4, verbose=1,
                  refit=True)
gs.fit(X, y)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    3.8s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'max_depth': [3, 5], 'min_child_weight': [1, 3, 5]},
             pre_dispatch='2*n_jobs', refit=

In [58]:
# Report the winning parameter combination, its cross-validated score and
# the refitted estimator, one per line.
print(gs.best_params_, gs.best_score_, gs.best_estimator_, sep='\n')

{'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 5}
0.861934327914777
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [59]:
# Refinement 1: 3 x 3 x 5 = 45 candidates with higher max_depth,
# min_child_weight and learning_rate values than the coarse grid.
param = {
    'max_depth': [4,5,6],
    'min_child_weight': [5,6,7],
    'learning_rate': [0.25,0.05, 0.1,0.15,0.2]
}
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gs = GridSearchCV(estimator=XGBClassifier(),
                  param_grid=param,
                  scoring=scorer,
                  cv=4, verbose=1,
                  refit=True)
gs.fit(X, y)

Fitting 4 folds for each of 45 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   10.6s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'learning_rate': [0.25, 0.05, 0.1, 0.15, 0.2],
                         'max_depth': [4, 5, 6],
                         'min_child_weight': [5, 6, 7]},
      

In [60]:
# Report the winning parameter combination, its cross-validated score and
# the refitted estimator, one per line.
print(gs.best_params_, gs.best_score_, gs.best_estimator_, sep='\n')

{'learning_rate': 0.25, 'max_depth': 5, 'min_child_weight': 6}
0.8665275903118068
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.25, max_delta_step=0, max_depth=5,
              min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [61]:
# Refinement 2: same depth / child-weight ranges, learning rates extended
# up to 0.3 (45 candidates).
param = {
    'max_depth': [4,5,6],
    'min_child_weight': [5,6,7],
    'learning_rate': [0.025, 0.1,0.2,0.25,0.3]
}
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gs = GridSearchCV(estimator=XGBClassifier(),
                  param_grid=param,
                  scoring=scorer,
                  cv=4, verbose=1,
                  refit=True)
gs.fit(X, y)

Fitting 4 folds for each of 45 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   10.7s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'learning_rate': [0.025, 0.1, 0.2, 0.25, 0.3],
                         'max_depth': [4, 5, 6],
                         'min_child_weight': [5, 6, 7]},
      

In [62]:
# Report the winning parameter combination, its cross-validated score and
# the refitted estimator, one per line.
print(gs.best_params_, gs.best_score_, gs.best_estimator_, sep='\n')

{'learning_rate': 0.3, 'max_depth': 4, 'min_child_weight': 7}
0.8675280978266295
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=4,
              min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [63]:
# Refinement 3: 4 x 4 x 4 = 64 candidates, learning rates between
# 0.25 and 0.35 and min_child_weight up to 8.
param = {
    'max_depth': [3,4,5,6],
    'min_child_weight': [5,6,7,8],
    'learning_rate': [0.25,0.3,0.325,0.35]
}
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gs = GridSearchCV(estimator=XGBClassifier(),
                  param_grid=param,
                  scoring=scorer,
                  cv=4, verbose=1,
                  refit=True)
gs.fit(X, y)

Fitting 4 folds for each of 64 candidates, totalling 256 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed:   13.7s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'learning_rate': [0.25, 0.3, 0.325, 0.35],
                         'max_depth': [3, 4, 5, 6],
                         'min_child_weight': [5, 6, 7, 8]},
    

In [64]:
# Report the winning parameter combination, its cross-validated score and
# the refitted estimator, one per line.
print(gs.best_params_, gs.best_score_, gs.best_estimator_, sep='\n')

{'learning_rate': 0.325, 'max_depth': 4, 'min_child_weight': 8}
0.8684844365738886
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.325, max_delta_step=0, max_depth=4,
              min_child_weight=8, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [65]:
# Refinement 4: min_child_weight extended up to 10 (3 x 5 x 3 = 45
# candidates). `gs` from this cell is the search whose best estimator is
# used for the final predictions.
param = {
    'max_depth': [3,4,5],
    'min_child_weight': [6,7,8,9,10],
    'learning_rate': [0.3,0.325,0.35]
}
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gs = GridSearchCV(estimator=XGBClassifier(),
                  param_grid=param,
                  scoring=scorer,
                  cv=4, verbose=1,
                  refit=True)
gs.fit(X, y)

Fitting 4 folds for each of 45 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    9.0s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'learning_rate': [0.3, 0.325, 0.35],
                         'max_depth': [3, 4, 5],
                         'min_child_weight': [6, 7, 8, 9, 10]},
         

In [66]:
# Report the winning parameter combination, its cross-validated score and
# the refitted estimator, one per line.
print(gs.best_params_, gs.best_score_, gs.best_estimator_, sep='\n')

{'learning_rate': 0.325, 'max_depth': 4, 'min_child_weight': 8}
0.8684844365738886
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.325, max_delta_step=0, max_depth=4,
              min_child_weight=8, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [None]:
# Multi-metric search over the coarse `param2` grid: every candidate is
# scored on both AUC and accuracy (the `scoring` dict), and the candidate
# with the best accuracy is the one refitted (refit='Accuracy').
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gs2 = GridSearchCV(estimator=XGBClassifier(),
                   param_grid=param2,
                   scoring=scoring,
                   cv=4, verbose=1,
                   refit='Accuracy')
gs2.fit(X, y)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    4.1s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'max_depth': [3, 5], 'min_child_weight': [1, 3, 5]},
             pre_dispatch='2*n_jobs', refit=

In [None]:
# Report the accuracy-selected winner of the multi-metric search: its
# parameters, its score on the refit metric and the refitted estimator.
print(gs2.best_params_, gs2.best_score_, gs2.best_estimator_, sep='\n')

{'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1}
0.8125984729123742
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [67]:
# Final model: the refitted best estimator of the latest AUC-scored search
# (`gs`), not the accuracy-refit `gs2`. This rebinds `xgb`, replacing the
# baseline model fitted earlier.
xgb = gs.best_estimator_
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.325, max_delta_step=0, max_depth=4,
              min_child_weight=8, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [68]:
# Predicted `Survived` labels for the encoded test set.
y_pred = xgb.predict(X_test)

In [69]:
# NOTE: this is accuracy on (X, y), the same training data the model was
# fitted on, so it is an optimistic figure rather than a held-out estimate.
print('Accuracy of the model is:  ',accuracy_score(y, xgb.predict(X)))

Accuracy of the model is:   0.8653198653198653


In [70]:
# Assemble the submission table (passenger id + predicted label) and write
# it to disk without the DataFrame index.
output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred,
})
output.to_csv('my_submission.csv', index=False)

In [None]:
# Confusion matrix of the final model on the training data: true labels `y`
# against the model's predictions for `X`.
cm = confusion_matrix(y, xgb.predict(X))
print('The confusion Matrix : \n',cm)

The confusion Matrix : 
 [[516  33]
 [ 87 255]]
