<a href="https://colab.research.google.com/github/addicted-ai/kaggle_practice/blob/main/titanic_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading the dataset
df = pd.read_csv('https://raw.githubusercontent.com/addicted-ai/kaggle_practice/main/dataset/titanic/train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/addicted-ai/kaggle_practice/main/dataset/titanic/test.csv')

In [3]:
# data to be used for training & with labeled dependent variable
df.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [4]:
# test data that i have to get prediction & submit
test.head(4)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


- Categorical columns are `Name`, `Sex`, `Ticket`, `Cabin`, `Embarked`.
- Name column can't be used for model.

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
cat_col = ['Sex', 'Ticket', 'Cabin', 'Embarked']
for i in cat_col:
  print(i,':')
  display(df[i].value_counts(dropna=False))
  print('________\n')

Sex :


male      577
female    314
Name: Sex, dtype: int64

________

Ticket :


1601                  7
347082                7
CA. 2343              7
347088                6
CA 2144               6
                     ..
STON/O 2. 3101288     1
SOTON/O.Q. 3101311    1
330959                1
3101296               1
373450                1
Name: Ticket, Length: 681, dtype: int64

________

Cabin :


NaN            687
B96 B98          4
G6               4
C23 C25 C27      4
F33              3
              ... 
B69              1
A26              1
A16              1
A32              1
D7               1
Name: Cabin, Length: 148, dtype: int64

________

Embarked :


S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

________



In [8]:
print('No of Unique values in Cabin:')
display(df['Cabin'].nunique())
print('No of Unique values in Ticket:')
display(df['Ticket'].nunique())

No of Unique values in Cabin:


147

No of Unique values in Ticket:


681

- Both Ticket & Cabin have very high no of level. We can't use them for training.
- 'Age' Columns seems to have ~20% NaN values. We can drop it.
- 'Embarked` column has 2 NaN values. We can impute NaN with mode of column.

In [9]:
df['Embarked'] = df['Embarked'].replace(np.nan, df['Embarked'].mode()[0])

In [10]:
df['Fare'].describe(percentiles=[0.1, 0.25, 0.5, 0.75, .8, 0.9, 0.95, 0.97, 0.99, 1])

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
10%        7.550000
25%        7.910400
50%       14.454200
75%       31.000000
80%       39.687500
90%       77.958300
95%      112.079150
97%      151.550000
99%      249.006220
100%     512.329200
max      512.329200
Name: Fare, dtype: float64

In [11]:
df['Fare'].median()

14.4542

In [12]:
test['Fare'] = test['Fare'].replace(np.nan, df['Fare'].median())

In [13]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [14]:
features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = pd.get_dummies(df[features])
X_test = pd.get_dummies(test[features])

y = df["Survived"]

In [None]:
param_test1 = {
    'n_estimators': [100,200,500,750,1000],
    'max_depth': [3,5,7,9],
    'min_child_weight': [1,3,5],
    'gamma':[i/10.0 for i in range(0,5)],
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1],
    'learning_rate': [0.01, 0.02, 0.05, 0.1]
}
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
gs = GridSearchCV(estimator = XGBClassifier(), 
                       param_grid = param_test1, 
                       scoring=scoring, iid=False,
                       cv=5, verbose = 5, 
                       refit='Accuracy')

gs.fit(X, y)

Fitting 5 folds for each of 134400 candidates, totalling 672000 fits
[CV] colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, subsample=0.6 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, subsample=0.6, AUC=0.837, Accuracy=0.788, total=   0.2s
[CV] colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, subsample=0.6, AUC=0.831, Accuracy=0.809, total=   0.1s
[CV] colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, subsample=0.6, AUC=0.866, Accuracy=0.809, total=   0.1s
[CV] colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.0, learning_rate

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV]  colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, reg_alpha=0.01, subsample=0.6, AUC=0.827, Accuracy=0.809, total=   0.1s
[CV] colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, reg_alpha=0.01, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, reg_alpha=0.01, subsample=0.6, AUC=0.869, Accuracy=0.815, total=   0.1s
[CV] colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, reg_alpha=0.01, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, reg_alpha=0.01, subsample=0.6, AUC=0.873, Accuracy=0.792, total=   0.1s
[CV] colsample_bytree=0.6, gamma=0.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, 

In [None]:
xgb = gs.best_estimator_

In [46]:
y_pred = xgb.predict(X_test)

      Iter       Train Loss   Remaining Time 
         1           1.2100           12.19s
         2           1.1003           10.05s
         3           1.0177            8.98s
         4           0.9506            8.35s
         5           0.8911            8.18s
         6           0.8420            7.99s
         7           0.7965            7.87s
         8           0.7549            7.95s
         9           0.7215            7.88s
        10           0.6913            7.84s
        20           0.5057            7.48s
        30           0.4339            7.16s
        40           0.3986            6.93s
        50           0.3781            6.85s
        60           0.3627            6.77s
        70           0.3525            6.70s
        80           0.3464            6.54s
        90           0.3415            6.46s
       100           0.3387            6.38s
       200           0.3303            5.44s
       300           0.3299            4.48s
       40

In [47]:
print('Accuracy of the model is:  ',accuracy_score(y, gbm.predict(X)))

Accuracy of the model is:   0.9248035914702581


In [48]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_pred})
output.to_csv('my_submission.csv', index=False)

In [49]:
#Confusion Matrix
cm = confusion_matrix(y, xgb.predict(X))
print('The confusion Matrix : \n',cm)

The confusion Matrix : 
 [[532  17]
 [ 50 292]]
