In [9]:
# Call the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Load The Data

In [10]:
df = pd.read_csv('train (1).csv')

In [11]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Target variable
y = df['Survived']

In [13]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

### Filtering the obvious non-affecting features

##### These are 'PassengerId','Name','Ticket','Cabin','Embarked' . These will not have too much affect on our model.

In [14]:
# Droping those features
new_df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked','Survived'], axis = 1)

In [15]:
new_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


#### Let's check for any missing value

In [16]:
new_df.isnull().any()

Pclass    False
Sex       False
Age        True
SibSp     False
Parch     False
Fare      False
dtype: bool

In [17]:
new_df.isnull().sum()

Pclass      0
Sex         0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

In [18]:
X = new_df.fillna(0)

In [19]:
X.isnull().any()

Pclass    False
Sex       False
Age       False
SibSp     False
Parch     False
Fare      False
dtype: bool

In [20]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


### We will use Label Encoding to convert categorical data into numerical data

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X['Sex'] = le.fit_transform(X['Sex'])

In [22]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05


### Split the data into training and testing for model validation

In [23]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)

## Model:

#### 1. XGBOOST

In [24]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,accuracy_score

In [25]:
params = {'max_depth' : [8,10,12,15], 'learning_rate' : [0.01,0.05,0.1,0.2], 'scale_pos_weight' : [2,3,4,5]}

In [26]:
# Model
xgbm = XGBClassifier()

In [27]:
model = GridSearchCV(xgbm, params, cv = 5, n_jobs = -1, scoring = 'f1', verbose = 2)

In [28]:
model.fit(X_train,y_train)

# Get the best parameters
best_params = model.best_params_
print('Best parameters: ',best_params)

# Use the best model
best_model = model.best_estimator_

y_pred = best_model.predict(X_test)

# Classification Report
print(classification_report(y_test,y_pred))


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters:  {'learning_rate': 0.1, 'max_depth': 12, 'scale_pos_weight': 2}
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       157
           1       0.76      0.78      0.77       111

    accuracy                           0.81       268
   macro avg       0.80      0.81      0.80       268
weighted avg       0.81      0.81      0.81       268



In [29]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8097014925373134

### XGBOOST model is able to give a good accuracy!