<a href="https://www.kaggle.com/code/amirulmahmud/titanic-predictions-with-gradient-boosting?scriptVersionId=124935967" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Load The Data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
train_data.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data.head()

# **Data Cleaning**

In [None]:
# Remove the columns that are not used as features from the training data,
# then preview what is left.
train_data = train_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
train_data.head()

In [None]:
# Remove the same unused columns from the test data so both frames keep
# an identical set of feature columns, then preview what is left.
test_data = test_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
test_data.head()

# **Handling Missing Value**

In [None]:
# Count the missing values in each column of train_data.
train_data.isna().sum()

In [None]:
# Count the missing values in each column of test_data.
test_data.isna().sum()

In [None]:
# Frequency of each 'Embarked' value in train_data; used to pick the fill value below.
train_data['Embarked'].value_counts()

In [None]:
# Set every missing 'Embarked' entry in train_data to 'S'.
train_data.loc[train_data['Embarked'].isna(), 'Embarked'] = 'S'

In [None]:
# Summary statistics of the 'Age' column in train_data.
train_data['Age'].describe()

In [None]:
# Histogram of 'Age' in train_data (30 bins) on a 12x8 inch, 200-dpi figure.
plt.figure(figsize=(12,8),dpi=200)
sns.histplot(data=train_data,x='Age',bins=30)

In [None]:
# Impute the missing ages in train_data with the mean of the observed ages.
mean_train_age = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(mean_train_age)

In [None]:
# Re-check the per-column missing-value counts in train_data after the fills above.
train_data.isna().sum()

In [None]:
# Summary statistics of the 'Age' column in test_data.
test_data['Age'].describe()

In [None]:
# Histogram of 'Age' in test_data (30 bins) on a 12x8 inch, 200-dpi figure.
plt.figure(figsize=(12,8),dpi=200)
sns.histplot(data=test_data,x='Age',bins=30)

In [None]:
# Fill missing 'Age' values in test_data with the mean age of the TRAINING data.
# Imputation statistics are learned from the training set only, so the test
# features are preprocessed exactly like the data the model was fitted on and
# nothing is estimated from the rows being predicted.
# (train_data['Age'] was mean-imputed earlier, which leaves its mean unchanged.)
test_data['Age'] = test_data['Age'].fillna(train_data['Age'].mean())

In [None]:
# Summary statistics of the 'Fare' column in test_data.
test_data['Fare'].describe()

In [None]:
# Histogram of 'Fare' in test_data (15 bins) on a 12x8 inch, 200-dpi figure.
plt.figure(figsize=(12,8),dpi=200)
sns.histplot(data=test_data,x='Fare',bins=15)

In [None]:
# Fill missing 'Fare' values in test_data with the mean fare of the TRAINING data.
# Imputation statistics are learned from the training set only, so the test
# features are preprocessed exactly like the data the model was fitted on and
# nothing is estimated from the rows being predicted.
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].mean())

In [None]:
# Re-check the per-column missing-value counts in test_data after the fills above.
test_data.isna().sum()

# **Split The Data**

In [None]:
# Separate the 'Survived' label from the training features; the test
# features are taken as an independent copy of test_data.
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')
X_test = test_data.copy()

**Check the balance of the label data in training set.**

In [None]:
# Number of training samples in each 'Survived' class.
y_train.value_counts()

In [None]:
# Bar chart of the class counts in y_train.
sns.countplot(x=y_train)

In [None]:
# Share of the positive class (Survived == 1), computed from the labels
# themselves instead of hand-copied counts so it stays correct if the
# data or the preceding cleaning steps change.
(y_train == 1).mean()

As we can see, the label classes in the training data are imbalanced (62% : 38%). To keep the model from being biased toward the majority class, the training data is balanced with SMOTE (Synthetic Minority Over-sampling Technique).

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Apply one-hot encoding to X_train. drop_first=True drops the first level
# of each encoded column so k categories become k-1 indicator columns.
# Preview the encoded frame.
X_train = pd.get_dummies(X_train, drop_first=True)
X_train.head()

In [None]:
# Oversample the minority class with SMOTE. A fixed random_state makes the
# synthetic samples (and everything trained on them) reproducible across runs.
# NOTE(review): resampling before cross-validation means synthetic rows derived
# from a validation fold can end up in the training folds, which inflates the
# CV scores; consider moving SMOTE into an imblearn Pipeline so it is refit
# inside each fold.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Shapes of the feature matrix and label vector after resampling.
X_train.shape, y_train.shape

In [None]:
# Bar chart of the class counts in y_train after resampling.
sns.countplot(x=y_train)

# **Feature Engineering**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardization step; used below as the first stage of the pipeline.
scaler = StandardScaler()

# **Modelling with Gradient Boosting**

Create a base model of gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting classifier with default hyperparameters. random_state is
# fixed so repeated runs of the notebook build the same trees and the grid
# search selects the same parameters.
model = GradientBoostingClassifier(random_state=42)

Create a pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the scaler and the classifier; the step names ('scaler', 'model')
# are the prefixes used to address their parameters in the grid search.
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('model', model),
])

Perform a grid-search with the pipeline to test various parameters and report back the best performing parameters.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the pipeline's 'model' step:
# n_estimators from 1 to 99 and max_depth from 3 to 7.
parameters = {
    'model__n_estimators': [*range(1, 100)],
    'model__max_depth': [*range(3, 8)],
}

In [None]:
# 5-fold cross-validated grid search over `parameters`, scored by accuracy.
# Every parameter combination is refit on each fold, so the search is large;
# n_jobs=-1 spreads the independent fits across all available CPU cores
# without changing the results.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the (resampled) training data.
grid.fit(X_train,y_train)

In [None]:
# Show the parameter combination that achieved the best cross-validated score.
grid.best_params_

# **Cross Validation Results**

In [None]:
# Put the grid-search results into a DataFrame for inspection.
cv_results = pd.DataFrame(grid.cv_results_)

In [None]:
# Print the columns, dtypes and non-null counts of the results table.
cv_results.info()

Result: n_estimators

In [None]:
# Average the mean CV score over all max_depth values for each n_estimators.
# The score column is selected BEFORE aggregating: cv_results also contains
# non-numeric columns (e.g. 'params'), and averaging the whole frame raises
# a TypeError on pandas >= 2.0.
cv_n = cv_results.groupby('param_model__n_estimators')['mean_test_score'].mean()
cv_n

In [None]:
# n_estimators values ordered from highest to lowest average score.
cv_n.sort_values(ascending=False)

In [None]:
# Average the mean CV score over all n_estimators values for each max_depth.
# The score column is selected BEFORE aggregating: cv_results also contains
# non-numeric columns (e.g. 'params'), and averaging the whole frame raises
# a TypeError on pandas >= 2.0.
cv_max = cv_results.groupby('param_model__max_depth')['mean_test_score'].mean()
cv_max

In [None]:
# max_depth values ordered from highest to lowest average score.
cv_max.sort_values(ascending=False)

# **Prediction**

In [None]:
# Apply one-hot encoding to X_test, then align it with the training columns.
# The two frames are encoded independently, so a category that is missing
# (or sorts first) in only one of them would yield different or differently
# ordered columns. Reindexing to X_train.columns guarantees the model receives
# the layout it was fitted on; indicator columns absent from the test set are
# filled with 0.
X_test = pd.get_dummies(X_test, drop_first=True)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Predict the test labels with the fitted grid search.
y_pred = grid.predict(X_test)

In [None]:
# Display the predicted labels.
y_pred

# **Submission**

In [None]:
# Reload the original test file to recover 'PassengerId', which was dropped
# from test_data during cleaning but is needed for the submission.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test.head()

In [None]:
# Pair each PassengerId from the test file with its predicted label and
# write the table to the submission CSV without the index column.
predictions = test[['PassengerId']].assign(Survived=y_pred)
predictions.to_csv('/kaggle/working/submission2.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
predictions.head()

In [None]:
# Count how many passengers were predicted for each 'Survived' class.
predictions['Survived'].value_counts()

**Thank You**