## Data Dictionary
1. survival     Survival 	0 = No, 1 = Yes
2. pclass       Ticket class 	1 = 1st, 2 = 2nd, 3 = 3rd
3. sex 	        Sex 	
4. Age 	        Age in years 	
5. sibsp 	    # of siblings / spouses aboard the Titanic 	
6. parch 	    # of parents / children aboard the Titanic 	
7. ticket 	    Ticket number 	
8. fare 	    Passenger fare 	
9. cabin 	    Cabin number 	
10. embarked 	Port of Embarkation 	C = Cherbourg, Q = Queenstown, S = Southampton

## Import library

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool, cv
from matplotlib import rcParams
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

# This function makes the plot directly on browser
%matplotlib inline

# Setting a figure size 
rcParams['figure.figsize'] = 10, 8

## Import Dataset

In [None]:
# Locations of the competition files inside the Kaggle input directory
train_csv = "/kaggle/input/titanic/train.csv"
test_csv = "/kaggle/input/titanic/test.csv"

# Load the labelled training split and the unlabelled test split
dfTrain = pd.read_csv(train_csv)
dfTest = pd.read_csv(test_csv)

## Preprocessing train data

In [None]:
# Show column dtypes and non-null counts of the training data.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
dfTrain.info()
print("-"*20)
# Number of missing values per column
print(dfTrain.isnull().sum())

Drop `PassengerId`, `Cabin`, `Ticket` and `Name`, because they won't give us much information.

In [None]:
# Columns that are not used as model features
drop_column = ['PassengerId', 'Cabin', 'Ticket', 'Name']

# Rebind to a new frame instead of dropping in place, and ignore columns that
# are already gone, so re-running this cell does not raise a KeyError.
dfTrain = dfTrain.drop(columns=drop_column, errors='ignore')

We don't have a lot of data, so we fill the missing values instead of dropping rows.

In [None]:
# Fill missing ages with the median age and missing ports with the most common
# port. The result is assigned back to the column: fillna(..., inplace=True) on
# a column selection is chained assignment, which pandas 2.x deprecates and
# which stops updating the frame once Copy-on-Write is the default.
dfTrain['Age'] = dfTrain['Age'].fillna(dfTrain['Age'].median())
dfTrain['Embarked'] = dfTrain['Embarked'].fillna(dfTrain['Embarked'].mode()[0])

# Remaining missing values per column (shown as the cell output)
dfTrain.isnull().sum()

Encode `Sex` and `Embarked` as integer labels (using a manual mapping rather than scikit-learn's `LabelEncoder`).

In [None]:
# Integer codes for the two categorical features
sex_codes = {"male": 0, "female": 1}
embarked_codes = {"C": 0, "Q": 1, "S": 2}

# Assign the encoded values back to the column: replace(..., inplace=True) on a
# column selection is chained assignment (deprecated in pandas 2.x, ineffective
# under Copy-on-Write) and also relies on replace() downcasting object to int.
# dict.get(label, label) leaves any value that is not a known label unchanged,
# as replace() did, so re-running the cell on already-encoded data is a no-op.
dfTrain['Sex'] = dfTrain['Sex'].map(lambda label: sex_codes.get(label, label))
dfTrain['Embarked'] = dfTrain['Embarked'].map(lambda label: embarked_codes.get(label, label))

## Summary data

In [None]:
# Dtypes and non-null counts after preprocessing. info() prints its own report
# and returns None, so it is not wrapped in print() (that would emit "None").
dfTrain.info()
print("-"*70)
# Descriptive statistics of the numeric columns
print(dfTrain.describe())

## Split data

In [None]:
# Separate the target column from the feature columns
y = dfTrain['Survived']
X = dfTrain.drop(columns=['Survived'])

# Keep 90% of the rows for fitting and hold out the rest for validation;
# stratifying on y keeps the class proportions the same in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y,
    train_size=0.9,
    stratify=y,
    random_state=420,
)

## Train CatBoost

In [None]:
# CatBoost settings: many boosting iterations at a small learning rate,
# accuracy as the evaluation metric, silent training, fixed seed.
catboost_params = dict(
    iterations=10000,
    learning_rate=0.01,
    eval_metric='Accuracy',
    random_seed=420,
    verbose=0,
)
model = CatBoostClassifier(**catboost_params)

# The hold-out split is passed as eval_set. use_best_model is not set
# explicitly here; per the CatBoost docs it is enabled by default when an
# eval_set is supplied, which keeps the iteration count that scored best on
# that set — TODO confirm for the installed CatBoost version.
model.fit(xtrain, ytrain, eval_set=(xtest, ytest))

In [None]:
# 5-fold cross-validation on the full training data. cross_val_score is
# imported in the notebook's import cell together with the other sklearn names.
# Each fold refits a clone of `model`, so this cell is slow with 10000 iterations.
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'cross validation: {cv_scores}')

# NOTE(review): xtest/ytest was also the eval_set passed to model.fit(), so this
# number is likely optimistic compared with accuracy on truly unseen data.
holdout_accuracy = accuracy_score(ytest, model.predict(xtest))
print(f'the test accuracy is :{holdout_accuracy:.2f}')

## Make submission prediction.
First, preprocess the test data in the same way as the training data.

In [None]:
# Reload the raw test split so this cell can be re-run on its own
dfTest = pd.read_csv("/kaggle/input/titanic/test.csv")

# PassengerId is kept on dfTest because the submission file needs it;
# it is removed from the feature matrix further down.
drop_column = ['Cabin', 'Ticket', 'Name']
dfTest = dfTest.drop(columns=drop_column)

# Impute by assigning the result back to the column: fillna(..., inplace=True)
# on a column selection is chained assignment, which pandas 2.x deprecates and
# which stops updating the frame once Copy-on-Write is the default.
dfTest['Age'] = dfTest['Age'].fillna(dfTest['Age'].median())
dfTest['Embarked'] = dfTest['Embarked'].fillna(dfTest['Embarked'].mode()[0])
# NOTE(review): the Kaggle test split is reported to contain a missing Fare;
# this fill does nothing when there is none — confirm with the counts below.
dfTest['Fare'] = dfTest['Fare'].fillna(dfTest['Fare'].median())

# Same integer codes as used for the training data
sex_codes = {"male": 0, "female": 1}
embarked_codes = {"C": 0, "Q": 1, "S": 2}

# dict.get(label, label) leaves any value that is not a known label unchanged,
# matching what replace() did with unmapped values.
dfTest['Sex'] = dfTest['Sex'].map(lambda label: sex_codes.get(label, label))
dfTest['Embarked'] = dfTest['Embarked'].map(lambda label: embarked_codes.get(label, label))

# Feature matrix for prediction
Xtest = dfTest.drop(columns='PassengerId')

# Remaining missing values per feature (shown as the cell output)
Xtest.isnull().sum()

Then predict on the test set and write the submission file.

In [None]:
# Predict a class for every test passenger and cast the labels to integers
pred = model.predict(Xtest).astype(int)

# Pair each prediction with its passenger id and write the file without the index
submission = pd.DataFrame({
    'PassengerId': dfTest['PassengerId'],
    'Survived': pred,
})
submission.to_csv('submission.csv', index=False)