# Import required packages

In [None]:
import pandas as pd
import matplotlib as plt
import seaborn as snb
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

# Import Titanic dataset into Python 

In [None]:
# Read the training CSV into a DataFrame. The path is hard-coded relative to
# the user's home directory, so the file must exist there for this cell to run.
train_data = pd.read_csv('~/Documents/Titanic/train.csv')

In [None]:
# Read the test CSV into a DataFrame (same hard-coded directory as train.csv).
test_data = pd.read_csv('~/Documents/Titanic/test.csv')

In [None]:
# Read gender_submission.csv; it is joined onto test_data by PassengerId in the
# next cell. NOTE(review): presumably this supplies the 'Survived' column for
# the test rows -- confirm it holds real labels rather than a baseline guess.
test_survived = pd.read_csv('~/Documents/Titanic/gender_submission.csv') 

In [None]:
# Attach the columns of test_survived to the test rows, matching on the shared
# 'PassengerId' key (inner join, which is pandas' default).
test_data = test_data.merge(test_survived, on='PassengerId')

# Pre-processing

In [None]:
# Column order to impose on the training frame, with 'Survived' placed last.
# NOTE(review): presumably chosen so train_data lines up with the merged
# test_data before the two are concatenated -- confirm.
columnsTitles = [
    'PassengerId', 'Pclass', 'Name', 'Sex',
    'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked', 'Survived',
]

# reindex(columns=...) returns the frame with exactly these columns, in this order.
train_data = train_data.reindex(columns=columnsTitles)

In [None]:
# Stack the training and test rows into a single frame so the preprocessing
# below is applied to both at once.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels, so the combined frame would carry duplicate
# labels and any label-based selection or alignment would be ambiguous.
total_data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Show the number of missing values in each column.
total_data.isna().sum()

In [None]:
# Show how many passengers boarded at each port. The next cell fills missing
# 'Embarked' values with "S", so the per-port row count is the figure needed
# here; summing every column per port would add up identifiers and attempt to
# sum text columns, which says nothing about how common each port is.
total_data.groupby('Embarked').size()

In [None]:
# Impute missing values: "S" for the port of embarkation, the column median
# for Fare and Age.
# Each result is assigned back to the frame explicitly. Calling
# fillna(..., inplace=True) on a selected column operates on an intermediate
# Series, which pandas warns about and which does not update the frame when
# Copy-on-Write is enabled.
total_data["Embarked"] = total_data["Embarked"].fillna("S")
total_data["Fare"] = total_data["Fare"].fillna(total_data["Fare"].median())
total_data["Age"] = total_data["Age"].fillna(total_data["Age"].median())

In [None]:
# Binary flag: 1 when the passenger has at least one relative aboard
# (Parch + SibSp > 0), otherwise 0.
# Computed in a single assignment instead of chained
# total_data['Family'].loc[...] = ... writes, which go through an intermediate
# Series (SettingWithCopyWarning; no effect on the frame under Copy-on-Write).
# Assumes Parch and SibSp contain no missing values; a NaN sum maps to 0 here.
total_data['Family'] = (
    (total_data['Parch'] + total_data['SibSp']) > 0
).astype('int64')

In [None]:
# Preview the first rows of the combined frame after imputation and the new
# 'Family' column.
total_data.head()

In [None]:
# One-hot encode 'Embarked', leave out the 'S' indicator so it acts as the
# reference level, and append the remaining indicator columns to the frame.
embark_dummies_total = pd.get_dummies(total_data['Embarked']).drop(columns=['S'])
total_data = pd.concat([total_data, embark_dummies_total], axis=1)

In [None]:
# One-hot encode 'Sex', leave out the 'male' indicator so it acts as the
# reference level, and append what remains to the frame.
sex_dummies_total = pd.get_dummies(total_data['Sex']).drop(columns=['male'])
total_data = pd.concat([total_data, sex_dummies_total], axis=1)

In [None]:
# One-hot encode 'Pclass' and give the indicator columns readable names.
# The renaming is positional, so it relies on get_dummies producing exactly
# three columns in this order.
pclass_dummies_total = pd.get_dummies(total_data['Pclass'])
pclass_dummies_total.columns = ['Class_1', 'Class_2', 'Class_3']

# Leave out 'Class_3' so it acts as the reference level, then append the rest.
pclass_dummies_total = pclass_dummies_total.drop(columns=['Class_3'])
total_data = pd.concat([total_data, pclass_dummies_total], axis=1)

In [None]:
# Drop the identifier/text columns together with the raw columns that the
# cells above replaced with 'Family' and the indicator columns.
total_data = total_data.drop(
    columns=[
        'PassengerId', 'Pclass', 'Name', 'Ticket', 'Cabin',
        'SibSp', 'Parch', 'Embarked', 'Sex',
    ]
)

In [None]:
# Hold out 30% of the combined rows for evaluation; the fixed random_state
# makes the shuffle reproducible between runs.
train_set, test_set = train_test_split(
    total_data,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Preview the first rows of the training partition.
train_set.head()

In [None]:
# Pearson correlation between Fare and Age on the training partition.
train_set['Fare'].corr(other=train_set['Age'], method='pearson')

In [None]:
# Separate the 'Survived' target from the feature columns in both partitions.
# DataFrame.drop returns a new frame, so train_set and test_set stay intact.
Y_train = train_set["Survived"]
X_train = train_set.drop(columns="Survived")

Y_test = test_set["Survived"]
X_test = test_set.drop(columns="Survived")

# Logistic Regression

In [None]:
# Logistic-regression classifier with scikit-learn's default settings,
# fitted on the training features and the 'Survived' target.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=Y_train)

In [None]:
# Predicted classes for the held-out rows. Y_pred is not referenced again in
# the cells shown here; the accuracy below comes from logreg.score instead.
Y_pred = logreg.predict(X_test)

In [None]:
# Fraction of held-out rows classified correctly, rounded to four decimals.
accuracy = round(logreg.score(X_test, Y_test), 4)

# Format with a fixed two decimals: multiplying the rounded float by 100 and
# passing it through str() can expose binary floating-point noise
# (e.g. 81.17999999999999) in the printed percentage.
print('The Model Accuracy is {:.2f}%'.format(accuracy * 100))

# Classification Tree

In [None]:
# Decision tree that splits on information gain (entropy) and is capped at
# depth 7; fit() returns the estimator itself, so `tree` is the fitted model.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=7)
tree = tree.fit(X_train, Y_train)

In [None]:
# Predicted classes from the decision tree for the held-out rows.
prediction = tree.predict(X_test)

In [None]:
# Render the fitted decision tree as a PNG inside the notebook.
# With out_file=None, export_graphviz returns the Graphviz DOT source as a
# string, so no StringIO buffer is needed. This removes the dependency on
# sklearn.externals.six.StringIO, which recent scikit-learn releases no longer ship.
dot_data = export_graphviz(
    tree,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Report the tree's accuracy on the held-out rows as a percentage with two decimals.
print(
    "The Model Accuracy is: ",
    round(tree.score(X_test, Y_test) * 100, 2),
    "%",
)

In [None]:
from sklearn.model_selection import KFold

In [None]:
# 5-fold cross-validation splitter.
# NOTE(review): kf is not used by any cell visible here -- confirm whether a
# cross-validation step is missing or this cell can be removed.
kf = KFold(n_splits=5)

# CatBoost Algorithm

In [None]:
from catboost import CatBoostClassifier,FeaturesData,Pool

In [None]:
# Configure the CatBoost classifier: 200 boosting iterations, trees of depth 8,
# learning rate 0.05 and the Logloss objective. verbose=True turns on
# CatBoost's logging output during fit.
model = CatBoostClassifier(iterations=200,
                           depth=8,
                           learning_rate=0.05,
                           loss_function='Logloss',
                           verbose=True)

In [None]:
# Train the CatBoost model on the same training features/target used for the
# other two classifiers.
model.fit(X_train, Y_train)

In [None]:
# Predictions for the held-out rows using predict()'s default prediction type.
# preds_class is reassigned in the next cell with prediction_type='Class'.
preds_class = model.predict(X_test)

In [None]:
# Request the three output forms of the same model for the held-out rows:
# class labels, class probabilities, and the raw (untransformed) formula values.
# Only preds_class is used later (for accuracy_score).
preds_class = model.predict(X_test, prediction_type='Class')
preds_proba = model.predict(X_test, prediction_type='Probability')
preds_raw_vals = model.predict(X_test, prediction_type='RawFormulaVal')

In [None]:
# Get predictions for the held-out rows with predict()'s default settings.
# NOTE(review): preds is not referenced by the cells shown here; the accuracy
# below is computed from preds_class.
preds = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, recall_score

In [None]:
# Fraction of held-out rows whose CatBoost class prediction matches Y_test.
accuracy_score(Y_test, preds_class)