In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Shared encoder instance; later cells re-fit it once per categorical column.
lb = LabelEncoder()

import os

# Print the full path of every file available under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

print("Done!")

In [None]:
# Locations of the Titanic CSV files, relative to the notebook's working directory.
train_data_path = "../input/titanic/train.csv"
test_data_path = "../input/titanic/test.csv"

# raw_data is kept as loaded; train_data is an independent (deep) working copy.
raw_data = pd.read_csv(train_data_path)
train_data = raw_data.copy()
test_data = pd.read_csv(test_data_path)

# Test rows first, then train rows, stacked under a fresh 0..n-1 index.
data_all = [test_data, train_data]
data_clean = pd.concat(data_all, ignore_index=True)

# Name of the label column, kept as a one-element list.
Target = ['Survived']
print("Done!")

In [None]:
# Inspect the structure of the training frame via DataFrame.info().
train_data.info()

In [None]:
# Summary statistics for every column (include="all" covers non-numeric columns too).
train_data.describe(include = "all")

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Summary statistics of the Age column, inspected before imputing it below.
train_data['Age'].describe()

In [None]:
# Fill missing values in the training frame and derive the family-size feature.
#
# The results are assigned back to the columns instead of calling
# `train_data.Age.fillna(..., inplace=True)`: that form mutates a Series obtained
# from the frame (chained assignment), which raises a FutureWarning on recent
# pandas and is a silent no-op under Copy-on-Write, leaving the NaNs in place.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
# 'S' is the fixed fill value for a missing port of embarkation.
# NOTE(review): presumably chosen as the most frequent port - confirm.
train_data['Embarked'] = train_data['Embarked'].fillna('S')
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
# Family = siblings/spouses + parents/children aboard.
train_data['Family'] = train_data['SibSp'] + train_data['Parch']
print("Done!")

In [None]:
# Fill missing values in the test frame and derive the family-size feature.
#
# The results are assigned back to the columns instead of calling
# `test_data.Age.fillna(..., inplace=True)`: that form mutates a Series obtained
# from the frame (chained assignment), which raises a FutureWarning on recent
# pandas and is a silent no-op under Copy-on-Write, leaving the NaNs in place.
#
# NOTE(review): the fill values are the test set's own means, as in the original
# cell; consider reusing the training-set statistics instead.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Embarked'] = test_data['Embarked'].fillna('S')
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
# Family = siblings/spouses + parents/children aboard.
test_data['Family'] = test_data['SibSp'] + test_data['Parch']
print("Done!")

In [None]:
# Random preview of 20 training rows; seeded so that Restart & Run All shows the
# same rows every time.
train_data.sample(20, random_state=1)

In [None]:
# Remove the identifier-like columns from the training frame (in place).
# They are not part of the feature list used for modelling further down.
train_data.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [None]:
# One count plot per categorical column, side by side.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(16, 5))

# The column is passed as the keyword `x` together with `data=`. Passing the
# Series positionally is deprecated in seaborn 0.11 and, from 0.12 on, the first
# positional argument is interpreted as `data` rather than `x`.
for ax, column in zip(axes, ['Survived', 'Pclass', 'Sex', 'Family']):
    sns.countplot(x=column, data=train_data, ax=ax)

In [None]:
# Age histograms of survivors vs. non-survivors, one panel per sex.
women = train_data[train_data['Sex'] == 'female']
men = train_data[train_data['Sex'] == 'male']
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# `sns.distplot` is deprecated and removed in later seaborn releases; `histplot`
# draws the same count histogram (the original used kde=False). Bin counts and
# colours are unchanged; alpha keeps the two overlaid histograms readable.
for ax, group, title in zip(axes, (women, men), ('Female', 'Male')):
    sns.histplot(x=group.loc[group['Survived'] == 1, 'Age'], bins=18,
                 color='green', label='Survived', kde=False, alpha=0.4, ax=ax)
    sns.histplot(x=group.loc[group['Survived'] == 0, 'Age'], bins=40,
                 color='red', label='Died', kde=False, alpha=0.4, ax=ax)
    ax.legend()
    ax.set_title(title)

In [None]:
# Binary flag: 1 when a cabin value is recorded, 0 when it is missing.
has_cabin_train = train_data['Cabin'].notnull()
train_data['Cabin_Alloted'] = np.where(has_cabin_train, 1, 0)
# The raw Cabin column is dropped once the flag has been derived.
train_data.drop(columns='Cabin', inplace=True)

In [None]:
# Binary flag: 1 when a cabin value is recorded, 0 when it is missing.
has_cabin_test = test_data['Cabin'].notnull()
test_data['Cabin_Alloted'] = np.where(has_cabin_test, 1, 0)
# The raw Cabin column is dropped once the flag has been derived.
test_data.drop(columns='Cabin', inplace=True)

In [None]:
# Discretise the continuous columns into three quantile bins each, then turn
# every bin into an integer label with the shared LabelEncoder.
for numeric_col in ('Age', 'Fare'):
    bin_col = numeric_col + 'Bin'
    train_data[bin_col] = pd.qcut(x=train_data[numeric_col], q=3)
    train_data[numeric_col + 'Label'] = lb.fit_transform(train_data[bin_col])

# Integer-encode the string-valued categorical columns.
for category_col in ('Embarked', 'Sex'):
    train_data[category_col + 'Label'] = lb.fit_transform(train_data[category_col])

print('Done!')

In [None]:
# Encode the test features with the SAME bins and label mappings as the training
# frame. Previously the test set got its own quantile bins (pd.qcut on test) and
# freshly fitted encoders, so e.g. AgeLabel == 1 covered a different age range in
# test than in the data the model was trained on.

# Recover the training quantile edges; the outer edges are opened to +/-inf so
# that test values outside the training range still land in the first/last bin.
age_edges = pd.qcut(x=train_data['Age'], q=3, retbins=True)[1]
age_edges[0], age_edges[-1] = -np.inf, np.inf
fare_edges = pd.qcut(x=train_data['Fare'], q=3, retbins=True)[1]
fare_edges[0], fare_edges[-1] = -np.inf, np.inf

# Categorical codes follow the ascending bin order (0, 1, 2), which is the same
# order LabelEncoder produces for the sorted training intervals.
test_data['AgeBin'] = pd.cut(test_data['Age'], bins=age_edges)
test_data['AgeLabel'] = test_data['AgeBin'].cat.codes
test_data['FareBin'] = pd.cut(test_data['Fare'], bins=fare_edges)
test_data['FareLabel'] = test_data['FareBin'].cat.codes

# Fit on the training categories and only transform the test values; an unseen
# category then raises instead of being mapped to a mismatching integer.
test_data['EmbarkedLabel'] = lb.fit(train_data['Embarked']).transform(test_data['Embarked'])
test_data['SexLabel'] = lb.fit(train_data['Sex']).transform(test_data['Sex'])

In [None]:
# (rows, columns) of the training frame after the feature-engineering cells above.
train_data.shape

In [None]:
# Feature columns fed to the models below.
train_data_X = ['Pclass','Family','AgeLabel','Cabin_Alloted','FareLabel','EmbarkedLabel','SexLabel']

# One line plot of 'Survived' against each feature.
# x and y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so the
# positional form `lineplot(i, 'Survived', ...)` raises a TypeError there.
for feature in train_data_X:
    sns.lineplot(x=feature, y='Survived', data=train_data)
    plt.show()

In [None]:
from sklearn import ensemble, tree, neighbors
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn import model_selection

# Cross-validated comparison of several classifiers on the engineered features.

# One seed for the CV splitter and every estimator that accepts one, so the
# comparison table is identical on every Restart & Run All.
SEED = 1

MLA = [
    ensemble.AdaBoostClassifier(random_state=SEED),
    ensemble.BaggingClassifier(random_state=SEED),
    ensemble.ExtraTreesClassifier(random_state=SEED),
    ensemble.GradientBoostingClassifier(random_state=SEED),
    ensemble.RandomForestClassifier(random_state=SEED),

    neighbors.KNeighborsClassifier(),

    tree.DecisionTreeClassifier(random_state=SEED),
    tree.ExtraTreeClassifier(random_state=SEED),

    XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=SEED)
]

# 10 random 80/20 splits of the training frame.
cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.2, train_size=.8, random_state=SEED)

MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD', 'MLA Time']

features = train_data[train_data_X]
labels = train_data[Target].values.ravel()  # 1-D label vector

# Collect one record per algorithm and build the frame once; growing an empty
# DataFrame cell by cell via .loc leaves every column with object dtype.
records = []
for alg in MLA:
    cv_results = model_selection.cross_validate(alg, features, labels, cv=cv_split, return_train_score=True)
    records.append({
        'MLA Name': alg.__class__.__name__,
        'MLA Parameters': str(alg.get_params()),
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        # Three standard deviations of the held-out score across the splits.
        'MLA Test Accuracy 3*STD': cv_results['test_score'].std() * 3,
        'MLA Time': cv_results['fit_time'].mean(),
    })

MLA_compare = pd.DataFrame(records, columns=MLA_columns)

# Optional: rank by held-out accuracy.
# MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False, inplace=True)
MLA_compare

In [None]:
# Random preview of 10 test rows; seeded so that Restart & Run All shows the same
# rows every time.
test_data.sample(10, random_state=1)

In [None]:
# Train the final gradient-boosting model on the full training frame and write
# the predictions for the test passengers to a submission CSV.
X_train = train_data[train_data_X]
y_train = train_data[Target].values.ravel()  # flatten the one-column label frame

model = ensemble.GradientBoostingClassifier(learning_rate=0.05, n_estimators=100)
model.fit(X_train, y_train)
predictions = model.predict(test_data[train_data_X])

# Two columns: passenger id and predicted label.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
output.to_csv('./sumbission_GradientBoostingClassifier.csv', index=False)
print("Your submission was successfully saved!")