In [None]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# algorithms
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB

In [None]:
# Read the test and training CSV files (paths are relative to the working directory).
test_df, train_df = (pd.read_csv(csv_name) for csv_name in ("test.csv", "train.csv"))

In [None]:
# Print pandas' summary of the training DataFrame.
train_df.info()

In [None]:
# Show pandas' descriptive statistics for the training DataFrame.
train_df.describe()

In [None]:
# Preview the first rows of the freshly loaded training data.
train_df.head()

In [None]:
# Remove the Ticket and Name columns from both frames; no later cell reads them.
unused_columns = ['Ticket', 'Name']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# List the columns left in the test frame after dropping Ticket and Name.
test_df.columns.values

In [None]:
# Preview the training frame after dropping Ticket and Name.
train_df.head()

In [None]:
# Missing-value report for the training data: per-column null count and null percentage,
# each sorted from most to least missing.
null_mask = train_df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
temp = null_counts / null_mask.count() * 100
percentage = temp.round(1).sort_values(ascending=False)

result_table = pd.concat([total, percentage], axis=1, keys=['Total', '%'])
result_table.head(10)

In [None]:
# Remove the Cabin column from both the training and the test frame.
train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])

In [None]:
# List the columns left in the test frame after dropping Cabin.
test_df.columns.values

In [None]:
# Preview the training frame after dropping Cabin.
train_df.head()

In [None]:
# Fill missing ports of embarkation with 'S' in both frames.
common_value = 'S'

for frame in (train_df, test_df):
    embarked_filled = frame['Embarked'].fillna(common_value)
    frame['Embarked'] = embarked_filled

In [None]:
# Grouped bar chart: number of survivors and non-survivors per port of embarkation.
ind = np.arange(3)  # the x locations for the groups
width = 0.35  # the width of the bars

southampton = train_df[train_df['Embarked']=='S']
cherbourg = train_df[train_df['Embarked']=='C']
queenstown = train_df[train_df['Embarked']=='Q']

# Count rows per port, in the same order as the x tick labels below.
ports = (southampton, cherbourg, queenstown)
survived_values = tuple(len(port[port['Survived']==1]) for port in ports)
not_survived_values = tuple(len(port[port['Survived']==0]) for port in ports)

fig, ax = plt.subplots()
# No yerr argument: it expects error magnitudes, and these are plain counts without error bars.
rects1 = ax.bar(ind - width/2, survived_values, width,
                color='SkyBlue', label='Survived')
rects2 = ax.bar(ind + width/2, not_survived_values, width,
                color='IndianRed', label='Not Survived')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Count')
ax.set_title('Survived or not based on port of embarkation')
ax.set_xticks(ind)
ax.set_xticklabels(('Southampton', 'Cherbourg', 'Queenstown'))
ax.legend()


def autolabel(rects, xpos='center'):
    """Write each bar's height as a text label just above the bar.

    The labels are drawn on the module-level ``ax``.

    Parameters
    ----------
    rects : iterable of bar patches
        For example the container returned by ``ax.bar``.
    xpos : {'center', 'left', 'right'}
        Case-insensitive. Where the label sits relative to the bar's centre:
        'left' puts it to the left, 'right' to the right.

    Raises
    ------
    KeyError
        If ``xpos`` is not one of the three accepted values.
    """
    xpos = xpos.lower()  # normalize the case of the parameter
    # The text alignment is the mirror of xpos: a label placed to the right
    # of the centre is anchored by its left edge, and vice versa.
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
                '{}'.format(height), ha=ha[xpos], va='bottom')


autolabel(rects1, "left")
autolabel(rects2, "right")

plt.show()

In [None]:
# Impute missing ages with random integers drawn from [mean - std, mean + std),
# computed separately for each frame, then store Age as an integer column.

# Fixed seed so that Restart & Run All reproduces the same imputed ages.
AGE_IMPUTATION_SEED = 42
age_rng = np.random.RandomState(AGE_IMPUTATION_SEED)

data = [train_df, test_df]

for dataset in data:
    mean = dataset["Age"].mean()
    stdev = dataset["Age"].std()
    missing_age = dataset["Age"].isnull()
    total_nulls = missing_age.sum()
    # randint is documented for integer bounds, so truncate the float
    # statistics explicitly instead of relying on implicit conversion.
    low, high = int(mean - stdev), int(mean + stdev)
    random_age = age_rng.randint(low, high, size=total_nulls)
    temp = dataset["Age"].copy()
    temp[missing_age] = random_age
    dataset["Age"] = temp.astype(int)

In [None]:
# Print the training frame summary again after the Embarked and Age imputation.
train_df.info()

In [None]:
# Drop PassengerId from the training frame only; test_df keeps it because the
# submission file written at the end needs that column.
train_df = train_df.drop(columns=['PassengerId'])

In [None]:
# Grouped bar chart: number of survivors and non-survivors per ticket class.
ind = np.arange(3)  # the x locations for the groups
width = 0.35  # the width of the bars

one = train_df[train_df['Pclass']==1]
two = train_df[train_df['Pclass']==2]
three = train_df[train_df['Pclass']==3]

# Count rows per class, in the same order as the x tick labels below.
classes = (one, two, three)
survived_values = tuple(len(cls[cls['Survived']==1]) for cls in classes)
not_survived_values = tuple(len(cls[cls['Survived']==0]) for cls in classes)

fig, ax = plt.subplots()
# No yerr argument: it expects error magnitudes, and these are plain counts without error bars.
rects1 = ax.bar(ind - width/2, survived_values, width,
                color='SkyBlue', label='Survived')
rects2 = ax.bar(ind + width/2, not_survived_values, width,
                color='IndianRed', label='Not Survived')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Count')
ax.set_title('Survived or not based on ticket class')
ax.set_xticks(ind)
ax.set_xticklabels(('1', '2', '3'))
ax.legend()


def autolabel(rects, xpos='center'):
    """Write each bar's height as a text label just above the bar.

    The labels are drawn on the module-level ``ax``.

    Parameters
    ----------
    rects : iterable of bar patches
        For example the container returned by ``ax.bar``.
    xpos : {'center', 'left', 'right'}
        Case-insensitive. Where the label sits relative to the bar's centre:
        'left' puts it to the left, 'right' to the right.

    Raises
    ------
    KeyError
        If ``xpos`` is not one of the three accepted values.
    """
    xpos = xpos.lower()  # normalize the case of the parameter
    # The text alignment is the mirror of xpos: a label placed to the right
    # of the centre is anchored by its left edge, and vice versa.
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
                '{}'.format(height), ha=ha[xpos], va='bottom')


autolabel(rects1, "left")
autolabel(rects2, "right")

plt.show()

In [None]:
# Line plot of passenger counts per distinct age value, one line per outcome.
not_survived = train_df[train_df['Survived']==0]
survived = train_df[train_df['Survived']==1]

fig, ax = plt.subplots()
for group, line_color, line_label in (
    (not_survived, 'IndianRed', 'Not Survived'),
    (survived, 'SkyBlue', 'Survived'),
):
    ages, counts = np.unique(group['Age'].values, return_counts=True)
    ax.plot(ages, counts, color=line_color, label=line_label)

ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Survived or not based on age')
ax.grid()
ax.legend()

# Remember the x-limits, add the extra ticks, then put the limits back.
x_limits = ax.get_xlim()
extra_ticks = [5, 15, 25, 35, 45, 55, 65, 75]
ax.set_xticks(list(ax.get_xticks()) + extra_ticks)
ax.set_xlim(x_limits)
plt.show()

In [None]:
# Replace every age with the index (0-7) of its age band.
# Band 0 is age <= 14, band 7 is age > 55; each band in between covers
# (previous edge, edge], i.e. the upper edge is inclusive.
age_band_edges = [14, 20, 27, 32, 37, 42, 55]

for frame in (train_df, test_df):
    ages = frame['Age'].astype(int).to_numpy()
    # right=True gives edges[i-1] < age <= edges[i]  ->  band i
    frame['Age'] = np.digitize(ages, age_band_edges, right=True).astype(int)

In [None]:
# Encode the port of embarkation as an integer code in both frames.
# NOTE: run this cell once per kernel session; mapping already-encoded values
# again would turn them into NaN because 0/1/2 are not keys of the dict.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_codes)

In [None]:
# Encode Sex as an integer in both frames: male -> 0, female -> 1.
genders = {"male": 0, "female": 1}

for frame in (train_df, test_df):
    sex_codes = frame['Sex'].map(genders)
    frame['Sex'] = sex_codes

In [None]:
# Treat a missing fare as 0 and store Fare as an integer column in both frames.
for frame in (train_df, test_df):
    frame['Fare'] = frame['Fare'].fillna(0).astype(int)

In [None]:
# Line plot of passenger counts per distinct fare value for fares up to 50,
# one line per outcome.
not_survived = train_df[train_df['Survived']==0]
survived = train_df[train_df['Survived']==1]

fig, ax = plt.subplots()
for group, line_color, line_label in (
    (not_survived, 'IndianRed', 'Not Survived'),
    (survived, 'SkyBlue', 'Survived'),
):
    in_range = group['Fare'] <= 50
    fares, counts = np.unique(group.loc[in_range, 'Fare'], return_counts=True)
    ax.plot(fares, counts, color=line_color, label=line_label)

ax.set_xlabel('Fare')
ax.set_ylabel('Count')
ax.set_title('Survived or not based on fare')
ax.grid()
ax.legend()
plt.show()

In [None]:
# Line plot of passenger counts per distinct fare value for fares in (50, 100],
# one line per outcome.
not_survived = train_df[train_df['Survived']==0]
survived = train_df[train_df['Survived']==1]

fig, ax = plt.subplots()
for group, line_color, line_label in (
    (not_survived, 'IndianRed', 'Not Survived'),
    (survived, 'SkyBlue', 'Survived'),
):
    in_range = (group['Fare'] > 50) & (group['Fare'] <= 100)
    fares, counts = np.unique(group.loc[in_range, 'Fare'], return_counts=True)
    ax.plot(fares, counts, color=line_color, label=line_label)

ax.set_xlabel('Fare')
ax.set_ylabel('Count')
ax.set_title('Survived or not based on fare')
ax.grid()
ax.legend()
plt.show()

In [None]:
# Replace every fare with the index (0-7) of its fare band.
# Band 0 is fare <= 5, band 7 is fare > 100; each band in between covers
# (previous edge, edge], i.e. the upper edge is inclusive.
fare_band_edges = [5, 10, 17, 25, 28, 65, 100]

for frame in (train_df, test_df):
    fares = frame['Fare'].to_numpy()
    # right=True gives edges[i-1] < fare <= edges[i]  ->  band i
    frame['Fare'] = np.digitize(fares, fare_band_edges, right=True).astype(int)

In [None]:
# Preview the first ten training rows after encoding and binning.
train_df.head(10)

In [None]:
# Split the prepared frames into model inputs and the training target.
x_train = train_df.drop(columns="Survived")
y_train = train_df["Survived"]

# PassengerId is an identifier, not a feature. The Naive Bayes cell below writes
# its predictions into test_df['Survived']; dropping that column when present
# keeps x_test limited to the training features if this cell is re-run afterwards.
x_test = (
    test_df
    .drop(columns="PassengerId")
    .drop(columns="Survived", errors="ignore")
    .copy()
)

In [None]:
# Fit Gaussian Naive Bayes on the training data and predict the test set.
gaussian_naive_bayes = GaussianNB(var_smoothing=1e-4)
gaussian_naive_bayes.fit(x_train, y_train)

# Store the predictions on test_df; the submission file is written from this column.
y_predict_gnb = gaussian_naive_bayes.predict(x_test)
test_df['Survived'] = y_predict_gnb

# Accuracy in percent, measured on the training data itself.
gnb_train_accuracy = gaussian_naive_bayes.score(x_train, y_train)
acc_gaussian = round(gnb_train_accuracy * 100, 2)

In [None]:
# Fit a Perceptron on the training data and predict the test set.
perceptron = Perceptron(max_iter=250, tol=1e-5)
perceptron.fit(x_train, y_train)
y_predict_perceptron = perceptron.predict(x_test)

# Accuracy in percent, measured on the training data itself.
perceptron_train_accuracy = perceptron.score(x_train, y_train)
acc_perceptron = round(perceptron_train_accuracy * 100, 2)

In [None]:
# Table of both models' training accuracies, best score first, indexed by score.
model_names = ['Naive Bayes', 'Perceptron']
model_scores = [acc_gaussian, acc_perceptron]
results = pd.DataFrame({'Model': model_names, 'Score': model_scores})

result_df = (
    results
    .sort_values(by='Score', ascending=False)
    .set_index('Score')
)
result_df.head()

In [None]:
# Write the submission file: one row per test passenger with the predicted label.
submission_columns = ['PassengerId', 'Survived']
test_df.to_csv('result.csv', columns=submission_columns, index=False)