In [34]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [35]:
# Load the Titanic train and test CSV files from the Kaggle input directory.
training, testing = (
    pd.read_csv('/kaggle/input/titanic/train.csv'),
    pd.read_csv('/kaggle/input/titanic/test.csv'),
)

In [36]:
# Print a summary of the training frame: columns, non-null counts and dtypes.
training.info()

In [37]:
# Print a summary of the test frame: columns, non-null counts and dtypes.
testing.info()

In [38]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the training frame.
training.describe()

In [39]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the test frame.
testing.describe()

In [40]:
# Show 15 random rows; the fixed seed keeps the displayed sample identical
# every time the notebook is re-run from a fresh kernel.
training.sample(15, random_state=0)

In [41]:
# Numeric columns used for the histograms below.
df_num = training[['Age', 'SibSp', 'Parch', 'Fare']]
# Ticket was not considered
# Target plus categorical columns. Take an explicit copy: the Cabin rewrite
# below must edit an independent frame, not a slice of `training`
# (assigning into the slice raises SettingWithCopyWarning).
df_cat = training[['Survived', 'Pclass', 'Sex', 'Cabin', 'Embarked']].copy()
# Keep only the first character of each cabin value; a missing value (NaN)
# becomes 'n' because str(nan) == 'nan'.
df_cat['Cabin'] = df_cat['Cabin'].apply(lambda x: str(x)[0])

In [42]:
# For every numeric column draw two histograms: one restricted to rows with
# Survived == 1 and one restricted to rows with Survived == 0.
for column in df_num:
    for survived_flag, title_suffix in ((1, " Survived"), (0, " Dead")):
        subset = df_num[training['Survived'] == survived_flag]
        plt.hist(subset[column])
        plt.title(column + title_suffix)
        plt.show()

In [43]:
# Count plots of the Survived column, split (via hue) by each column of df_cat in turn.
sns.set_style("whitegrid")
for column in df_cat:
    sns.countplot(data=df_cat, x="Survived", hue=column)
    plt.title(column)
    plt.show()

In [44]:
def preprocess(data):
    """Turn a raw Titanic frame into the numeric feature frame used for modelling.

    The input frame is not modified. Steps:
      * drop PassengerId, Name, Ticket and Fare;
      * Cabin    -> 0 when the value is missing or the literal 'N', else 1;
      * Age      -> missing values replaced by the mean Age of ``data``;
      * Sex      -> 1 for "male", 0 for "female" (any other value maps to NaN);
      * Embarked -> missing values filled with 'S', then one-hot encoded against
        the fixed level list C/Q/S with the first level (C) dropped.

    Encoding against a fixed level list guarantees the same output columns
    ('Q' and 'S') for every input, e.g. train and test, even when one of the
    ports does not occur in ``data``. A value outside C/Q/S is encoded as all
    zeros.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least PassengerId, Name, Ticket, Fare, Cabin, Age,
        Sex and Embarked.

    Returns
    -------
    pandas.DataFrame
        New frame whose first two columns are the 'Q' and 'S' indicators,
        followed by the remaining (encoded) columns in their original order.
    """
    processed = data.drop(["PassengerId", "Name", "Ticket", "Fare"], axis=1)
    processed['Cabin'] = processed['Cabin'].fillna('N').apply(lambda x: 0 if str(x) == 'N' else 1)
    processed['Age'] = processed["Age"].fillna(processed['Age'].mean())
    processed["Sex"] = processed.Sex.map({"male": 1, "female": 0})

    # Fixed categories: the dummy columns no longer depend on which ports happen
    # to be present in this particular frame.
    embarked = processed['Embarked'].fillna('S').astype(
        pd.CategoricalDtype(categories=['C', 'Q', 'S'])
    )
    dummy = pd.get_dummies(embarked, drop_first=True)
    # Use plain string labels so the concatenated column index is an ordinary Index.
    dummy.columns = list(dummy.columns)

    processed = processed.drop('Embarked', axis=1)
    processed = pd.concat([dummy, processed], axis=1)
    return processed

In [45]:
# Run the same preprocessing over the training and the test frame.
p_train, p_test = preprocess(training), preprocess(testing)

In [46]:
# Separate the feature columns (x) from the Survived target (y).
x = p_train.drop(columns="Survived")
y = p_train["Survived"]

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# K-fold cross-validation of Gaussian Naive Bayes on the training features.
# shuffle is left at its default, so the folds are the same on every run.
n_splits = 10
kf = KFold(n_splits=n_splits, random_state=None)

precisions = []
recalls = []
f1s = []

for train_index, test_index in kf.split(x):
    # kf.split yields positional row numbers, so both x and y must be sliced
    # with .iloc; label-based y[train_index] is only correct while the index
    # happens to be the default 0..n-1 range.
    x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = GaussianNB()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    precisions.append(precision_score(y_test, y_pred, labels=[1]))
    recalls.append(recall_score(y_test, y_pred, labels=[1]))
    f1s.append(f1_score(y_test, y_pred, labels=[1]))

# Average over the folds actually evaluated instead of a hard-coded 10.
precision = sum(precisions) / len(precisions)
recall = sum(recalls) / len(recalls)
f1 = sum(f1s) / len(f1s)

print(f'Precision - {precision}, Recall - {recall}, F1 - {f1}')

In [48]:
# Fit Gaussian Naive Bayes on all of x / y, then predict labels for p_test.
model_full = GaussianNB().fit(x, y)
y_pred_full = model_full.predict(p_test)

In [49]:
# Build the submission from the IDs of the rows that were actually predicted:
# y_pred_full comes from preprocess(testing), which keeps the row order of
# `testing`. This avoids borrowing the ID column from a separate sample file
# (read through a relative path) and relying on its rows being in the same order.
sub = pd.DataFrame({"PassengerId": testing["PassengerId"], "Survived": y_pred_full})

In [50]:
# Write the submission frame to submission2.csv in the current directory, without the index column.
sub.to_csv("submission2.csv",index=False)