In [None]:
#importing libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
#Loading Data
# Read the training and test CSV files from the ../input directory into DataFrames.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Preview the first rows of the test frame
test.head()

In [None]:
# (rows, columns) of the training frame
train.shape


In [None]:
# (rows, columns) of the test frame
test.shape

In [None]:

# Column dtypes and non-null counts for the training frame
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame
test.info()

In [None]:
# Number of missing values per column in the training frame
train.isnull().sum()

In [None]:
# Number of missing values per column in the test frame
test.isnull().sum()

# Bar Chart for Categorical Features for Visualization

 - Pclass(Passenger Class)
 - Sex
 - SibSp(Number of siblings and spouse)
 - Parch (Number of Parents and Children)

In [None]:
def bar_chart(feature):
    """Draw a stacked bar chart of value counts for one column, split by outcome.

    Reads the notebook-level ``train`` DataFrame, counts the values of
    ``feature`` separately for rows with ``Survived == 1`` and rows with
    ``Survived == 0``, and plots one stacked bar per outcome.

    Parameters
    ----------
    feature : str
        Name of a column of ``train`` whose value counts are plotted.

    Returns
    -------
    None
        The chart is produced as a side effect.
    """
    survived = train.loc[train['Survived'] == 1, feature].value_counts()
    dead = train.loc[train['Survived'] == 0, feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    ax = df.plot(kind='bar', stacked=True, figsize=(10, 5))
    # Label the figure so it can be read without looking at the calling cell
    ax.set_title(f'{feature} value counts by outcome')
    ax.set_ylabel('Count')

In [None]:
# Outcome counts broken down by the Sex column
bar_chart('Sex')

In [None]:
# Outcome counts broken down by the Pclass column
bar_chart('Pclass')

In [None]:
# Outcome counts broken down by the SibSp column
bar_chart('SibSp')

In [None]:
# Keep references to both frames (they are not concatenated) so that each
# feature-engineering step below can be applied to train and test alike.
train_test_data = [train, test]

# Title = the run of ASCII letters immediately before a period in Name.
# The pattern is a raw string: "\." in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
for data in train_test_data:
    data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Only the last expression of a cell is rendered, so the counts are printed explicitly
print(train.Title.value_counts())
# Number of rows in train where no title could be extracted
train.Title.isnull().sum()

In [None]:
# Only the last expression of a cell is rendered, so the counts are printed explicitly
print(test.Title.value_counts())
# Number of rows in test where no title could be extracted
test.Title.isnull().sum()

In [None]:
#Map each Title
# Mr    -> 0
# Miss  -> 1
# Mrs   -> 2
# every other listed title -> 3
map_title = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3, "Countess": 3,
             "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3, "Mme": 3, "Capt": 3, "Sir": 3}
for data in train_test_data:
    # Series.map leaves NaN for any title that is not a key of map_title
    data['Title'] = data['Title'].map(map_title)
    

In [None]:
# Outcome counts broken down by the encoded Title column
bar_chart("Title")

In [None]:

# Preview train after the Title column has been added and encoded
train.head()

In [None]:
# Preview the last rows of test after the Title column has been added and encoded
test.tail()

In [None]:
# Encode Sex as an integer code: male -> 0, female -> 1
sex_mapping = dict(male=0, female=1)
for frame in train_test_data:
    frame['Sex'] = frame['Sex'].map(sex_mapping)

In [None]:
# Same chart as before, now using the integer-encoded Sex column
bar_chart('Sex')

In [None]:
# Number of missing Age values: train first, then test
for frame in (train, test):
    print(frame["Age"].isna().sum())

In [None]:
#Filling missing age with median age for each title
# The medians are computed once from train, then looked up through each row's
# own Title. Filling test with train.groupby(...).transform(...) directly would
# align on the row index, pairing each test row with an unrelated train row.
title_age_median = train.groupby("Title")["Age"].median()
for data in train_test_data:
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection, which does not update the frame under pandas copy-on-write.
    data["Age"] = data["Age"].fillna(data["Title"].map(title_age_median))

In [None]:
# Number of Age values still missing in train after the fill
train["Age"].isnull().sum()


# Converting numerical age values to categorical bins

child:0 <br>
young:1 <br>
adult:2 <br>
mid-age:3 <br>
senior:4 <br>

In [None]:
# Bin Age into ordinal codes:
#   Age <= 16 -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, Age > 62 -> 4
# pd.cut does this in one pass. A chain of `.loc[...] = 0,` assignments with a
# trailing comma would assign the 1-tuple (0,) instead of the scalar 0, which
# recent pandas versions can reject.
age_bin_edges = [-np.inf, 16, 26, 36, 62, np.inf]
for dataset in train_test_data:
    # labels=False yields the bin position; the default right-closed intervals
    # match the <= comparisons above. astype(float) keeps Age a float column.
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_bin_edges, labels=False).astype(float)

In [None]:
# Preview train after Age has been replaced by its bin code
train.head()

In [None]:
# Outcome counts broken down by the binned Age column
bar_chart("Age")

# Embarked

## Filling missing values

In [None]:
# Only the last expression of a cell is rendered, so the missing-value count is printed explicitly
print(train["Embarked"].isnull().sum())
# Embarked value counts within each Pclass value, drawn as one stacked bar per class
Pclass1 = train[train['Pclass']==1]['Embarked'].value_counts()
Pclass2 = train[train['Pclass']==2]['Embarked'].value_counts()
Pclass3 = train[train['Pclass']==3]['Embarked'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index  =['1st Class' , '2nd Class', '3rd Class']
df.plot(kind = 'bar',stacked = True,figsize = (10,5))


# Fill missing Embarked values with S

In [None]:
# Replace missing Embarked entries with 'S' in both frames
for frame in train_test_data:
    embarked_filled = frame['Embarked'].fillna(value='S')
    frame['Embarked'] = embarked_filled

In [None]:
# Number of Embarked values still missing in train after the fill
train['Embarked'].isnull().sum()

In [None]:
# Encode each Embarked code as an integer: S -> 0, C -> 1, Q -> 2
embark_mapping = dict(S=0, C=1, Q=2)
for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].map(embark_mapping)

# Fare

In [None]:
#Fill Out missing Fare value with median fare for each Passenger Class
# Each frame is filled from its own per-Pclass medians. The result is assigned
# back to the column: fillna(inplace=True) on a column selection is a chained
# assignment that does not update the frame under pandas copy-on-write.
for data in train_test_data:
    data["Fare"] = data["Fare"].fillna(data.groupby("Pclass")["Fare"].transform("median"))
train.head()

In [None]:
# Preview train after the Fare fill
train.head()

In [None]:
# Preview test after the Fare fill
test.head()

In [None]:
# Remove the columns that are not fed to the models.
# PassengerId is removed from train only; test keeps it.
features_drop = ['Name', 'Ticket', 'Cabin']
train = train.drop(columns=features_drop + ['PassengerId'])
test = test.drop(columns=features_drop)

In [None]:
# Separate the Survived label from the feature columns
target = train['Survived']
train_data = train.drop(columns='Survived')
train.head()

In [None]:
# Preview test after the unused columns have been dropped
test.head()

# Modelling 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Cross Validation (K-fold)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10-fold splitter with shuffling; the fixed random_state makes the fold assignment repeatable
k_fold = KFold(n_splits = 10, shuffle = True , random_state = 0)

# KNN

In [None]:
# K-nearest-neighbours with 10 neighbours, scored by accuracy on every fold of k_fold
knn = KNeighborsClassifier(n_neighbors=10)
score = cross_val_score(
    estimator=knn,
    X=train_data,
    y=target,
    scoring='accuracy',
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [None]:
# Mean KNN accuracy across the folds, as a percentage rounded to 2 decimals
round(score.mean() * 100, 2)

# Logistic Regression

In [None]:
# Cross-validate the logistic regression model.
# The estimator passed to cross_val_score must be logreg; passing knn here
# would only repeat the KNN scores.
logreg = LogisticRegression()
score = cross_val_score(logreg, train_data, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print(score)

In [None]:
# Mean logistic-regression accuracy across the folds, as a percentage rounded to 2 decimals
round(score.mean() * 100, 2)

# Decision Tree

In [None]:
# Cross-validate the decision tree.
# The estimator passed to cross_val_score must be DecTree; passing knn here
# would only repeat the KNN scores. random_state is fixed so the reported
# accuracy is repeatable.
DecTree = DecisionTreeClassifier(random_state=0)
score = cross_val_score(DecTree, train_data, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print(score)

In [None]:
# Mean accuracy of the most recent cross-validation run, as a percentage rounded to 2 decimals
round(score.mean() * 100, 2)

# Random Forest

In [None]:
# Random forest with 12 trees, scored by accuracy on every fold of k_fold.
# random_state is fixed so the reported accuracy is repeatable across runs.
rand = RandomForestClassifier(n_estimators=12, random_state=0)
score = cross_val_score(rand, train_data, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print(score)

In [None]:
# Mean accuracy of the most recent cross-validation run, as a percentage rounded to 2 decimals
round(score.mean() * 100, 2)

# Naive Bayes

In [None]:
# Gaussian Naive Bayes, scored by accuracy on every fold of k_fold
NB_clf = GaussianNB()
score = cross_val_score(
    estimator=NB_clf,
    X=train_data,
    y=target,
    scoring='accuracy',
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [None]:
# Mean accuracy of the most recent cross-validation run, as a percentage rounded to 2 decimals
round(score.mean() * 100, 2)

# SVM

In [None]:
# Support vector classifier with default settings, scored by accuracy on every fold of k_fold
clf = SVC()
score = cross_val_score(
    estimator=clf,
    X=train_data,
    y=target,
    scoring='accuracy',
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [None]:
# Mean accuracy of the most recent cross-validation run, as a percentage rounded to 2 decimals
round(score.mean() * 100, 2)

# Testing 

In [None]:
# Fit Gaussian Naive Bayes on all training rows, then predict the test rows.
# PassengerId is not a model input, so it is left out of the prediction frame.
test_data = test.drop(columns="PassengerId").copy()
clf = GaussianNB()
clf.fit(train_data, target)
prediction = clf.predict(test_data)

In [None]:
# Two-column frame: the PassengerId of each test row next to its predicted Survived value
submission = test[["PassengerId"]].assign(Survived=prediction)

In [None]:
# Write the predictions to Submission.csv without the index column, then preview the first rows
submission.to_csv('Submission.csv',index=False)
submission.head()