# Problem -
### Titanic sank in the North Atlantic Ocean in 1912, after striking an iceberg.
## Given details of the passengers who were aboard the Titanic, predict each passenger's chance of survival.
# Dataset-

## Titanic.csv 
### Details of 891 passengers.


In [55]:
# Load CSV file
import pandas as pd

# Read the passenger records into a DataFrame.
df=pd.read_csv("titanic.csv")
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


## Drop irrelevant features

In [56]:
# Remove, in place, the columns that are not used as model features.
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
# Preview the remaining columns.
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0



### Add male and female columns based on sex column

In [57]:
# Encode Sex as indicator columns (one column per distinct value of Sex).
dummies = pd.get_dummies(df.Sex)
# Append the indicator columns to the right of the existing columns.
df = pd.concat([df,dummies],axis='columns')
df.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,Survived,female,male
0,3,male,22.0,7.25,0,0,1
1,1,female,38.0,71.2833,1,1,0
2,3,female,26.0,7.925,1,1,0


## Drop sex and male columns as female column can represent gender of the passenger

In [58]:
# `female` alone encodes gender, so drop the original Sex column and the
# redundant `male` indicator.
df.drop(['Sex','male'],axis='columns',inplace=True)
# Reorder the columns so that the class label (Survived) comes last.
columns_titles = ["Pclass","Age","Fare","female","Survived"]
df=df.reindex(columns=columns_titles)
df.head(3)

Unnamed: 0,Pclass,Age,Fare,female,Survived
0,3,22.0,7.25,0,0
1,1,38.0,71.2833,1,1
2,3,26.0,7.925,1,1


## Replace NaN values with the mean of the column that contains them

In [59]:
# List the columns that contain at least one NaN value.
df.columns[df.isna().any()]

Index(['Age'], dtype='object')

In [60]:
# Inspect the first 10 Age values to see what the missing entries look like.
df.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [61]:
# Replace every missing Age with the mean of the Age column.
df.Age = df.Age.fillna(df.Age.mean())
# Preview the first six rows to check the filled values.
df.head(6)

Unnamed: 0,Pclass,Age,Fare,female,Survived
0,3,22.0,7.25,0,0
1,1,38.0,71.2833,1,1
2,3,26.0,7.925,1,1
3,1,35.0,53.1,1,1
4,3,35.0,8.05,0,0
5,3,29.699118,8.4583,0,0


## Naive Bayes using sklearn library

### Separate input and desired output

In [62]:
# Features: every column except Survived.
inputs = df.drop('Survived',axis='columns')
# Label: the Survived column.
target = df.Survived

## Split dataset into training and testing samples

In [63]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing.
# NOTE(review): no random_state is passed, so the split (and every result
# derived from it) presumably changes from run to run; confirm whether
# reproducibility is wanted here.
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.3)

In [64]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier created with its default settings.
model = GaussianNB()

In [65]:
# Fit the classifier on the training split.
model.fit(X_train,y_train)

GaussianNB()

In [66]:
# Score the fitted classifier on the held-out test split.
model.score(X_test,y_test)

0.7761194029850746

In [67]:
# Show the first 10 feature rows of the test split.
X_test[0:10]

Unnamed: 0,Pclass,Age,Fare,female
550,1,17.0,110.8833,0
304,3,29.699118,8.05,0
192,3,19.0,7.8542,1
386,3,1.0,46.9,0
69,3,26.0,8.6625,0
41,2,27.0,21.0,1
411,3,29.699118,6.8583,0
195,1,58.0,146.5208,1
823,3,27.0,12.475,1
495,3,29.699118,14.4583,0


In [68]:
# Show the true labels for the same 10 test rows.
y_test[0:10]

550    1
304    0
192    1
386    0
69     0
41     0
411    0
195    1
823    1
495    0
Name: Survived, dtype: int64

In [69]:
# Predicted labels for the first 10 test rows, for comparison with y_test[0:10].
model.predict(X_test[0:10])

array([1, 0, 1, 0, 0, 1, 0, 1, 1, 0], dtype=int64)

In [70]:
# Per-class probabilities for the first 10 test rows (one column per class).
model.predict_proba(X_test[:10])

array([[1.06258079e-01, 8.93741921e-01],
       [9.70711757e-01, 2.92882432e-02],
       [4.32560990e-01, 5.67439010e-01],
       [9.28202250e-01, 7.17977495e-02],
       [9.69989930e-01, 3.00100700e-02],
       [2.68540834e-01, 7.31459166e-01],
       [9.70521934e-01, 2.94780658e-02],
       [1.43387117e-04, 9.99856613e-01],
       [4.61559799e-01, 5.38440201e-01],
       [9.71216811e-01, 2.87831890e-02]])

## Now let's implement Naive Bayes algorithm.


## Step 1: Separate By Class
#### We will need to calculate the probability of data by the class they belong to.
#### This means that we will first need to separate our training data by class.
#### We will create a dictionary object where each key is the class value and then add a list of all the records as the value in the dictionary.
#### The function below assumes that the last column in each row is the class value.

In [76]:
# Group the rows of a dataset by their class value, returns a dictionary
def separate_by_class(dataset):
    """Return a dict mapping each class value to the list of rows carrying it.

    The class value of a row is its last element. Rows keep their original
    relative order inside each group, and the row objects themselves are
    stored (they are not copied).
    """
    groups = {}
    for record in dataset:
        # Create the group on first sight of a class value, then add the row.
        groups.setdefault(record[-1], []).append(record)
    return groups

In [80]:
'''Example
[[3.393533211,2.331273381,0],
[3.110073483,1.781539638,0],
[1.343808831,3.368360954,0],
[3.582294042,4.67917911,0],
[2.280362439,2.866990263,0],
[7.423436942,4.696522875,1],
[5.745051997,3.533989803,1],
[9.172168622,2.511101045,1],
[7.792783481,3.424088941,1],
[7.939820817,0.791637231,1]]'''
# Toy dataset: each row is two feature values followed by the class label.
dataset = [[3.393533211,2.331273381,0],[3.110073483,1.781539638,0],[1.343808831,3.368360954,0],[3.582294042,4.67917911,0],[2.280362439,2.866990263,0],[7.423436942,4.696522875,1],[5.745051997,3.533989803,1],[9.172168622,2.511101045,1],[7.792783481,3.424088941,1],[7.939820817,0.791637231,1]]
separated = separate_by_class(dataset)
# Print each class label followed by the rows grouped under it.
for label in separated:
    print(label)
    for row in separated[label]:
        print(row)

0
[3.393533211, 2.331273381, 0]
[3.110073483, 1.781539638, 0]
[1.343808831, 3.368360954, 0]
[3.582294042, 4.67917911, 0]
[2.280362439, 2.866990263, 0]
1
[7.423436942, 4.696522875, 1]
[5.745051997, 3.533989803, 1]
[9.172168622, 2.511101045, 1]
[7.792783481, 3.424088941, 1]
[7.939820817, 0.791637231, 1]


## Step 2: Summarize Dataset

#### We need two statistics from a given set of data.
#### 1) mean = sum(x) / count(x)
#### 2) standard deviation = sqrt( (sum for i = 1..N of (x_i – mean(x))^2) / (N - 1) )

In [81]:
from math import sqrt

# Arithmetic mean of a list of numbers
def mean(numbers):
    """Return the sum of ``numbers`` divided by their count, as a float.

    Raises ZeroDivisionError when ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

# Sample standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation of ``numbers`` (N - 1 denominator).

    Raises ZeroDivisionError when ``numbers`` has fewer than two values.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

#### We require the mean and standard deviation statistics to be calculated for each input attribute or each column of our data.

In [82]:
# Calculate the mean, stdev and count for each feature column in a dataset
def summarize_dataset(dataset):
    """Return one ``(mean, stdev, count)`` tuple per feature column.

    ``dataset`` is a list of rows whose last element is the class value.
    Rows are transposed into columns, and the last column (the class value)
    is excluded.

    The class column is removed *before* any statistic is computed, so class
    values do not have to be numeric, and an empty dataset yields an empty
    list instead of failing.
    """
    columns = list(zip(*dataset))
    # The last column holds the class value, not a feature.
    feature_columns = columns[:-1]
    return [(mean(column), stdev(column), len(column)) for column in feature_columns]

In [84]:
#Example
dataset = [[3.393533211,2.331273381,0],
    [3.110073483,1.781539638,0],
    [1.343808831,3.368360954,0],
    [3.582294042,4.67917911,0],
    [2.280362439,2.866990263,0],
    [7.423436942,4.696522875,1],
    [5.745051997,3.533989803,1],
    [9.172168622,2.511101045,1],
    [7.792783481,3.424088941,1],
    [7.939820817,0.791637231,1]]
summary = summarize_dataset(dataset)
print(summary)

[(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]


## Step 3: Summarize Data By Class
#### Summarize the columns in the dataset organized by class values.
#### The dataset is first split by class, then statistics are calculated on each subset. 
#### The results in the form of a list of tuples of statistics are then stored in a dictionary by their class value.

In [86]:
# Split dataset by class then calculate statistics for each column
def summarize_by_class(dataset):
    """Return a dict mapping each class value to its column summaries.

    The rows are grouped with ``separate_by_class`` and each group is passed
    to ``summarize_dataset``; the value stored for a class is whatever
    ``summarize_dataset`` returns for that class's rows.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

In [87]:
# Example
dataset = [[3.393533211,2.331273381,0],
    [3.110073483,1.781539638,0],
    [1.343808831,3.368360954,0],
    [3.582294042,4.67917911,0],
    [2.280362439,2.866990263,0],
    [7.423436942,4.696522875,1],
    [5.745051997,3.533989803,1],
    [9.172168622,2.511101045,1],
    [7.792783481,3.424088941,1],
    [7.939820817,0.791637231,1]]
summary = summarize_by_class(dataset)
for label in summary:
    print(label)
    for row in summary[label]:
        print(row)

0
(2.7420144012, 0.9265683289298018, 5)
(3.0054686692, 1.1073295894898725, 5)
1
(7.6146523718, 1.2344321550313704, 5)
(2.9914679790000003, 1.4541931384601618, 5)


## Step 4: Gaussian Probability Density Function

#### Calculating the probability or likelihood of observing a given real-value like X1 is difficult.
#### We will assume that X1 values are drawn from a  Gaussian distribution.
#### Gaussian Probability Distribution Function-
     f(x) = (1 / (sqrt(2 * PI) * sigma)) * exp(-((x-mean)^2 / (2 * sigma^2)))
#### Where sigma is the standard deviation for x, mean is the mean for x and PI is the value of pi.

In [88]:
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    """Return the Gaussian probability density at ``x``.

    Args:
        x: The value at which to evaluate the density.
        mean: Mean of the Gaussian distribution.
        stdev: Standard deviation of the Gaussian distribution.

    Returns:
        ``(1 / (sqrt(2 * pi) * stdev)) * exp(-((x - mean)**2 / (2 * stdev**2)))``.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    # Imported here so the function works as soon as it is defined, without
    # depending on imports made further down the file.
    from math import exp, pi, sqrt

    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

## Step 5: Class Probabilities

#### Probabilities are calculated separately for each class.
#### Calculate the probability that new piece of data belongs to the first class, then calculate probabilities that it belongs to the second class.
#### P(class|data) ∝ P(data|class) * P(class)   (the normalizing term P(data) is the same for every class, so it is left out)
####  For example, when we have 2 input variables, the calculation of the probability that a row belongs to the first class 0 can be calculated as:

    P(class=0|X1,X2) = P(X1|class=0) * P(X2|class=0) * P(class=0)

In [89]:
# Score each class for a given row: class prior times the per-column Gaussian densities
def calculate_class_probabilities(summaries, row):
    """Return a dict mapping each class value to its score for ``row``.

    ``summaries`` maps a class value to a list of ``(mean, stdev, count)``
    tuples, one per column. A class's score starts at its prior (the count
    stored in its first tuple divided by the sum of those counts over all
    classes) and is multiplied by ``calculate_probability`` for each column,
    pairing the i-th tuple with ``row[i]``.

    The scores are not normalized, so they do not sum to 1.
    """
    total_rows = sum(stats[0][2] for stats in summaries.values())
    scores = {}
    for label, stats in summaries.items():
        # Prior: share of all counted rows that belong to this class.
        score = stats[0][2] / float(total_rows)
        for position, (mu, sigma, _count) in enumerate(stats):
            score *= calculate_probability(row[position], mu, sigma)
        scores[label] = score
    return scores

In [90]:
# Predict the class for a given row
def predict(summaries, row):
    """Return the class value with the highest score for ``row``.

    Scores come from ``calculate_class_probabilities``. A later class only
    replaces the current choice when its score is strictly greater. Returns
    None when there are no classes to choose from.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner, top_score = None, -1
    for label, score in scores.items():
        # Skip candidates that do not beat the current best.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

## Naive Bayes Algorithm

In [91]:
# Naive Bayes Algorithm
def naive_bayes(train, test):
    """Summarize ``train`` by class and return one prediction per row of ``test``.

    The per-class summaries are computed once from ``train``; each row of
    ``test`` is then passed to ``predict`` and the results are returned as a
    list in the same order as ``test``.
    """
    model = summarize_by_class(train)
    return [predict(model, row) for row in test]

## Naive Bayes - Titanic dataset

### Load titanic.csv file

In [92]:
# Load a CSV file
def load_csv(filename):
    """Load the Titanic CSV and return it as a list of numeric rows.

    Each returned row is ``[Pclass, Age, Fare, female, Survived]``:

    * ``female`` is 1 when the ``Sex`` column equals ``"female"``, else 0.
    * Missing ``Age`` values are replaced by the mean of the ``Age`` column.
    * ``Survived`` (the class label) is the last element of every row.

    Only the ``Pclass``, ``Age``, ``Fare``, ``Sex`` and ``Survived`` columns
    are read; any other column in the file is ignored.

    Args:
        filename: Path (or file-like object) accepted by ``pd.read_csv``.

    Raises:
        KeyError: If one of the required columns is missing from the file.
    """
    df = pd.read_csv(filename)
    # Select the needed columns explicitly so a missing column fails loudly
    # instead of silently becoming an all-NaN column.
    features = pd.DataFrame({
        "Pclass": df["Pclass"],
        "Age": df["Age"].fillna(df["Age"].mean()),
        "Fare": df["Fare"],
        # Integer 0/1 indicator, so every column is numeric and the rows do
        # not end up holding booleans.
        "female": (df["Sex"] == "female").astype(int),
        "Survived": df["Survived"],
    })
    return features.values.tolist()

In [93]:
#Titanic
filename = 'titanic.csv'
dataset = load_csv(filename)

In [102]:
from random import randrange
from math import exp
from math import pi

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Return ``n_folds`` lists of rows drawn at random without replacement.

    Every fold holds ``len(dataset) // n_folds`` rows, so rows left over
    after the last fold is filled are not placed in any fold. ``dataset``
    itself is not modified; rows are drawn from a shallow copy.
    """
    remaining = list(dataset)
    per_fold = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        # Each draw removes the chosen row, so no row lands in two folds.
        picked = [remaining.pop(randrange(len(remaining))) for _ in range(per_fold)]
        folds.append(picked)
    return folds

In [103]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage (0.0 to 100.0) of positions where the labels agree.

    ``predicted`` is read at every index of ``actual``.

    Raises ZeroDivisionError when ``actual`` is empty.
    """
    hits = sum(
        1 for position, truth in enumerate(actual) if truth == predicted[position]
    )
    return hits / float(len(actual)) * 100.0

In [104]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Return one accuracy percentage per cross-validation fold.

    ``dataset`` is split with ``cross_validation_split``. Each fold in turn
    is held out: ``algorithm(train_rows, test_rows, *args)`` is called with
    the rows of all other folds as training data and with copies of the
    held-out rows whose last element has been replaced by None. Its return
    value is compared with the held-out rows' real last elements using
    ``accuracy_metric``.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        # Training data: the rows of every fold except the held-out one.
        training_rows = []
        for fold in folds:
            if fold is not held_out:
                training_rows.extend(fold)
        # Test data: copies of the held-out rows with the label blanked out,
        # so the algorithm cannot read the answer.
        test_rows = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_rows.append(masked)
        predicted = algorithm(training_rows, test_rows, *args)
        expected = [row[-1] for row in held_out]
        scores.append(accuracy_metric(expected, predicted))
    return scores

In [105]:
# evaluate algorithm
n_folds = 5
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [77.52808988764045, 80.89887640449437, 76.96629213483146, 76.40449438202246, 76.40449438202246]
Mean Accuracy: 77.640%
