# Titanic Dataset Description
The sinking of the RMS Titanic is one of the most infamous shipwrecks in history.  On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.

One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.

Number of Objects = 1309 
Number of Features = 10
Train/test split: about 2/3 of the objects for training and 1/3 for testing (test_size = 0.33)

### Variable Definition	Key
* Survived	Survival	0 = No, 1 = Yes
* pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
* sex	gender	
* Age	Age in years	
* sibsp 	# of siblings / spouses aboard the Titanic	
* parch	  # of parents / children aboard the Titanic	
* ticket	Ticket number	
* fare	Passenger fare	
* cabin	Cabin number	
* embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

### Variable Notes

pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [125]:
from sklearn import tree
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
import numpy as np
from sklearn import metrics

# Load Dataset

In [126]:
# Read train.csv into a DataFrame whose row index is the PassengerId column.
data = pd.read_csv(
    'train.csv',
    index_col='PassengerId',
)

# Binarize features 

In [127]:
# Turn every remaining column of `data` into a 0/1 feature, in place.

# Drop the free-text Name and Ticket columns; they are not used as features.
data.drop(['Name', 'Ticket'], axis=1, inplace=True)

# Cabin -> hasCabin: 1 when a cabin value is recorded, 0 when it is missing.
# notna() tests for missing values directly instead of relying on NaN
# happening to be a Python float.
data['hasCabin'] = data['Cabin'].notna().astype(int)
data.drop(['Cabin'], axis=1, inplace=True)

# Sex: female -> 0, male -> 1.
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Pclass: 1 for 1st class, 0 for 2nd or 3rd class.
data['Pclass'] = data['Pclass'].map({1: 1, 2: 0, 3: 0}).astype(int)

# Age: 0 if below the mean age, 1 otherwise. The mean is computed once
# rather than once per row. A missing age compares False against the mean and
# therefore lands in the "1" bucket, exactly as the row-wise version did.
age_mean = data['Age'].mean()
data['Age'] = np.where(data['Age'] < age_mean, 0, 1)

# Embarked -> EmbarkedS: 1 for Southampton ('S'), 0 for 'C' or 'Q'.
# Missing ports are filled with 'S' before mapping.
data['Embarked'] = data['Embarked'].fillna('S')
data['EmbarkedS'] = data['Embarked'].map({'S': 1, 'C': 0, 'Q': 0}).astype(int)
data.drop(['Embarked'], axis=1, inplace=True)

# Fare: 0 if below the mean fare, 1 otherwise (mean computed once).
fare_mean = data['Fare'].mean()
data['Fare'] = np.where(data['Fare'] < fare_mean, 0, 1)

# SibSp + Parch -> isAlone: 1 when the passenger has no relatives aboard
# (family size, counting the passenger, equals 1), else 0. The family size is
# kept in a local Series so no temporary column has to be added and dropped.
family_size = data['SibSp'] + data['Parch'] + 1
data['isAlone'] = (family_size == 1).astype(int)
data.drop(['Parch', 'SibSp'], inplace=True, axis=1)

In [128]:
# Split the data into training and testing sets: every column except Survived
# is a feature, Survived is the label; 33% of the rows go to the test set and
# the shuffle is seeded for reproducibility.
feature_frame = data.drop('Survived', axis=1)
label_series = data['Survived']
train_x, test_x, train_y, test_y = train_test_split(
    feature_frame,
    label_series,
    test_size=0.33,
    random_state=42,
)

# Training BernoulliNB classifier

In [129]:
# Create a Bernoulli naive Bayes classifier with its default settings.
clf = BernoulliNB()
# Train on the training split. This is the last expression of the cell, so the
# notebook echoes its value (the estimator repr recorded below the cell).
clf.fit(train_x, train_y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [130]:
# Report the classifier's mean accuracy on the training split, then on the
# held-out test split.
for score_label, split_x, split_y in (
    ("train accurary is ", train_x, train_y),
    ("test accurary is ", test_x, test_y),
):
    print(score_label, clf.score(split_x, split_y))

train accurary is  0.743288590604
test accurary is  0.769491525424


# Section 5:

In [131]:
# Split the test rows (by position) into two index lists according to which
# predict_proba column is larger: column 0 strictly larger -> yesClass,
# otherwise (including ties) -> noClass.
# NOTE(review): predict_proba orders its columns like clf.classes_; if the
# labels are 0/1 as the Survived key describes, column 0 is Survived = 0.
# Confirm that this is the class meant by "yes" here.
predicted_prob = clf.predict_proba(test_x)
yesClass = []
noClass = []
for row_pos, row_probs in enumerate(predicted_prob):
    target = yesClass if row_probs[0] > row_probs[1] else noClass
    target.append(row_pos)

In [132]:
# Per-row log-evidence is log P(column 0) - log P(column 1); add it up
# separately over the rows listed in yesClass and in noClass.
predicted_log = clf.predict_log_proba(test_x)
total_log_evidence_yes = sum(
    predicted_log[pos][0] - predicted_log[pos][1] for pos in yesClass
)
total_log_evidence_no = sum(
    predicted_log[pos][0] - predicted_log[pos][1] for pos in noClass
)

## a) total positive log-evidence

In [133]:
# Sum of (log P(column 0) - log P(column 1)) over the test rows in yesClass.
print(total_log_evidence_yes)

370.9353012


## b) total negative log-evidence

In [134]:
# Sum of (log P(column 0) - log P(column 1)) over the test rows in noClass.
print(total_log_evidence_no)

-198.104707962


## c) NOT SURE ABOUT THIS ASK IN CLASS

In [135]:
# Class fractions of the test split and their log-odds.
# The survivors are counted directly on test_y: the original expression read
# `blah`, which is only created in a later cell, so running the notebook from
# top to bottom raised a NameError here. int() keeps prob_yes a plain Python
# float, as it was before.
prob_yes = int((test_y == 1).sum()) / len(test_x)
prob_no = 1 - prob_yes
prob_log_class = np.log(prob_yes / prob_no)
# NOTE: this updates total_log_evidence_no in place, so re-running the cell
# adds the class log-odds term again.
total_log_evidence_no += prob_log_class

In [136]:
# For every feature, count the (Survived, feature value) combinations on the
# test split and store two ratios in d[name]:
#   (share of Survived == 1 rows with value 1 / share of Survived == 0 rows with value 1,
#    share of Survived == 1 rows with value 0 / share of Survived == 0 rows with value 0)
# prob_yes and prob_no are the class fractions set in an earlier cell.
d = {}
blah = pd.DataFrame(test_x,columns=data.drop('Survived', axis=1).columns)
blah['Survived']= test_y
survived_mask = blah['Survived'] == 1
not_survived_mask = blah['Survived'] == 0
n_test_rows = len(test_x)
for name in train_x.columns:
    value_is_1 = blah[name] == 1
    value_is_0 = blah[name] == 0
    # Joint counts as plain Python ints so the arithmetic below matches
    # a row-by-row tally exactly.
    n_1_yes = int((survived_mask & value_is_1).sum())
    n_0_yes = int((survived_mask & value_is_0).sum())
    n_1_no = int((not_survived_mask & value_is_1).sum())
    n_0_no = int((not_survived_mask & value_is_0).sum())
    # count / class fraction / number of rows == count / rows in that class
    share_yes_1 = n_1_yes / prob_yes / n_test_rows
    share_yes_0 = n_0_yes / prob_yes / n_test_rows
    share_no_1 = n_1_no / prob_no / n_test_rows
    share_no_0 = n_0_no / prob_no / n_test_rows
    d[name] = (share_yes_1 / share_no_1, share_yes_0 / share_no_0)
# One (feature, (ratio for value 1, ratio for value 0)) tuple per line.
for entry in d.items():
    print(entry)
# All ratios pooled and sorted ascending, to read off the extremes.
myList = [ratios[0] for ratios in d.values()]
otherList = [ratios[1] for ratios in d.values()]
print(sorted(myList + otherList))

('Pclass', (3.2666666666666657, 0.622222222222222))
('Sex', (0.347682119205298, 5.104166666666666))
('Age', (0.957924836601307, 1.0587899543378996))
('Fare', (3.2638888888888884, 0.6912878787878787))
('hasCabin', (3.4027777777777777, 0.6723484848484848))
('EmbarkedS', (0.8571882951653942, 1.4251893939393938))
('isAlone', (0.6233198924731183, 1.915849673202614))
[0.347682119205298, 0.622222222222222, 0.6233198924731183, 0.6723484848484848, 0.6912878787878787, 0.8571882951653942, 0.957924836601307, 1.0587899543378996, 1.4251893939393938, 1.915849673202614, 3.2638888888888884, 3.2666666666666657, 3.4027777777777777, 5.104166666666666]


## d) top 3 features values that contribute most to the positive evidence

In [137]:
# Hard-coded answer: the three feature values with the largest ratios in the
# sorted list printed by the previous cell, one per line.
print("Sex = 0", "hasCabin = 1", "Pclass = 1", sep="\n")

Sex = 0
hasCabin = 1
Pclass = 1


## e) top 3 features values that contribute most to the negative evidence

In [138]:
# Hard-coded answer: the three feature values with the smallest ratios in the
# sorted list printed two cells above, one per line.
print("Sex = 1", "Pclass = 0", "isAlone = 1", sep="\n")

Sex = 1
Pclass = 0
isAlone = 1


### 1. The most positive object with respect to the probabilities.

In [139]:
# Print the first test row whose column-0 probability equals the column maximum.
# NOTE(review): predict_proba orders its columns like clf.classes_; if the
# labels are 0/1, column 0 is Survived = 0. Confirm that this is the class
# meant by "positive".
first_col_prob = predicted_prob[:, 0]
index = np.where(first_col_prob == np.max(first_col_prob))
first_match = index[0][0]
print(test_x.iloc[first_match])

Pclass       0
Sex          1
Age          1
Fare         0
hasCabin     0
EmbarkedS    1
isAlone      1
Name: 440, dtype: int64


### 2.The most negative object with respect to the probabilities.

In [140]:
# Print the first test row whose column-1 probability equals the column maximum.
second_col_prob = predicted_prob[:, 1]
index = np.where(second_col_prob == np.max(second_col_prob))
first_match = index[0][0]
print(test_x.iloc[first_match])

Pclass       1
Sex          0
Age          0
Fare         1
hasCabin     1
EmbarkedS    0
isAlone      0
Name: 312, dtype: int64


### 3. The object that has the largest positive evidence.

In [141]:
# Evidence here is the ratio P(column 0) / P(column 1); print the first test
# row where that ratio reaches its maximum.
prob_ratio = predicted_prob[:, 0] / predicted_prob[:, 1]
index = np.where(prob_ratio == np.max(prob_ratio))
first_match = index[0][0]
print(test_x.iloc[first_match])

Pclass       0
Sex          1
Age          1
Fare         0
hasCabin     0
EmbarkedS    1
isAlone      1
Name: 440, dtype: int64


### 4. The object that has the largest (in magnitude) negative evidence.

In [142]:
# Evidence here is the ratio P(column 0) / P(column 1); print the first test
# row where that ratio reaches its minimum.
prob_ratio = predicted_prob[:, 0] / predicted_prob[:, 1]
index = np.where(prob_ratio == np.min(prob_ratio))
first_match = index[0][0]
print(test_x.iloc[first_match])

Pclass       1
Sex          0
Age          0
Fare         1
hasCabin     1
EmbarkedS    0
isAlone      0
Name: 312, dtype: int64


### 5. The most uncertain object (the probabilities are closest to 0.5)

In [143]:
# Distance of each column-1 probability from 0.5; the first row with the
# smallest distance is the most uncertain prediction.
clearPRE = np.abs(predicted_prob[:, 1] - 0.5)
smallest_gap = np.min(clearPRE)
index = np.where(clearPRE == smallest_gap)
print(test_x.iloc[index[0][0]])

Pclass       1
Sex          1
Age          1
Fare         1
hasCabin     0
EmbarkedS    0
isAlone      1
Name: 494, dtype: int64


In [7]:
import math
# feature_log_prob_ stores logarithms; exponentiate each entry and print one
# list per row, for rows 0 and 1 in that order.
for class_row in (0, 1):
    print([math.exp(log_value) for log_value in clf.feature_log_prob_[class_row]])

[0.148936170212766, 0.8457446808510638, 0.5851063829787236, 0.18882978723404253, 0.12765957446808512, 0.7898936170212767, 0.6675531914893613]
[0.36160714285714307, 0.3303571428571431, 0.5357142857142858, 0.36607142857142877, 0.3928571428571431, 0.6383928571428571, 0.4955357142857143]
