# Data Small Titanic

In [65]:
import pandas as pd
import numpy as np
import math
import pylab as pl

In [2]:
# Load the passenger table from a CSV file; the path is relative to the working directory.
data_titanic = pd.read_csv('./data/titanic.csv')

In [3]:
# Preview the first rows to see which columns were loaded.
data_titanic.head()

Unnamed: 0.1,Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,2,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


## Convert object (string) data to numeric data

* PClass: 1~3 (1st, 2nd, 3rd)

In [4]:
def obj_to_number(data):
    """Turn a passenger-class label into its integer class number.

    The first character of the label is taken as the class digit, so
    ``"1st"`` gives ``1``. The placeholder label ``"*"`` is mapped to ``0``.
    """
    return 0 if data == "*" else int(data[0])

In [5]:
# Add a numeric 'class' column by applying obj_to_number to every PClass label.
data_titanic['class'] = data_titanic['PClass'].apply(func=obj_to_number)

In [6]:
# Preview again to confirm the new 'class' column is present.
data_titanic.head()

Unnamed: 0.1,Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode,class
0,1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1,1
1,2,"Allison, Miss Helen Loraine",1st,2.0,female,0,1,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0,1
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1,1
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0,1


In [7]:
# Keep only the three feature columns and the Survived label.
data_main = data_titanic[['Age','SexCode','class','Survived']]

## Drop rows containing NaN values

In [8]:
# Discard every row that has a missing value in any of the kept columns.
data_main = data_main.dropna()

In [10]:
# Summary statistics of the cleaned table.
data_main.describe()

Unnamed: 0,Age,SexCode,class,Survived
count,756.0,756.0,756.0,756.0
mean,30.397989,0.380952,2.121693,0.414021
std,14.259049,0.485942,0.84006,0.492878
min,0.17,0.0,1.0,0.0
25%,21.0,0.0,1.0,0.0
50%,28.0,0.0,2.0,0.0
75%,39.0,1.0,3.0,1.0
max,71.0,1.0,3.0,1.0


---

#  Logistic Regression (Classification)

* using titanic data
* survived -> 0 or 1. 2-class classification

In [11]:
# Features: the first three columns of data_main, selected by position.
# NOTE(review): no constant column is added here, so a model fitted on X has no
# intercept term - confirm this is intended.
X = data_main[data_main.columns[0:3]]
# Labels: kept as a one-column DataFrame rather than a Series.
Y = data_main[['Survived']]

## Sigmoid function

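A label of 0 means negative (did not survive) and 1 means positive (survived),
so the hypothesis function must return a value in the range 0 to 1.

The sigmoid function is used for this: $g(z) = \frac{1}{1 + e^{-z}}$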

In [12]:
def sigmoid(x):
    """Logistic function 1 / (1 + e**-x), evaluated without overflow.

    Parameters
    ----------
    x : scalar or numpy array
        Input value(s); arrays are processed element-wise.

    Returns
    -------
    numpy float or array with the same shape as ``x``, each value in [0, 1].
    Very negative inputs give 0.0 and very positive inputs give 1.0 instead
    of raising OverflowError or producing inf along the way.
    """
    # 1 / (1 + e**-x) == exp(-log(e**0 + e**-x)); logaddexp evaluates that
    # logarithm without ever forming e**-x, which is what used to overflow.
    return np.exp(-np.logaddexp(0, -x))

In [14]:
# Sanity check: the sigmoid of 0 should be one half.
sigmoid(0)

0.5

## Cost Function and Gradient Descent

In [17]:
def costfunction(W,X,y):
    """Cross-entropy cost of logistic regression and its gradient.

    Parameters
    ----------
    W : 1-D array
        One weight per column of X.
    X : DataFrame
        Feature table, one row per sample.
    y : DataFrame
        One-column table of labels; only its first column is used.

    Returns
    -------
    (cost, grad) : the mean cross-entropy over all samples and its gradient
    with respect to W (one entry per column of X).
    """
    # .values replaces DataFrame.as_matrix, which current pandas no longer
    # provides; [:, 0] flattens the single label column to a 1-D array.
    y = y.values[:, 0]
    m = y.size
    h = sigmoid(np.dot(W, X.T))

    # np.sum reduces the array in numpy instead of looping over it in Python.
    cost = -(1.0/m) * np.sum(y*np.log(h) + (1-y)*np.log(1-h))
    grad = (1.0/m) * np.dot(X.T, h-y)

    return cost, grad

In [18]:
# m is the number of rows of X and n its number of columns; weights start at zero.
m,n = X.shape
W = np.zeros(n)
# Display the dimensions and the initial weight vector.
n,m,W

(3, 756, array([ 0.,  0.,  0.]))

In [19]:
# Cost and gradient at the all-zero starting weights.
cost, grad = costfunction(W, X, Y)
cost, grad

(0.69314718055994562, array([ 3.04350529, -0.09656085,  0.31349206]))

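## Regularization (feature scaling)

The `regular` helper below rescales the feature columns; despite the section name, it does not add a regularization penalty to the cost.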

In [176]:
def regular(X):
    """Mean-normalize every column of X independently.

    Each column is shifted by its own mean and divided by its own range
    (max - min), so the result is centred on 0 and spans at most one unit.
    Despite the name, this is feature scaling, not a regularization penalty.

    Parameters
    ----------
    X : DataFrame
        Numeric feature table; it is not modified.

    Returns
    -------
    DataFrame with the same index and columns as X, holding the scaled values.
    """
    center = X.mean(axis=0)
    span = X.max(axis=0) - X.min(axis=0)
    # A constant column has zero range; dividing by 1 instead leaves it at 0
    # after centring rather than turning it into NaN.
    span = span.where(span != 0, 1.0)
    # Series-vs-DataFrame arithmetic aligns on column labels, so each column
    # is paired with its own statistics only.
    return (X - center) / span

---
## Logistic Regression



**option**

* learning rate **alpha**
* convergence threshold on the summed absolute gradient **conv**
* max iteration **max_iter**
* regularization (feature rescaling via `regular`) **r**

In [179]:
def logistic(X,Y,alpha=0.1,conv=0.8,max_iter=10000,r=False):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : DataFrame
        Feature table, one row per sample.
    Y : DataFrame
        One-column label table, passed through to costfunction.
    alpha : float
        Learning rate: the gradient is multiplied by this before each update.
    conv : float
        Stop as soon as the sum of absolute gradient components is below this.
    max_iter : int
        Upper bound on the number of gradient evaluations. A value of 0 or
        less returns the all-zero starting weights instead of looping forever.
    r : bool
        When true, X is first passed through regular(). The returned weights
        then refer to the rescaled features, not to the original columns.

    Returns
    -------
    1-D array W with one weight per column of X.
    """
    if r:
        X = regular(X)
    W = np.zeros(X.shape[1])
    for _ in range(max_iter):
        cost, grad = costfunction(W, X, Y)
        # Converged: the gradient is small enough, so keep W as it is.
        if np.sum(np.abs(grad)) < conv:
            break
        W -= grad*alpha

    return W

In [62]:
def predict(W,X,Y):
    """Classify every row of X with weights W and tally the confusion matrix.

    A row is predicted positive when sigmoid(W . row) >= 0.5. A label counts
    as positive when it equals 1; any other label counts as negative.

    Parameters
    ----------
    W : 1-D array
        One weight per column of X.
    X : DataFrame
        Feature table, one row per sample.
    Y : DataFrame
        One-column label table, in the same row order as X.

    Returns
    -------
    (TP, TN, FP, FN) : counts of true positives, true negatives, false
    positives and false negatives, as plain ints.

    Also prints the weights being used.
    """
    # .values replaces DataFrame.as_matrix, which current pandas no longer provides.
    actual = Y.values[:, 0] == 1

    # Single-argument print(...) gives the same text on Python 2 and Python 3.
    print("Using %s as theta" % (W,))

    # One matrix-vector product scores all rows at once, instead of
    # re-transposing X and looking up one row per loop iteration.
    predicted = sigmoid(np.dot(X.values, W)) >= 0.5

    TP = int(np.sum(predicted & actual))
    TN = int(np.sum(~predicted & ~actual))
    FP = int(np.sum(predicted & ~actual))
    FN = int(np.sum(~predicted & actual))
    return TP, TN, FP, FN

In [63]:
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print("Learning rate change")
alpha=[0.01,0.05,0.1,0.2,0.5]

# For each learning rate, fit on the full data set and score on that same data.
for i in alpha:
    TP, TN, FP, FN = predict(logistic(X,Y,alpha=i),X,Y)
    print("Accuracy for alpha(%s) : %s" %(i,(TP+TN)*100.0/(TP+TN+FP+FN)))

Learning rate change
Accuracy for alpha(0.01) : 77.9100529101
Accuracy for alpha(0.05) : 79.1005291005
Accuracy for alpha(0.1) : 76.5873015873
Accuracy for alpha(0.2) : 76.1904761905
Accuracy for alpha(0.5) : 76.0582010582


In [149]:
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print("Convergence Change 0.1~0.9")
# pl.frange is no longer available in current matplotlib; this numpy expression
# yields the same inclusive grid 0.1, 0.2, ..., 1.0 that frange(0.1, 1, 0.1) did.
for c in np.arange(10)*0.1 + 0.1:
    TP, TN, FP, FN = predict(logistic(X,Y,conv=c),X,Y)
    print("Accuracy for conv(%s) : %s" %(c,(TP+TN)*100.0/(TP+TN+FP+FN)))

Convergence Change 0.1~0.9
Accuracy for conv(0.1) : 66.4021164021
Accuracy for conv(0.2) : 77.9100529101
Accuracy for conv(0.3) : 77.9100529101
Accuracy for conv(0.4) : 76.5873015873
Accuracy for conv(0.5) : 76.5873015873
Accuracy for conv(0.6) : 76.5873015873
Accuracy for conv(0.7) : 76.5873015873
Accuracy for conv(0.8) : 76.5873015873
Accuracy for conv(0.9) : 76.5873015873
Accuracy for conv(1.0) : 76.5873015873


## 10-fold cross validation

* shuffle and 10-split

In [69]:
from sklearn.utils import shuffle

In [73]:
# A fixed seed makes the row order, and therefore the folds built from it,
# identical on every run of the notebook.
data_shuffled = shuffle(data_main, random_state=0)
data_shuffled.head()

Unnamed: 0,Age,SexCode,class,Survived
386,18,0,2,0
174,46,0,1,0
546,3,0,2,1
727,31,0,3,0
1312,29,0,3,0


In [139]:
n,m = data_shuffled.shape
# Floor division keeps the fold size an integer on Python 3 as well as Python 2.
k = n//10
# Always exactly ten folds: nine of k rows each, then a last one that also
# takes the n % 10 leftover rows. .iloc makes the positional slicing explicit.
F = [data_shuffled.iloc[i*k:(i+1)*k] for i in range(9)]
F.append(data_shuffled.iloc[9*k:])

In [140]:
# Show each fold's position and row count. The loop variable is named `fold`
# so the fold size stored in `k` is not overwritten by a DataFrame.
for idx,fold in enumerate(F):
    print("%s %s" % (idx, len(fold)))

0 75
1 75
2 75
3 75
4 75
5 75
6 75
7 75
8 75
9 81


In [180]:
right =0   # correctly classified test rows, summed over all folds
no =0      # misclassified test rows, summed over all folds
for i in range(len(F)):
    # Fold i is held out for testing; every other fold, in its original
    # order, goes into the training set. A single pd.concat replaces the
    # repeated DataFrame.append calls, which current pandas no longer provides.
    Test = F[i]
    Train = pd.concat([F[j] for j in range(len(F)) if j != i])

    # split: first three columns are the features, 'Survived' is the label
    Test_X = Test[Test.columns[0:3]]
    Test_Y = Test[['Survived']]
    Train_X = Train[Train.columns[0:3]]
    Train_Y = Train[['Survived']]

    W = logistic(Train_X,Train_Y)
    TP, TN, FP, FN = predict(W,Test_X,Test_Y)

    # Single-argument print(...) produces the same text on Python 2 and Python 3.
    print("%s th Accuracy : %.2f with Theta %s" % (i+1, (TP+TN)*100.0/(TP+TN+FN+FP), W))
    right+=(TP+TN)
    no+=(FP+FN)

print("------------------------------------------\nTotal Accuracy : %s"% (right*100.0/(right+no)))

1 th Accuracy : 80.00 with Theta [ 0.03317962  6.33097138 -2.39946711]
2 th Accuracy : 81.33 with Theta [ 0.02836065  6.53715071 -2.11707752]
3 th Accuracy : 84.00 with Theta [ 0.02245455  5.25350235 -1.78503283]
4 th Accuracy : 69.33 with Theta [ 0.05311724  7.07991025 -2.95778149]
5 th Accuracy : 74.67 with Theta [ 0.05176889  5.91207021 -2.62446971]
6 th Accuracy : 74.67 with Theta [ 0.03583064  6.01708823 -2.40378792]
7 th Accuracy : 78.67 with Theta [ 0.04061077  6.03628244 -2.38212927]
8 th Accuracy : 74.67 with Theta [ 0.03779016  6.54916477 -2.38521995]
9 th Accuracy : 72.00 with Theta [ 0.04737454  5.64575542 -2.55734652]
10 th Accuracy : 79.01 with Theta [ 0.03566674  6.17179659 -2.38038826]
------------------------------------------
Total Accuracy : 76.8518518519


In [166]:
# Show the first rows of the cleaned table again, next to the fitted weights above.
data_main.head()

Unnamed: 0,Age,SexCode,class,Survived
0,29.0,1,1,1
1,2.0,1,1,0
2,30.0,0,1,0
3,25.0,1,1,0
4,0.92,0,1,1


---
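Based on the signs of the fitted theta values, we can guess that being

**older** (positive Age weight),

**female** (positive SexCode weight; female is coded as 1),

in a **higher** class (negative class weight; 1st class is higher than 2nd)

made a passenger more likely to survive.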