In [450]:
### IMPORT
import pandas as pd
import numpy as np
import math

In [451]:
### READ DATA
# Remote locations of the training-set and test-set CSV files.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

# Download and parse each file (training set first, then test set).
train, test = (pd.read_csv(url) for url in (train_url, test_url))

In [452]:
# Preview the first five rows of the training set.
train.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [453]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training set.
train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [454]:
### Handle missing values and encode categorical features
# Integer codes used for the two categorical features.
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}
SEX_CODES = {"male": 0, "female": 1}


def _encode(column, codes):
    """Replace every label that is a key of ``codes`` by its integer code.

    NaN and any value that is not a key of ``codes`` (for example codes
    written by an earlier run of this cell) pass through unchanged, so
    re-running the cell does not corrupt the column.
    """
    return column.map(lambda value: codes.get(value, value))


def clean_features(frame, reference):
    """Return a copy of ``frame`` with Age, Embarked, Sex and Pclass cleaned.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to clean; must contain the Age, Embarked, Sex and Pclass columns.
    reference : pandas.DataFrame
        Data the fill values are computed from (median Age; most frequent
        Embarked, Sex and Pclass).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.
    """
    cleaned = frame.copy()
    cleaned["Age"] = frame["Age"].fillna(reference["Age"].median())

    for name, codes in (("Embarked", EMBARKED_CODES), ("Sex", SEX_CODES)):
        encoded = _encode(frame[name], codes)
        # mode() returns a Series. Passing that Series to fillna() would fill
        # by index alignment instead of with the modal value, so take the
        # scalar with [0] (the original Sex fill omitted this).
        fill_value = _encode(reference[name], codes).mode()[0]
        cleaned[name] = encoded.fillna(fill_value).astype(int)

    cleaned["Pclass"] = frame["Pclass"].fillna(reference["Pclass"].mode()[0])
    return cleaned


# Both splits are imputed with statistics of the training set, so the test set
# goes through exactly the transformation the model is fitted on.
train = clean_features(train, reference=train)
test = clean_features(test, reference=train)

In [456]:
def sigmoid(Z):
    """Numerically stable logistic function ``1 / (1 + exp(-Z))``, element-wise.

    The naive formula overflows in ``exp(-Z)`` for large negative ``Z``.
    Here the exponential is only ever taken of a non-positive number, so it
    cannot overflow:

    * ``Z >= 0``: ``1 / (1 + exp(-Z))``
    * ``Z <  0``: ``exp(Z) / (1 + exp(Z))``

    Parameters
    ----------
    Z : scalar or array-like of numbers

    Returns
    -------
    numpy.float64 for scalar input, otherwise a float ndarray with the shape
    of ``Z``.
    """
    Z = np.asarray(Z, dtype=float)
    decay = np.exp(-np.abs(Z))  # always in [0, 1]
    result = np.where(Z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array into a scalar and leaves n-d arrays as arrays.
    return result[()]

In [491]:
def gradient(X, y, theta, alpha):
    """Return ``theta`` after one update step of size ``alpha``.

    The step is ``alpha * X.T . (y - sigmoid(X . theta))``; the residuals are
    summed over the rows of ``X``, not averaged.
    """
    predicted = sigmoid(np.dot(X, theta))
    residual = y - predicted
    step = alpha * np.dot(X.T, residual)
    return theta + step

In [484]:
### Use Pclass, Sex, Age, and Embarked as input features.
feature_columns = ["Pclass", "Sex", "Age", "Embarked"]

# The `np.float` alias was removed in NumPy 1.24; the builtin `float` is the
# same dtype (float64) and works on every NumPy version.
train_data = np.array(train[feature_columns].values, dtype=float)
train_label = np.array(train[["Survived"]].values, dtype=float)  # column vector
test_data = np.array(test[feature_columns].values, dtype=float)

steps = 10000  # update steps per random restart
alpha = 0.05   # step size passed to gradient()

In [498]:
### Train
# Seeded generator so the random restarts, and therefore the selected theta,
# are the same on every Restart & Run All.
seed = 0
n_restarts = 10
rng = np.random.default_rng(seed)

m = train_data.shape[0]
# `np.float` was removed in NumPy 1.24; use the builtin float instead.
ones = np.ones((m, 1), dtype=float)
X_train = np.concatenate((ones, train_data), axis=1)  # prepend the intercept column
n_params = X_train.shape[1]

best_theta = None
best_acc = -1  # below any reachable count, so the first restart is always recorded
threshold = 0.5

for r in range(n_restarts):
    # Random start in [0, 100) for every parameter.
    theta = rng.random((n_params, 1)) * 100
    for step in range(steps):
        theta = gradient(X_train, train_label, theta, alpha)

    pred = sigmoid(np.dot(X_train, theta))
    # A row is a hit when the thresholded prediction matches its 0/1 label.
    hits = ((pred >= threshold) & (train_label == 1.)) | ((pred < threshold) & (train_label == 0.))
    acc = int(np.count_nonzero(hits))
    if acc > best_acc:
        best_acc = acc
        best_theta = theta

print(best_acc/m)
print(best_theta)

  


0.8002244668911336
[[ 5211.3408558 ]
 [-2508.01227371]
 [ 7095.30186401]
 [ -139.76533831]
 [  822.00256362]]


In [503]:
### Test
m = test_data.shape[0]
# `np.float` was removed in NumPy 1.24; use the builtin float instead.
ones = np.ones((m, 1), dtype=float)
X_test = np.concatenate((ones, test_data), axis=1)  # prepend the intercept column
threshold = 0.5
Z = np.dot(X_test, best_theta)
# Apply the same decision rule as the training accuracy (probability >= threshold
# means class 1). np.round would send a probability of exactly 0.5 to 0.
# Kept as float 0./1. values, matching the dtype the original produced.
pred = (sigmoid(Z) >= threshold).astype(float)

  


In [513]:
# Build the two-column (PassengerId, Survived) table and write it to pred.csv.
df = pd.DataFrame(test["PassengerId"])
# `np.int` was removed in NumPy 1.24, so cast with the builtin int.
# Reusing df's index makes the join positional, so it does not depend on
# `test` having a default 0..n-1 index.
df2 = pd.DataFrame(
    np.asarray(pred).reshape(-1).astype(int),
    columns=["Survived"],
    index=df.index,
)
out = df.join(df2)
out.to_csv("pred.csv", index=False)

In [499]:
from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's logistic regression on the same features and print its
# intercept, coefficients and training accuracy.
# NOTE(review): C=1e15 presumably makes the default regularisation negligible - confirm.
train_target = train_label.ravel()  # fit/score are given the flattened labels
clf = LogisticRegression(C=1e15, fit_intercept=True)
clf.fit(train_data, train_target)
print (clf.intercept_, clf.coef_)
print (clf.score(train_data, train_target))

[ 2.07085301] [[-1.19651855  2.57705297 -0.0337238   0.32102237]]
0.79012345679
