In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.optimize as op

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Pre-process data
# Feature columns used by the model (initial decision).
feature_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Training frame: keep the features plus the label, then drop every row that
# has a missing value in any of those columns.
train_data = (
    pd.read_csv("../input/titanic/train.csv")[feature_cols + ['Survived']]
    .dropna(axis=0)
)
# Test frame: keep the id column in front of the features; rows with missing
# values are NOT dropped here.
test_data = pd.read_csv("../input/titanic/test.csv")[['PassengerId'] + feature_cols]

train_X = train_data[feature_cols]
train_y = train_data[['Survived']]

# Convert the dataframes to numpy arrays
X_train, y_train, X_test = (
    frame.to_numpy() for frame in (train_X, train_y, test_data)
)

In [3]:
# Define sigmoid function
def sigmoid(x):
    """Evaluate the logistic function 1 / (1 + exp(-x)) element-wise.

    The exponential is only ever taken of a non-positive number, so inputs
    of large magnitude saturate to 0 or 1 without triggering a floating-point
    overflow warning.

    Parameters
    ----------
    x : scalar or array_like
        Input value(s).

    Returns
    -------
    numpy scalar or ndarray
        Values in [0, 1] with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) lies in (0, 1], so it can never overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 the algebraically equal
    # form exp(x) / (1 + exp(x)) is used instead.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays of higher rank unchanged.
    return result[()]

In [4]:
# Define function for feature normalization (X is the feature matrix)
# Function returns a normalized version of X where the mean value of
# each feature is 0 and the standard deviation is 1.
def feature_normalization(X):
    """Standardize each column of X to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array_like
        Feature matrix with one row per sample and one column per feature.

    Returns
    -------
    ndarray
        A new array of the same shape as ``X``; the input is not modified.
        A column whose values are all identical has zero standard deviation
        and is returned as all zeros instead of NaN.
    """
    X = np.asarray(X, dtype=float)
    centered = X - X.mean(axis=0)
    sigma = np.std(centered, axis=0)
    # Dividing by a zero standard deviation would produce 0/0 = NaN for a
    # constant feature; scaling by 1 leaves that (already centered) column at 0.
    sigma = np.where(sigma == 0, 1.0, sigma)
    return centered / sigma

In [5]:
# Define regularized cost function
def costFunction(theta, X, y, reglambda):
    """Regularized logistic-regression cost (mean cross-entropy + L2 penalty).

    Parameters
    ----------
    theta : array_like, shape (n,) or (n, 1)
        Model parameters; ``theta[0]`` is treated as the intercept and is
        excluded from the penalty. The array is never modified.
    X : array_like, shape (m, n)
        Design matrix (the caller supplies any bias column).
    y : array_like, shape (m,) or (m, 1)
        Binary targets.
    reglambda : float
        L2 regularization strength.

    Returns
    -------
    float
        ``mean(-y*log(h) - (1-y)*log(1-h)) + reglambda/(2m) * sum(theta[1:]**2)``
        with ``h = sigmoid(X @ theta)``.
    """
    theta = np.asarray(theta, dtype=float).reshape(-1)
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    m = y.shape[0]

    # Linear scores: a matrix-vector product giving one value per sample
    # (an element-wise X * theta would instead yield an (m, n) grid).
    z = X @ theta

    # With h = sigmoid(z):  -y*log(h) - (1-y)*log(1-h) == log(1 + exp(z)) - y*z.
    # logaddexp(0, z) evaluates log(1 + exp(z)) without overflow and without
    # ever taking log(0) when the sigmoid saturates.
    data_term = np.sum(np.logaddexp(0.0, z) - y * z) / m

    # Slicing (rather than zeroing theta[0] in place) keeps the caller's
    # parameter vector intact.
    penalty = reglambda * np.sum(np.square(theta[1:])) / (2 * m)
    return data_term + penalty

In [6]:
# Define gradient
def gradient(theta, X, y, reglambda):
    """Gradient of the regularized logistic-regression cost w.r.t. theta.

    Parameters
    ----------
    theta : array_like, shape (n,) or (n, 1)
        Model parameters; ``theta[0]`` is treated as the intercept and is not
        regularized. The array is never modified.
    X : array_like, shape (m, n)
        Design matrix (the caller supplies any bias column).
    y : array_like, shape (m,) or (m, 1)
        Binary targets.
    reglambda : float
        L2 regularization strength.

    Returns
    -------
    ndarray, shape (n,)
        ``X.T @ (sigmoid(X @ theta) - y) / m`` plus ``reglambda * theta / m``
        on every component except the first. The result is one-dimensional,
        the shape scipy.optimize.minimize expects from a ``jac`` callable.
    """
    theta = np.asarray(theta, dtype=float).reshape(-1)
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    m = y.shape[0]

    # Matrix-vector product: one prediction per sample.
    residual = sigmoid(X @ theta) - y
    grad = X.T @ residual / m

    # `reglambda * theta / m` allocates a new array, so zeroing its first
    # entry (the intercept is not penalized) leaves the caller's theta intact.
    penalty = reglambda * theta / m
    penalty[:1] = 0.0
    return grad + penalty

In [7]:
# Define feature mapping function that expands two features into polynomial terms
def featureMap(X1, X2, degree=6):
    """Expand two feature vectors into all polynomial terms up to ``degree``.

    For every total degree ``i`` in ``1..degree`` and every ``j`` in ``0..i``
    (inclusive, so the pure powers of ``X2`` are present as well), one output
    column ``X1**(i-j) * X2**j`` is produced. No constant column is included.

    Parameters
    ----------
    X1, X2 : array_like, shape (m,) or (m, 1)
        The two features to combine; both must hold the same number of values.
    degree : int, optional
        Highest total degree to generate (default 6).

    Returns
    -------
    ndarray, shape (m, degree * (degree + 3) // 2)
        Columns ordered by increasing ``i`` and, within each ``i``, by
        increasing ``j``.
    """
    X1 = np.asarray(X1, dtype=float).reshape(-1)
    X2 = np.asarray(X2, dtype=float).reshape(-1)

    # Collect the columns first and stack once, instead of growing the matrix
    # with one hstack per new feature.
    columns = [
        np.power(X1, i - j) * np.power(X2, j)
        for i in range(1, degree + 1)
        for j in range(i + 1)
    ]
    if not columns:
        # degree < 1: nothing to generate.
        return np.empty((X1.shape[0], 0))
    return np.column_stack(columns)

In [8]:
# Standardize the training features, then prepend a column of ones so that
# theta[0] plays the role of the intercept.
X_norm = feature_normalization(X_train)
# Optional polynomial feature mapping (currently disabled):
#X_norm = featureMap(X_norm[:,1],X_norm[:,2])
bias = np.ones((np.shape(X_norm)[0],1))
X_norm = np.hstack((bias, X_norm))

# L2 regularization strength passed through to costFunction.
reglambda = 10000
initial_theta = np.zeros((np.shape(X_norm)[1],1))

# scipy.optimize.minimize documents x0 as a one-dimensional array of shape
# (n,), so the column vector is flattened before it is handed over.
optimal_theta = op.minimize(fun = costFunction, x0 = initial_theta.ravel(),
                         args = (X_norm, y_train, reglambda), method = 'BFGS').x
# Back to a column vector of shape (n, 1).
optimal_theta = np.reshape(optimal_theta, (np.shape(optimal_theta)[0],1))
# NOTE(review): the next line discards the intercept theta[0], and theta was
# fitted on standardized features -- confirm that whatever consumes
# optimal_theta is meant to work without the bias term.
optimal_theta = optimal_theta[1:]

In [9]:
# Define prediction function that takes unprocessed data X and theta
# to produce prediction
def predict(X, theta):
    """Turn the linear scores ``X @ theta`` into 0/1 labels.

    A row is labelled 0 when its score is strictly below 0.3 and 1 otherwise
    (so a NaN score, which is not below 0.3, is labelled 1).

    Returns a float array of shape (number of rows of X, 1).
    """
    scores = np.matmul(X, theta)
    # Vectorized form of "0 if score < 0.3 else 1".
    labels = np.where(scores < 0.3, 0.0, 1.0)
    return labels.reshape(np.shape(X)[0], 1)

In [10]:
# Derive the ids and the feature matrix from `test_data` instead of slicing
# `X_test` in place: re-running this cell then always starts from the same
# frame and can never strip an additional column.
output = test_data[['PassengerId']].to_numpy()
X_test = test_data.drop(columns=['PassengerId']).to_numpy()

# Two columns: passenger id, predicted label.
output = np.hstack((output,predict(X_test, optimal_theta)))
output_df = pd.DataFrame(output, columns = ['PassengerId', 'Survived'], dtype=int)
output_df.to_csv('output.csv',index=False)

In [11]:
# Training accuracy
# predict() returns 0/1 labels, so prediction + target equals 2 where both
# are 1 and 0 where both are 0 (assuming 0/1 targets) -- those are the hits.
pred_train = predict(X_train, optimal_theta) + y_train
true_true = np.sum(pred_train == 2)
true_false = np.sum(pred_train == 0)
# Fraction of training rows that were classified correctly.
(true_true + true_false) / y_train.shape[0]