In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the Titanic training set and test set from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv("../input/titanic/train.csv"),
    pd.read_csv("../input/titanic/test.csv"),
)

In [3]:
# Preview the first five rows of the training data.
train_data.head(n=5)

In [4]:
# Feature selection -- initial choice: Pclass, Age, SibSp, Parch, Fare.
# The training frame keeps those columns plus the Survived label, and any
# training row with a missing value in a kept column is dropped.
# The test frame keeps PassengerId plus the same features; no test rows are dropped.
train_data = train_data.loc[
    :, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']
].dropna(axis='index')
test_data = test_data.loc[
    :, ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
]

In [5]:
# Separate the label column (kept as a one-column frame) from the feature columns.
train_y = train_data.loc[:, ['Survived']]
train_X = train_data.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [6]:
# Convert the three dataframes to numpy arrays.
# X_test is built from test_data, so its first column is PassengerId.
X_train, y_train, X_test = (
    frame.to_numpy() for frame in (train_X, train_y, test_data)
)

In [7]:
# Define function for feature normalization (X is the feature matrix)
# Function returns a normalized version of X where the mean value of
# each feature is 0 and the standard deviation is 1.
# Constant features (standard deviation of 0) are returned as all zeros.

def feature_normalization(X):
    """Standardize every column of X to zero mean and unit standard deviation.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per sample and one column per feature.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape as X; X itself is not modified.
        Each column has its mean subtracted and is divided by its population
        standard deviation (numpy's default, ddof=0). A column whose standard
        deviation is exactly zero is only centered, so it comes back as zeros
        rather than NaN.
    """
    mean = X.mean(axis=0)
    centered = X - mean
    sigma = np.std(centered, axis=0)
    # A constant column has sigma == 0; dividing would produce 0/0 = NaN and
    # poison every later matrix product. Dividing by 1 leaves the centered
    # zeros untouched.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return np.divide(centered, safe_sigma)

In [8]:
# Define function for gradient descent, updating parameter theta
# by taking num_iters steps of gradient step with learning rate alpha.
# Returns the theta that has the lowest cost (the starting theta included).

def gradient_descent(X, y, theta, alpha, num_iters):
    """Run batch gradient descent for linear regression and return the best theta.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        Design matrix (this notebook passes normalized features with a
        leading column of ones).
    y : numpy.ndarray, shape (m, 1)
        Target column vector.
    theta : numpy.ndarray, shape (n, 1)
        Starting parameter column vector.
    alpha : float
        Learning rate.
    num_iters : int
        Number of gradient steps to take.

    Returns
    -------
    numpy.ndarray, shape (n, 1)
        The parameter vector with the lowest cost (as measured by
        ``computecost``) among the starting theta and every iterate. With
        ``num_iters == 0`` the starting theta is returned unchanged.
    """
    m = np.shape(y)[0]
    # Seed the best-so-far with the starting point. This way a zero-iteration
    # call returns the supplied theta (not an all-zeros vector), and a
    # diverging run can never return something worse than its input.
    best_theta = theta
    best_cost = computecost(X, y, theta)
    for _ in range(num_iters):
        residual = np.matmul(X, theta) - y
        # X^T . residual is the column-wise sum of residual * X, already
        # shaped (n, 1) like theta.
        gradient_sum = np.matmul(np.transpose(X), residual)
        theta = theta - alpha * gradient_sum / m
        cost = computecost(X, y, theta)
        # A NaN cost compares False here, so a blown-up step is never kept.
        if cost < best_cost:
            best_cost = cost
            best_theta = theta

    return best_theta

In [9]:
# Squared-error cost for linear regression
def computecost(X, y, theta):
    """Return the sum of squared residuals of ``X @ theta`` against ``y``, divided by 2*m.

    ``m`` is the length of the first axis of ``y`` (the number of samples).
    """
    sample_count = np.shape(y)[0]
    residual = np.matmul(X, theta) - y
    return np.sum(residual * residual) / (2 * sample_count)

In [10]:
# Define prediction function that takes a matrix X (identifier in the first
# column, model inputs in the remaining columns) and theta, and produces a
# 0/1 prediction per row
def predict(X, theta):
    """Threshold the linear score of each row at 0.5.

    Parameters
    ----------
    X : numpy.ndarray, shape (rows, 1 + k)
        Column 0 is copied through to the output unchanged; columns 1..k are
        multiplied by ``theta``.
    theta : numpy.ndarray, shape (k, 1)
        Weights matching the columns of X after the first.

    Returns
    -------
    numpy.ndarray, shape (rows, 2), float
        Column 0 is X's first column; column 1 is 0 where the score is below
        0.5 and 1 otherwise. A NaN score is not below 0.5, so it yields 1.
    """
    identifiers = X[:, 0]
    scores = np.matmul(X[:, 1:], theta)
    # Flatten the score column so it lines up with the identifier vector.
    below_threshold = np.reshape(scores, np.shape(identifiers)) < 0.5
    pred = np.zeros((np.shape(X)[0], 2))
    pred[:, 0] = identifiers
    pred[:, 1] = np.where(below_threshold, 0, 1)
    return pred

In [11]:
# Normalize X training data
X_norm = feature_normalization(X_train)
# Add bias term
bias = np.ones((np.shape(X_norm)[0],1))
X_norm = np.hstack((bias, X_norm))
# Set initial theta to 0, learning rate to 1, and number of iterations to 50
initial_theta = np.zeros((np.shape(X_norm)[1],1))
alpha = 1
num_iters = 50
# theta keeps its bias weight in row 0; it is used as-is for prediction below.
theta = gradient_descent(X_norm, y_train, initial_theta, alpha, num_iters)

# The model was fitted on standardized features plus a bias column, so the
# test rows must go through the same transformation, using the TRAINING
# statistics, before theta is applied to them.
train_mean = X_train.mean(axis=0)
train_sigma = X_train.std(axis=0)
test_ids = X_test[:, :1]       # PassengerId column, kept 2-D for hstack
test_features = X_test[:, 1:]
# Test rows are not dropped for missing values. A NaN feature would make the
# whole score NaN, which predict() labels as 1, so fill it with the training
# mean (this becomes 0 after standardization).
test_features = np.where(np.isnan(test_features), train_mean, test_features)
test_norm = (test_features - train_mean) / train_sigma
test_bias = np.ones((np.shape(test_norm)[0], 1))
# predict() strips the first column (ids) and multiplies the rest by theta,
# so the layout is [id, bias, normalized features].
X_test_prepared = np.hstack((test_ids, test_bias, test_norm))
output = predict(X_test_prepared, theta)

In [12]:
# Build the two-column submission table as integers and write it to
# output.csv without the index column.
output_df = pd.DataFrame(
    data=output,
    columns=['PassengerId', 'Survived'],
    dtype=int,
)
output_df.to_csv('output.csv', index=False)