## 1 - Packages
- [numpy](https://www.numpy.org)
- [pandas](https://pandas.pydata.org)



In [1]:
import numpy as np
import pandas as pd


## 2 - Load the dataset
Load the [dataset](https://www.kaggle.com/c/titanic/data), the dataset already split for you.



In [2]:
train_data = pd.read_csv("train.csv", delimiter = ',')
test_data = pd.read_csv("test.csv", delimiter = ',')
test_result = pd.read_csv("gender_submission.csv", delimiter = ',')

test_data["Survived"] = test_result["Survived"]

## 3 - prepare the dataset 


In [3]:
def prepare_data(table):
    X = table.drop(["PassengerId", "Name", "Ticket", "Cabin", "Survived"], axis = 1)
    Y = table["Survived"]
    
    # fix "Age" 
    age_avg = round(X["Age"].sum() / (len(X) - X["Age"].isna().sum()), 1)
    X["Age"].fillna(age_avg, inplace = True)
    #normalize Age
    X["Age"] = (X["Age"] - X["Age"].mean()) / X["Age"].std()
    
    
    # fix "Fare"
    Fare_avg = round(X["Fare"].sum() / (len(X) - X["Fare"].isna().sum()), 1)
    X["Fare"].fillna(Fare_avg, inplace = True)
    X["Fare"] = (X["Fare"] - X["Fare"].mean()) / X["Fare"].std()
    
    # fix Embarked 
    X["Embarked"] = X["Embarked"].map({'C' : 0.0, 'S' : 1.0, 'Q' : 2.0})
    X['Embarked'].fillna(-1, inplace = True)
    
    #fix sex 
    X['Sex'] = X['Sex'].map({'female' : 0.0, 'male' : 1.0})
    
    
    
    return X.values.T, Y.values.reshape(1, len(Y))



In [5]:
X_train, Y_train = prepare_data(train_data)
X_test, Y_test = prepare_data(test_data)


## 4 - Activation functions

- [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function)
- [relu](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))

In [6]:
def sigmoid(Z):
    """
    Argument:
    Z -- the input for the activation function
    
    Returns:
    A -- the output of the activation function
    cache -- dictionary containing Z
    """
    A = 1.0 / (1.0 + np.exp(-Z))
    cache = (Z)
    
    return A, cache
    

In [7]:
def sigmoid_backward(dA, cache):
    """
    Argument:
    Z -- the input for the activation function
    
    Returns:
    dZ -- gradients of the activations 
    """
    
    Z = cache

    s = 1.0 / (1.0 + np.exp(-Z))
    dZ = dA * s * (1 - s)
    
    return dZ

In [8]:
def relu(Z):
    """
    Argument:
    Z -- the input for the activation function
    
    Returns:
    A -- the output of the activation function
    cache -- dictionary containing Z
    """
    
    A = Z * (Z > 0)
    cache = (Z)
    
    return A, cache

In [9]:
def relu_backward(dA, cache):
    """
    Argument:
    Z -- the input for the activation function
    
    Returns:
    dZ -- gradients of the activations 
    """
    
    Z = cache
    dZ = dA * (Z > 0)
    
    return dZ


## 5 - Initialization
Initialize the weights matrices and biases vectors.

In [10]:
def initialize_parameters(n_x, n_h, n_y):
    """
    Argument:
    n_x -- number of units in the input layer
    n_h -- number of units in the hidden layer
    n_y -- number of units in the hidden layer
    
    Returns:
    parameters -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    """
    
    np.random.seed(1)
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))
    
    assert(W1.shape == (n_h, n_x))
    assert(b1.shape == (n_h, 1))
    assert(W2.shape == (n_y, n_h))
    assert(b2.shape == (n_y, 1))
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters

## 6 - Forward propagation

In [11]:
def linear_forward(A, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    A -- activations from previous layer
    W -- weights matrix
    b -- bias vector

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter 
    cache -- dictionary containing A, W and b; stored to compute backward propagation step
    """
    
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    
    return Z, cache

In [12]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer
    W -- weights matrix
    b -- bias vector
    activation -- the activation to be used in this layer

    Returns:
    A -- output of the activation function
    cache -- dictionary stored linear_cache and activation_cache to compute backward propagation step
    """
    
    Z, linear_cache = linear_forward(A_prev, W, b)
    
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    
    if activation == "relu":
        A, activation_cache = relu(Z)
    
    cache = (linear_cache, activation_cache)
    
    return A, cache
        

## 7 - Cost function

In [13]:
def compute_cost(Yhat, Y):
    """
    Arguments:
    Yhat -- probabilities vector
    Y -- labels vector
    
    Returns:
    cost -- cross-entropy cost
    """
    m = Y.shape[1]
    
    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(Yhat)) + np.multiply(1 - Y, np.log(1 - Yhat)))
    cost = np.squeeze(cost)
    
    return cost


## 8 - Backward propagation

In [14]:
def linear_backward(dZ, cache):
    """
    Arguments:
    dZ -- gradients of activations
    cache -- tuple of values (A_prev, W, b)
    
    Returns:
    dA_prev -- gradients of activations
    dW -- gradients of weights
    db -- gradients of biases
    """
    
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = np.dot(dZ, cache[0].T) / m
    db = np.sum(dZ, axis= 1, keepdims= True) / m
    dA_prev = np.dot(W.T, dZ)
    
    
    return dA_prev, dW, db

In [15]:
def linear_activation_backward(dA, cache, activation):
    """
    Arguments:
    dA -- gradients of activations
    cache -- tuple of values (linear_cache, activation_cache)
    activation -- activation function to use
    
    Returns:
    dA_prev -- gradients of activations
    dW -- gradients of weights
    db -- gradients of biases
    """
    linear_cache, activation_cache = cache
    
    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    
    
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db

## 9 - Update parameters

In [16]:
def update_parameters(parameters, grads, learning_rate = 1.2):
    """    
    Arguments:
    parameters -- dictionary containing the parameters
    grads -- dictionary contaning the gradients
    learning_rate -- the learning rate

    Returns:
    parameters -- dictionary containing updated parameters 
    """
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
  
    dW1 = grads['dW1']
    db1 = grads["db1"]
    dW2 = grads["dW2"]
    db2 = grads["db2"]
    
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    
    return parameters

## 10 - The model
Everything come together here

In [17]:
def model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000):
    """
    X -- training examples
    Y -- training labels
    layers_dims -- layers dimensions
    learning_rate -- the learning rate
    num_iterations -- number of iterations for gradient descent
    
    Returns:
    parameters -- the parameters for the final iteration
    """
    grads = {}
    m = X.shape[1]
    (n_x, n_h, n_y) = layers_dims
    
    parameters = initialize_parameters(n_x, n_h, n_y)
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    for i in range(0, num_iterations):
        A1, cache1 = linear_activation_forward(X, W1, b1, "relu")
        A2, cache2 = linear_activation_forward(A1, W2, b2, "sigmoid")
        
        cost = compute_cost(A2, Y)
        
        dA2 = -(np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
        
        dA1, dW2, db2  = linear_activation_backward(dA2, cache2, "sigmoid")
        dA0, dW1, db1 = linear_activation_backward(dA2, cache1, "relu")
        
        
        grads['dW1'] = dW1
        grads['db1'] = db1
        grads['dW2'] = dW2
        grads['db2'] = db2
        
        parameters = update_parameters(parameters, grads, learning_rate)
        
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        
        if (i + 1) % 100 == 0:
            print("Cost after iteration {}: {}".format(i + 1, np.squeeze(cost)))
        
    return parameters
    
        

In [18]:
parameters = model(X_train, Y_train, layers_dims = (7, 8, 1))


Cost after iteration 100: 0.6828774561981352
Cost after iteration 200: 0.6678030684920057
Cost after iteration 300: 0.6460550024098107
Cost after iteration 400: 0.6242610823755671
Cost after iteration 500: 0.6049439815656301
Cost after iteration 600: 0.5878589234151088
Cost after iteration 700: 0.5725023463016533
Cost after iteration 800: 0.5589844758384467
Cost after iteration 900: 0.5471854603136336
Cost after iteration 1000: 0.5372414351292499
Cost after iteration 1100: 0.5289295758627119
Cost after iteration 1200: 0.521969799903805
Cost after iteration 1300: 0.5160715160656766
Cost after iteration 1400: 0.5108690715467288
Cost after iteration 1500: 0.5061433494174733
Cost after iteration 1600: 0.501572733314198
Cost after iteration 1700: 0.4974340595883999
Cost after iteration 1800: 0.49364395301528635
Cost after iteration 1900: 0.4901557332570256
Cost after iteration 2000: 0.48684995225119143
Cost after iteration 2100: 0.48368303307776483
Cost after iteration 2200: 0.4802736651659

## 11 - Prediction

Use `X_test` and `Y_test` to make predictions

In [19]:
def predict(X, Y, parameters):
    """
    X -- test examples
    Y -- test labels
    parameters: gradient descent parameters
    
    Returns:
    percent -- the percentage of correction
    """
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    A1, cache = linear_activation_forward(X, W1, b1, "relu")
    A2, cache = linear_activation_forward(A1, W2, b2, "sigmoid")
    
    m = Y.shape[1]
    
    predictions = (Y == (A2 > 0.5))
    percent = np.sum(predictions) / m * 100
    
    return percent
    


In [20]:
print("{0:.2f}%".format(predict(X_test, Y_test, parameters)))


96.17%
