In [1]:
import pandas as pd
import numpy as np

In [2]:
input_df = pd.read_csv('./data/train.csv').set_index('id')
input_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209,0
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


In [42]:
def _initialize_with_zeros(n_feat):
    '''
    This function initializes the model parameters for Logistic Regression with a given dimension of 
    number of training features: w = weights and b = bias
    '''
    w = np.zeros((n_feat,1))
    b = 0.
    return w,b

def _sigmoid(z):
    '''
    This function computes the sigmoid function for a given inpiut z
    '''
    a = 1/(1+np.exp(-z))
    return a

def _propagation(w, b, X, Y):
    '''
    This function first performs the forward propagation step of Logistic Regression to compute:
    Z = wT.X + b,
    a = sigmoid(Z),
    J (cost function) = 1/m.sum(L(ai, yi)) = -1/m(yi.log(ai)+(1-yi).log(1-ai))
    
    NOTE: X must be of the shape (nx, m) and Y must be of the shape (1xm)
    
    Then, it performs backward propagation to compute:
    dw = 1/m.(X.dZ.T)
    db = 1/m.sum(dZ)
    where dZ = a - Y
    
    This funtion returns J, dw and db which will be used in the optimization step to perform gradient descent
    '''
    m = X.shape[1]
    Z = np.dot(w.T, X) + b
    a = _sigmoid(Z)
    cost = (-1/m)*(np.sum((Y*np.log(a)) + ((1-Y)*np.log(1-a))))
    
    dZ = a - Y
    dw = 1/m*(np.dot(X,dZ.T))
    db = 1/m*np.sum(dZ)
    
    gradients = {'dw': dw,
                'db': db}
    return gradients, cost

def _model_optimization(w, b, X, Y, learning_rate, iterations):
    '''
    This function performs gradient descent to find the optimal values of the model parameters w and b using the 
    following update steps:
    w := w - learning_rate*dw
    b := b - learning_rate*b
    These upate steps are run "iterations" times, unlike the version of gradient descent where we have a 
    stopping criteria of (no. of iterations is determined by) minimal change in cost function.
    
    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients of the weights and bias with respect to the cost function
    costs -- list of all the costs computed during the optimization, this will be used to plot the learning curve.
    '''
    costs = []
    #For each step of Gradient Descent Algorithm, compute the gradients and cost values and update w and b.
    #Stop when stopping criteria is reached.
    for i in range(iterations):
        #Compute gradients and cost:
        gradients, cost = _propagation(w, b, X, Y)
        dw = gradients["dw"] #From the dictionary
        db = gradients["db"] #From the dictionary
        
        #Update w and b:
        w = w - learning_rate*dw
        b = b - learning_rate*db
        
        #Record the costs and print after every 100 iterations:
        if(i % 100==0):
            costs.append(cost)
            print(f'Cost after iteration {i} is {cost}')
            
    params = {'w': w,
             'b': b}
    
    gradients = {'dw': dw,
                'db': db}
    
    return params, gradients, costs

def _predict(w, b, X):
    '''
    This function uses the optimized values of model parameters w, b to predict y_hat = a for each test example.
    
    y_hat = a = sigmoid(z)
    
    y_hat = 0 if a <= 0.5, else 1. (0.5 is the threshold value to determine the class)
    '''
    
    m = X.shape[1]
    y_hat = np.zeros((1,m))
    Z = np.dot(w.T, X) + b
    a = _sigmoid(Z)
    
    for i in range(a.shape[1]):
        y_hat[0][i] = 1 if a[0][i] > 0.5 else 0
    return y_hat

def _model(X_train, Y_train, X_test, Y_test, learning_rate, iterations):
    '''
    This function combines all the pieces above to create the Logistic Regression model with Gradient Descent.
    lea
    Returns: dictionary containing all the information about the model - costs, y_hat_train, y_hat_test, 
             w, b, learning_rate, iterations
    '''
    
    #Step 1: Initialize parameters with zeros:
    m_train = X_train.shape[1]
    n_feat = X_train.shape[0]
    
    w, b = _initialize_with_zeros(n_feat)
    print(f"Step 1: b = {b}")
    
    #Step 2: Run fwd and backward propagation steps + gradient descent to learn the parameters w, b:
    params, gradients, costs = _model_optimization(w, b, X_train, Y_train, learning_rate, iterations)
    w = params["w"]
    b = params["b"]
    print(f"Step 2: b = {b}")

    #Step 3: Use the learned w, b from Step 2 to make predictions on training and test datasets:
    y_pred_train = _predict(w, b, X_train)
    y_pred_test = _predict(w, b, X_test)
    
    #Print tarining/test errors (mean absolute error):
    train_error = np.mean(np.abs(Y_train - y_pred_train))*100
    test_error = np.mean(np.abs(Y_test - y_pred_test))*100
    train_accuracy = 100 - train_error
    test_accuracy = 100 - test_error
    print('Step 3:')
    print(f'Train accuracy = {train_accuracy}')
    print(f'Test accuracy = {test_accuracy}')
    
    d = {"costs": costs,
         "y_hat_train": y_pred_train,
         "y_hat_test": y_pred_test,
         "w": w,
         "b": b,
         "learning_rate": learning_rate,
         "iterations": iterations}
    return d

In [45]:
#Split dataset into training and test data:
#First shuffle the input_data df and then split into 550k and 50k
input_df_shuffled = input_df.iloc[np.random.permutation(input_df.shape[0])].reset_index(drop = True)
train_data = input_df_shuffled.loc[:550000-1]
# print(train_data.shape)
# print(train_data.head())
test_data = input_df_shuffled.loc[550000:]
# print(test_data.shape)

#Get the matrices X_train and X_test in the shape of (nx, m) and y)train and y_test in the hshape of (1, m):
X_train = train_data.drop(columns='target').T
# print(X_train.shape)
y_train = np.array(train_data[['target']].T)
# print(type(y_train))

X_test = test_data.drop(columns='target').T
# print(X_test.shape)
y_test = np.array(test_data[['target']].T)
# print(y_test.shape)

#Run Logistic Regression and print the output:
logistic_regression = _model(X_train, y_train, X_test, y_test, learning_rate = 0.000001, iterations = 305)

Step 1: b = 0.0
Cost after iteration 0 is 0.6931471805599452
Cost after iteration 100 is 0.6928911155893501
Cost after iteration 200 is 0.6928288536420201
Cost after iteration 300 is 0.6927981111986126
Step 2: b = 2.363623692969738e-06
Step 3:
Train accuracy = 50.990545454545455
Test accuracy = 50.792


In [46]:
logistic_regression

{'costs': [0.6931471805599452,
  0.6928911155893501,
  0.6928288536420201,
  0.6927981111986126],
 'y_hat_train': array([[1., 0., 1., ..., 0., 1., 1.]]),
 'y_hat_test': array([[1., 1., 0., ..., 0., 1., 1.]]),
 'w': array([[ 8.66824018e-07],
        [-1.12280638e-07],
        [-5.79875000e-05],
        [-3.41361751e-06],
        [ 2.40002759e-06],
        [-8.25142907e-07],
        [ 8.32607245e-06],
        [ 9.00937905e-06],
        [ 3.06183680e-05],
        [-1.57368051e-06],
        [ 1.49932891e-05],
        [ 9.86394063e-06],
        [ 5.36921573e-07],
        [ 6.83392882e-06],
        [ 2.40892002e-06],
        [ 3.42436930e-06],
        [-3.26797447e-06],
        [-2.82513484e-06],
        [ 8.19742529e-06],
        [ 1.22791532e-06],
        [ 1.86611892e-06],
        [-3.14496924e-07],
        [-9.35664359e-06],
        [-4.76912262e-07],
        [ 2.90248071e-06],
        [-1.05731310e-05],
        [-3.28889285e-06],
        [ 1.88628832e-06],
        [ 1.62086841e-06],
   

In [None]:
#Checks:

In [28]:
w =  np.array([[1.], [2]])
b = 1.5
X = np.array([[1., -2., -1.], [3., 0.5, -3.2]])
Y = np.array([[1, 1, 0]])
gradients, cost = _propagation(w, b, X, Y)

assert type(gradients["dw"]) == np.ndarray
assert gradients["dw"].shape == (2, 1)
assert type(gradients["db"]) == np.float64


print ("dw = " + str(gradients["dw"]))
print ("db = " + str(gradients["db"]))
print ("cost = " + str(cost))

dw = [[ 0.25071532]
 [-0.06604096]]
db = -0.1250040450043965
cost = 0.15900537707692405


In [52]:
params, grads, costs = _model_optimization(w, b, X, Y, iterations=101, learning_rate=0.009)

print ("w = " + str(params["w"]))
print ("b = " + str(params["b"]))
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print("Costs = " + str(costs))

Cost after iteration 0 is 0.15900537707692405
Cost after iteration 100 is 0.10541138368032707
w = [[0.80795802]
 [2.05125464]]
b = 1.5956687730718366
dw = [[ 0.178049  ]
 [-0.04827102]]
db = -0.08860601121976527
Costs = [0.15900537707692405, 0.10541138368032707]


In [54]:
w = np.array([[0.1124579], [0.23106775]])
b = -0.3
X = np.array([[1., -1.1, -3.2],[1.2, 2., 0.1]])
print ("predictions = " + str(_predict(w, b, X)))

predictions = [[1. 1. 0.]]
