In [69]:
# Useful starting lines

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import datetime
import myFunctions1 as my


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading and Cleaning the training data

In [70]:
# Read the training set. Helpers that are used without a prefix in this notebook
# (e.g. load_csv_data) are expected to come from this wildcard import.
# NOTE(review): a wildcard import hides where names come from; explicit imports would be clearer.
from myFunctions1 import *
DATA_TRAIN_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/train.csv' # TODO: download train data and supply path here (machine-specific absolute path)
# presumably y = labels, X = raw feature matrix, ids = event ids -- verify against load_csv_data
y, X, ids = load_csv_data(DATA_TRAIN_PATH)

In [71]:
# Sanity check: dimensions of the label vector and of the raw feature matrix.
print(y.shape, X.shape)

(250000,) (250000, 30)


In [72]:
# Missing measurements are encoded as -999 in the raw data, so we count, for
# every instance (row of X), how many of its attributes are missing.
# The rows are counted by position in one vectorized pass: nothing is assumed
# about the values stored in `ids`, and no Python-level loop over the
# instances is needed.
count_miss_instances = np.sum(X == -999.0, axis=1).astype(float).reshape(-1, 1)
print(np.median(count_miss_instances))
print(np.mean(count_miss_instances))
    

7.0
6.320208


As can be seen above, on average about 6 field/attribute values are missing per instance. We therefore perform a further, feature-wise check for the missing values.

In [73]:
# Same check per feature (column of X): number of instances in which the
# attribute carries the missing-value marker -999. Kept as a float column
# vector of shape (n_features, 1) and printed as a single row.
count_miss_features = np.sum(X == -999.0, axis=0).astype(float).reshape(-1, 1)
print(count_miss_features.T)

[[  38114.       0.       0.       0.  177457.  177457.  177457.       0.
        0.       0.       0.       0.  177457.       0.       0.       0.
        0.       0.       0.       0.       0.       0.       0.   99913.
    99913.   99913.  177457.  177457.  177457.       0.]]


Here we see that only some of the features (11 of them, which is not few) have missing values. We fill in these values with the mean of the feature computed over its observed entries; in practice this is done by inserting zeros at the missing (-999) positions of the standardized data matrix.

In [74]:
def standardize_badFeatures(X):
    """Standardize features that contain missing values (encoded as -999).

    For every column the mean and standard deviation are computed over the
    observed entries only (those different from -999). The column is then
    centered and scaled with these statistics and the missing entries are set
    to 0, i.e. to the mean of the standardized feature, so that they do not
    pull the model in either direction.

    The computation runs on a float copy, so the array passed in is left
    untouched and integer input is not truncated. A column with zero spread is
    only centered (not divided by 0), and a column without any observed entry
    becomes all zeros with mean and std reported as 0.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)

    Returns
    -------
    X_std : float array of shape (n_samples, n_features), standardized copy
    mean_x : array of shape (n_features,), mean of the observed entries
    std_x : array of shape (n_features,), std of the observed entries
    """
    X = np.array(X, dtype=float)  # np.array copies: the caller's data is not modified
    n_features = X.shape[1]
    mean_x = np.zeros((n_features,))
    std_x = np.zeros((n_features,))
    for d in range(n_features):
        missing = X[:, d] == -999
        observed = X[~missing, d]
        if observed.size > 0:
            mean_x[d] = np.mean(observed)
            std_x[d] = np.std(observed)
        X[:, d] -= mean_x[d]
        if std_x[d] > 0:
            # skip the division for constant columns, it would produce NaN
            X[:, d] /= std_x[d]
        X[missing, d] = 0
    return X, mean_x, std_x


def clean_data(X):
    """Standardize X, handling the features with missing values separately.

    A feature is "bad" when at least one of its entries equals -999 and "good"
    otherwise. Good features are passed to standardize, bad features to
    standardize_badFeatures, and the two results are stacked side by side with
    the good block first.

    Returns the stacked matrix together with the concatenated means and
    standard deviations reported by the two helpers (good block first).
    """
    # columns that hold at least one -999 entry
    bad_columns = [col for col in range(X.shape[1]) if np.any(X[:, col] == -999)]

    bad_part = X[:, bad_columns]
    good_part = np.delete(X, bad_columns, axis=1)

    # each group gets its own standardization (see standardize_badFeatures)
    tX_good, mean_good, std_good = standardize(good_part)
    tX_bad, mean_bad, std_bad = standardize_badFeatures(bad_part)

    # to work with the "good" features only, return the three *_good values instead
    return (
        np.hstack((tX_good, tX_bad)),
        np.hstack((mean_good, mean_bad)),
        np.hstack((std_good, std_bad)),
    )

# Now tX already has ones in the first column...
# (presumably prepended by standardize -- verify against myFunctions1).
# clean_data stacks the features without missing values first and the ones with
# missing values last, so the columns of tX are no longer in the raw order of X;
# mean_x and std_x are concatenated in that same good-then-bad order.
tX, mean_x, std_x = clean_data(X)

Do your crazy machine learning thing here :) ...

## Method 1: Linear regression using gradient descent: 

least_squares_GD (y, tx, gamma, max_iters) 

In [42]:
# Linear regression using gradient descent

def compute_loss(y, tx, w):
    """Least-squares cost (1/2N) * e^T e for the weights w, with e = y - tx.w.

    y holds the labels, tx the data matrix and N is len(y).
    """
    residual = y - np.dot(tx, w)
    scale = 1 / (2 * len(y))
    return scale * np.dot(residual.T, residual)

def compute_gradient(y, tx, w):
    """Gradient of the least-squares cost at w: (-1/N) * tx^T e, e = y - tx.w.

    N is len(y); the result has one entry per weight.
    """
    residual = y - np.dot(tx, w)
    return (-1 / len(y)) * np.dot(tx.T, residual)

def gradient_descent(y, tx, initial_w, max_iters, gamma):
    """Gradient descent algorithm.

    Starting from initial_w, performs max_iters updates w <- w - gamma * grad,
    where grad comes from compute_gradient, and prints the loss at every step.

    Returns (losses, ws):
      losses -- loss (compute_loss) measured at each step before its update
      ws     -- initial_w followed by the weights obtained after each update
    """
    w = initial_w
    ws = [w]
    losses = []
    last_step = max_iters - 1

    for step in range(max_iters):
        # loss and gradient are both evaluated at the current weights
        loss = compute_loss(y, tx, w)
        w = w - gamma * compute_gradient(y, tx, w)

        ws.append(np.copy(w))
        losses.append(loss)

        print("Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=step, ti=last_step, l=loss))

    return losses, ws

## Generate predictions with method 1 

In [8]:
# Read the test set; the first returned value is discarded.
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here (machine-specific absolute path)
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [9]:
# The weights are trained on tX, i.e. on [bias | features without missing
# values | features with missing values], standardized with the TRAINING
# statistics mean_x / std_x. The test features must go through exactly the
# same transformation, otherwise each weight is applied to the wrong column
# and to differently scaled values.
has_missing = np.any(X == -999, axis=0)  # judged on the training matrix X, as clean_data does
train_layout = np.concatenate((np.flatnonzero(~has_missing), np.flatnonzero(has_missing)))

x = X_test[:, train_layout].astype(float)  # a copy: X_test keeps the raw values
assert x.shape[1] == mean_x.shape[0] == std_x.shape[0], "mean_x/std_x do not match the feature count"
missing = x == -999  # remember the placeholders before they get shifted and scaled
x = x - mean_x
scalable = std_x > 0
x[:, scalable] = x[:, scalable] / std_x[scalable]
x[missing] = 0  # same imputation as for the training data: mean of the standardized feature

tX_test = np.hstack((np.ones((x.shape[0], 1)), x))
mean_tX_test, std_tX_test = mean_x, std_x  # the statistics that were actually applied
print(tX_test.shape)

(568238, 31)


In [10]:
# Run gradient descent on the cleaned training matrix, starting from weights
# drawn uniformly from [0, 1).
# NOTE(review): no random seed is set, so the starting point differs between runs.
initial_w = np.random.random(tX.shape[1],)
max_iters=1000  # number of gradient steps
gamma=0.02  # step size
losses,ws = gradient_descent(y, tX, initial_w, max_iters, gamma)


In [11]:
# Write the submission for method 1 using the last weight vector produced by
# gradient descent (ws[-1]).
OUTPUT_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op1.csv' # TODO: fill in desired name of output file for submission
# presumably predict_labels maps the linear scores to class labels -- verify in myFunctions1
y_pred = predict_labels(ws[-1], tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## Method 2: Linear regression using stochastic gradient descent:
least_squares_SGD (y, tx, gamma, max_iters) 

In [79]:
def compute_stoch_gradient(y, tx, w):
    """Least-squares gradient evaluated on the given (mini)batch.

    Same expression as for full gradient descent, (-1/N) * tx^T e with
    e = y - tx.w, where N = len(y) is the size of the batch passed in.
    """
    residual = y - np.dot(tx, w)
    return (-1 / (len(y))) * (np.dot(tx.T, residual))


def stochastic_gradient_descent(y, tx, gamma, max_epochs):
    """Stochastic gradient descent algorithm.

    The weights start from a draw of np.random.randn. In each of the
    max_epochs passes, every minibatch yielded by batch_iter(y, tx, 1)
    (batch size 1) triggers one update w <- w - gamma * grad.

    Returns (losses, ws):
      losses -- minibatch loss measured before each update
      ws     -- the initial weights followed by the weights after each update
    """
    batch_size = 1
    w = np.random.randn(tx.shape[1],)
    ws = [w]
    losses = []

    for _ in range(max_epochs):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            # loss and gradient are both evaluated at the current weights
            loss = compute_loss(minibatch_y, minibatch_tx, w)
            w = w - gamma * compute_stoch_gradient(minibatch_y, minibatch_tx, w)

            ws.append(np.copy(w))
            losses.append(loss)

    return losses, ws

## Generate predictions with method 2 

In [80]:
# Define the parameters of the algorithm.
# NOTE(review): this wildcard re-import runs after the notebook has defined its
# own functions; if myFunctions1 exports any of the same names, they silently
# replace the notebook versions from here on -- confirm this is intended.
from myFunctions1 import *

max_epochs = 400  # number of passes of the outer loop in stochastic_gradient_descent
gamma = 0.02  # step size


# Start SGD.
start_time = datetime.datetime.now()
gradient_losses, gradient_ws = stochastic_gradient_descent(y, tX,gamma, max_epochs)
end_time = datetime.datetime.now()

# Wall-clock duration of the run in seconds; it is computed but not displayed,
# because the print below is commented out.
exection_time = (end_time - start_time).total_seconds()
#print("SGD: execution time={t:.3f} seconds".format(t=exection_time))

In [81]:
# Number of recorded losses; stochastic_gradient_descent appends one loss per
# weight update, so this is also the number of updates that were performed.
len(gradient_losses)

400

In [82]:
# Read the test set again; tX_test now holds the raw test features as returned
# by load_csv_data, and the first returned value is discarded.
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here (machine-specific absolute path)
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)





In [83]:
# Dimensions of the test matrix as loaded from the CSV.
tX_test.shape

(568238, 30)

In [84]:
# Put the raw test features through the same transformation as the training
# data (see clean_data), so that every weight meets the feature it was trained
# on:
#   * columns are arranged as [features without missing values | features
#     with missing values], because mean_x / std_x are stored in that order;
#   * the training mean is always subtracted and the training std divides
#     every column with a positive spread;
#   * the -999 placeholders are set to 0 after standardization, exactly as
#     for the training matrix.
has_missing = np.any(X == -999, axis=0)  # judged on the training matrix X, as clean_data does
train_layout = np.concatenate((np.flatnonzero(~has_missing), np.flatnonzero(has_missing)))

x = tX_test[:, train_layout].astype(float)  # a copy: tX_test stays raw, so this cell can be re-run safely
assert x.shape[1] == mean_x.shape[0] == std_x.shape[0], "mean_x/std_x do not match the feature count"
missing = x == -999  # remember the placeholders before they get shifted and scaled
x = x - mean_x
scalable = std_x > 0
x[:, scalable] = x[:, scalable] / std_x[scalable]
x[missing] = 0

# bias column first, like in tX
tx_test = np.hstack((np.ones((x.shape[0],1)), x))

tx_test.shape

(568238, 31)

In [85]:
# Write the submission for method 2 using the last weight vector produced by
# stochastic gradient descent (gradient_ws[-1]).
OUTPUT_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op2.csv' # TODO: fill in desired name of output file for submission
# presumably predict_labels maps the linear scores to class labels -- verify in myFunctions1
y_pred = predict_labels(gradient_ws[-1], tx_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## Method 3: Least squares regression using normal equations:
least_squares (y, tx)

In [34]:
def least_squares(y,tx):
    """Optimal least-squares weights from the normal equations.

    Solves (tx^T tx) w = tx^T y for w. The linear system is handed to
    np.linalg.solve instead of multiplying by an explicitly computed inverse
    of tx^T tx: this needs less work and loses less precision when the
    features are strongly correlated.

    Parameters
    ----------
    y : array of shape (n_samples,), the targets
    tx : array of shape (n_samples, n_features), the data matrix

    Returns
    -------
    The weight vector of shape (n_features,).

    Raises
    ------
    numpy.linalg.LinAlgError
        If tx^T tx is singular.
    """
    gram = np.dot(tx.T, tx)
    weights = np.linalg.solve(gram, np.dot(tx.T, y))
    return weights

## Generate predictions with method 3

In [37]:
import myFunctions1 as my
print(tX.shape)
# Fit the normal equations on the features returned by my.build_poly(tX, 3)
# (presumably a polynomial expansion of degree 3 -- verify in myFunctions1).
weights_method_3 = least_squares(y,my.build_poly(tX,3))
print(weights_method_3.shape)

(250000, 31)
(91,)


In [39]:
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

# weights_method_3 was fitted on build_poly(tX, 3), where tX is the cleaned
# training matrix: [bias | features without missing values | features with
# missing values], standardized with mean_x / std_x and with missing entries
# set to 0. The test features get the same treatment before the expansion,
# otherwise the weights are applied to raw, differently ordered columns.
has_missing = np.any(X == -999, axis=0)  # judged on the training matrix X, as clean_data does
train_layout = np.concatenate((np.flatnonzero(~has_missing), np.flatnonzero(has_missing)))

x = tX_test[:, train_layout].astype(float)
assert x.shape[1] == mean_x.shape[0] == std_x.shape[0], "mean_x/std_x do not match the feature count"
missing = x == -999  # remember the placeholders before they get shifted and scaled
x = x - mean_x
scalable = std_x > 0
x[:, scalable] = x[:, scalable] / std_x[scalable]
x[missing] = 0

#Adding the column of ones (for the bias term)
ek = np.ones((x.shape[0],1))
tX_test = np.c_[ ek , x]
print(tX_test.shape)

import myFunctions1 as my
tX_test=my.build_poly(tX_test,3)
tX_test.shape

(568238, 31)


(568238, 91)

In [40]:
# Write the submission for method 3 using the normal-equation weights.
OUTPUT_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op3.csv' # TODO: fill in desired name of output file for submission
# presumably predict_labels maps the linear scores to class labels -- verify in myFunctions1
y_pred = predict_labels(weights_method_3, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)