In [1]:
# Useful starting lines

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import datetime


## Loading and Cleaning the training data

In [2]:
from proj1_helpers import *
# Machine-specific absolute path to the training CSV; edit it to point at a local copy.
DATA_TRAIN_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/train.csv' # TODO: download train data and supply path here 
# load_csv_data is presumably provided by the star import of proj1_helpers above.
# Its three results are bound to y, X and ids and reused by the rest of the notebook.
y, X, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
# Sanity check: y should be 1-D and X should have one row per entry of y.
print(y.shape, X.shape)

(250000,) (250000, 30)


In [4]:
# Missing measurements are encoded with the sentinel value -999.0.
# Count, for every sample (row of X), how many of its features are missing.
# The comparison is vectorised over the whole matrix, so the result does not
# depend on how the sample ids are numbered (the previous loop indexed rows
# with `id - 100000` and shadowed the builtin `id`).
# The (n_samples, 1) float layout of the original counter is kept.
count_miss_instances = np.sum(X == -999.0, axis=1, keepdims=True).astype(float)
print(np.median(count_miss_instances))
print(np.mean(count_miss_instances))
    

7.0
6.320208


As can be seen above, each instance is missing about 6 field/attribute values on average. We therefore also perform a feature-wise check for missing values.

In [5]:
# Per-feature tally of the -999.0 sentinel, stored as an (n_features, 1) float
# column and printed as a single row.
count_miss_features = np.sum(X == -999.0, axis=0).astype(float).reshape(-1, 1)
print(count_miss_features.T)

[[  38114.       0.       0.       0.  177457.  177457.  177457.       0.
        0.       0.       0.       0.  177457.       0.       0.       0.
        0.       0.       0.       0.       0.       0.       0.   99913.
    99913.   99913.  177457.  177457.  177457.       0.]]


Here we see that only some of the features (11 of them, which is still not a small number) have missing values. We fill in these values with the feature mean computed from the observed entries: because the data are standardized, this amounts to inserting zeros at the missing (-999) positions of the standardized data matrix.

In [6]:
from helpers import *

def standardize_badFeatures(X):
    """Standardize features whose missing entries are encoded as -999.

    For every column, the mean and standard deviation are computed from the
    observed (non -999) entries only. Observed entries are centred and scaled
    with those statistics. Missing entries are set to 0, which is the column
    mean in standardized units, so they do not pull the fit in any direction.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Raw feature matrix. It is NOT modified: the function works on a
        floating-point copy.

    Returns
    -------
    tX : ndarray of shape (n_samples, n_features)
        Standardized copy of X with missing entries replaced by 0.
    mean_x : ndarray of shape (n_features,)
        Mean of the observed entries of each column (0 if nothing is observed).
    std_x : ndarray of shape (n_features,)
        Standard deviation of the observed entries of each column. A value of
        0 marks a constant or fully missing column; such a column is centred
        but not divided, so the output never contains NaN/inf.
    """
    # np.array(..., dtype=float) always copies, so the caller's matrix is
    # left untouched and integer input is not truncated on assignment.
    tX = np.array(X, dtype=float)
    missing = (tX == -999)

    n_features = tX.shape[1]
    mean_x = np.zeros((n_features,))
    std_x = np.zeros((n_features,))

    for d in range(n_features):
        observed = ~missing[:, d]
        if not observed.any():
            # Nothing to estimate from: keep mean/std at 0 and zero the column.
            tX[:, d] = 0
            continue

        values = tX[observed, d]
        mean_x[d] = np.mean(values)
        std_x[d] = np.std(values)

        centred = values - mean_x[d]
        if std_x[d] > 0:
            centred = centred / std_x[d]

        tX[observed, d] = centred
        tX[~observed, d] = 0

    return tX, mean_x, std_x


def clean_data(X):
    """Standardize X, handling features that contain the -999 marker separately.

    A feature is called "bad" as soon as one of its entries equals -999.
    The remaining ("good") features go through `standardize` (a helper that is
    not defined in this notebook, presumably from `helpers`), while the bad
    ones go through `standardize_badFeatures`.

    Returns
    -------
    tX, mean_x, std_x
        The good-feature results followed by the bad-feature results, stacked
        horizontally. The column order therefore differs from the one in X:
        good features come first, bad features last.
    """
    # Column-wise test for the sentinel, then the same index list the
    # per-column loop would have produced.
    has_missing = np.any(X == -999, axis=0)
    idx_badFeatures = [d for d in range(X.shape[1]) if has_missing[d]]

    # Split the matrix into the two groups (both are copies of X's data).
    bad_part = X[:, idx_badFeatures]
    good_part = np.delete(X, (idx_badFeatures), axis=1)

    # Each group gets its own standardization routine.
    tX_good, mean_good, std_good = standardize(good_part)
    tX_bad, mean_bad, std_bad = standardize_badFeatures(bad_part)

    # To work with the "good" features only, return the *_good values instead
    # of the stacked results below.
    tX = np.hstack((tX_good, tX_bad))
    mean_x = np.hstack((mean_good, mean_bad))
    std_x = np.hstack((std_good, std_bad))

    return tX, mean_x, std_x

# Now tX already has ones in the first column...
# mean_x / std_x are the statistics returned by clean_data, ordered with the
# features free of -999 first and the features containing -999 last.
tX, mean_x, std_x = clean_data(X)

In [7]:
#Counting the number of features that have missing values

#count_miss_features=np.zeros((X.shape[1],1))
#del_features=[]

# We create an array del_features (since we plan to drop these features) ...
# ... to store the index of the attributes with missing values 
#for d in range(X.shape[1]):
#    count_miss_features[d]=sum(X[:,d] == -999.0)
#    if count_miss_features[d]>0:
#            del_features=np.r_[del_features,d]
#print(del_features)
#print(sum(count_miss_features > 0))

# The features having indices in del_features computed above are now dropped from the data ... 
# ... thus reducing the tX matrix to 19 columns (deleting 11)

#X_not_cleaned = X # Let's keep a copy of the old data, before cleaning it
#X = np.delete(X, del_features, axis=1)
#print(X.shape)
    
# In the next step we standardize the data using functions from the helper file
    
#from helpers import *
#tX, mean_tX, std_tX = standardize(X)
#print(tX.shape) #this tX has the column of 1's appended in the beginning of the data matrix



Do your crazy machine learning thing here :) ...

## Method 1: Linear regression using gradient descent: 

least_squares_GD (y, tx, gamma, max_iters) 

In [81]:
# Linear regression using gradient descent

def least_squares_GD(y,tx,gamma,max_iters):
    """Fit a linear model by batch gradient descent from a random start.

    The starting weights are drawn from a standard normal distribution (one
    per column of tx) and passed to `gradient_descent` with step size `gamma`
    for `max_iters` iterations.

    Returns
    -------
    final_w : the last weight vector produced by the descent
    ws      : every weight vector visited, starting point included
    losses  : the loss recorded at each iteration
    """
    n_features = tx.shape[1]
    start_w = np.random.randn(n_features)

    loss_history, w_history = gradient_descent(y, tx, start_w, max_iters, gamma)

    return w_history[-1][:], w_history, loss_history
    
    

def compute_loss(y, tx, w):
    """Least-squares cost of the linear model: (1/(2N)) * e^T e with e = y - tx.w."""
    residual = y - np.dot(tx, w)
    n_samples = len(y)
    half_mean_factor = 1/(2*n_samples)
    return half_mean_factor*np.dot(residual.T, residual)

def compute_gradient(y, tx, w):
    """Gradient of the least-squares cost: (-1/N) * tx^T e with e = y - tx.w."""
    residual = y - np.dot(tx, w)
    scale = -1/len(y)
    return scale*np.dot(tx.T, residual)

def gradient_descent(y, tx, initial_w, max_iters, gamma):
    """Batch gradient descent on the least-squares cost.

    At every iteration the loss and gradient are evaluated at the current
    weights, then the weights move by `-gamma * gradient`. The loss stored
    (and printed) for an iteration is the one measured BEFORE that update.

    Returns
    -------
    losses : list with one loss per iteration
    ws     : list of weight vectors; ws[0] is `initial_w`, followed by a copy
             of the weights after each update
    """
    w_history = [initial_w]
    loss_history = []
    current_w = initial_w
    last_index = max_iters - 1

    for step in range(max_iters):

        print(step)

        # Evaluate the cost and its gradient at the current point.
        step_loss = compute_loss(y, tx, current_w)
        step_grad = compute_gradient(y, tx, current_w)

        # Move against the gradient.
        current_w = current_w - gamma*step_grad

        # Record the new weights and the loss that led to them.
        w_history.append(np.copy(current_w))
        loss_history.append(step_loss)

        print("Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=step, ti=last_index, l=step_loss))

    return loss_history, w_history

## Generate predictions with method 1 

In [82]:
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here
# The first value returned by load_csv_data is discarded; only the feature
# matrix and the ids are kept for the test set.
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [83]:
#X_test = np.delete(X_test, del_features, axis=1)
# NOTE(review): the training matrix tX comes from clean_data, which places the
# features containing -999 after the other features and zero-fills their
# missing entries. Here standardize() is applied to the raw test matrix in its
# original column order - confirm that the resulting columns line up with tX
# before using tX_test with weights trained on tX.
tX_test, mean_tX_test, std_tX_test = standardize(X_test)
print(tX_test.shape)

(568238, 31)


In [84]:
# Batch gradient descent on the cleaned training data, started from weights
# drawn uniformly from [0, 1). No seed is set in the visible cells, so the
# starting point (and the printed losses) change from run to run.
initial_w = np.random.random(tX.shape[1],)
max_iters=1000
gamma=0.02
losses,ws = gradient_descent(y, tX, initial_w, max_iters, gamma)


0
Gradient Descent(0/999): loss=8.736730707093587
1
Gradient Descent(1/999): loss=7.568116274034843
2
Gradient Descent(2/999): loss=6.612022210485747
3
Gradient Descent(3/999): loss=5.824915737812227
4
Gradient Descent(4/999): loss=5.172680683151277
5
Gradient Descent(5/999): loss=4.6285300567130765
6
Gradient Descent(6/999): loss=4.171386405884067
7
Gradient Descent(7/999): loss=3.7846246193695285
8
Gradient Descent(8/999): loss=3.455095620107414
9
Gradient Descent(9/999): loss=3.1723677839193276
10
Gradient Descent(10/999): loss=2.928137164481591
11
Gradient Descent(11/999): loss=2.715768632816067
12
Gradient Descent(12/999): loss=2.529938577637235
13
Gradient Descent(13/999): loss=2.366356423903104
14
Gradient Descent(14/999): loss=2.2215473460907096
15
Gradient Descent(15/999): loss=2.0926825169882237
16
Gradient Descent(16/999): loss=1.9774463029738534
17
Gradient Descent(17/999): loss=1.8739321947231562
18
Gradient Descent(18/999): loss=1.7805611043269731
19
Gradient Descent(19/9

In [89]:
OUTPUT_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op1.csv' # TODO: fill in desired name of output file for submission
# ws[-1] is the last weight vector appended by gradient_descent.
y_pred = predict_labels(ws[-1], tX_test)
# Writes the (id, prediction) pairs to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## Method 2: Linear regression using stochastic gradient descent:
stochastic_gradient_descent (y, tx, gamma, max_epochs)

In [95]:
def compute_stoch_gradient(y, tx, w):
    """Least-squares gradient evaluated on the samples passed in.

    The formula is the one used for the full-batch gradient,
    (-1/N) * tx^T (y - tx.w). It becomes "stochastic" only because the caller
    passes a mini-batch instead of the whole data set.
    """
    residual = y - np.dot(tx, w)
    scale = -1/(len(y))

    return scale*(np.dot(tx.T, residual))


def stochastic_gradient_descent(y, tx, gamma, max_epochs):
    """Stochastic gradient descent on the least-squares cost.

    The weights start from a standard-normal draw (one per column of tx).
    `batch_iter` (external helper) is called once per epoch with a batch size
    of 1; every (minibatch_y, minibatch_tx) pair it yields triggers one
    update `w <- w - gamma * gradient`.
    NOTE(review): how many mini-batches batch_iter yields per call is decided
    by that helper - confirm it covers the data set as intended.

    Parameters
    ----------
    y, tx      : targets and design matrix
    gamma      : step size
    max_epochs : number of times batch_iter is restarted

    Returns
    -------
    losses : mini-batch loss measured before each update
    ws     : initial weights followed by a copy of the weights after each update
    """
    batch_size = 1
    initial_w = np.random.randn(tx.shape[1],)
    ws = [initial_w]
    losses = []
    w = initial_w

    for n_iter in range(max_epochs):
        # The counter restarts at 1 for every epoch, as in the printed log.
        for i, (minibatch_y, minibatch_tx) in enumerate(
                batch_iter(y, tx, batch_size), start=1):

            # Loss and gradient on the current mini-batch only.
            loss = compute_loss(minibatch_y, minibatch_tx, w)
            grad_L = compute_stoch_gradient(minibatch_y, minibatch_tx, w)

            # Update w by the gradient.
            w = w - gamma*grad_L

            # Store w and loss.
            ws.append(np.copy(w))
            losses.append(loss)

            # The format string only uses {bi} and {l}. The former extra
            # arguments w[0] / w[1] were never printed and raised IndexError
            # for a model with a single weight.
            print("Stochastic Gradient Descent({bi}): loss={l}".format(
              bi=i , l=loss))

    return losses, ws

## Generate predictions with method 2 

In [None]:
# Define the parameters of the algorithm.
max_epochs = 2
gamma = 0.000002


# Start SGD.
# The run is bracketed by two timestamps so its wall-clock duration can be reported.
start_time = datetime.datetime.now()
gradient_losses, gradient_ws = stochastic_gradient_descent(
    y, tX, gamma, max_epochs)
end_time = datetime.datetime.now()

# Print result
# Elapsed time in seconds. The print below is disabled, so the value is only
# stored, not shown.
exection_time = (end_time - start_time).total_seconds()
#print("SGD: execution time={t:.3f} seconds".format(t=exection_time))

Stochastic Gradient Descent(1): loss=18.43338871520915
Stochastic Gradient Descent(2): loss=12.480479128171
Stochastic Gradient Descent(3): loss=23.709902178233804
Stochastic Gradient Descent(4): loss=2.4514312361880197
Stochastic Gradient Descent(5): loss=16.797311164050395
Stochastic Gradient Descent(6): loss=14.379283734776173
Stochastic Gradient Descent(7): loss=35.33105056950005
Stochastic Gradient Descent(8): loss=23.068806185243027
Stochastic Gradient Descent(9): loss=23.034319621524546
Stochastic Gradient Descent(10): loss=3.2791866116141453
Stochastic Gradient Descent(11): loss=27.44590260066062
Stochastic Gradient Descent(12): loss=141.04891974343045
Stochastic Gradient Descent(13): loss=3.4093056459395537
Stochastic Gradient Descent(14): loss=2.023631965205457
Stochastic Gradient Descent(15): loss=23.620042703153985
Stochastic Gradient Descent(16): loss=8.643117103123835
Stochastic Gradient Descent(17): loss=0.09997024236396375
Stochastic Gradient Descent(18): loss=2.5860793

In [100]:
# Only the last expression of a cell is displayed, so this length is computed
# but never shown.
len(gradient_ws)
# NOTE(review): this inspects ws (the batch gradient-descent weights), not
# gradient_ws (the SGD weights) - confirm which one was intended.
ws[-1].shape

(31,)

In [127]:
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here
# Reload the raw test features. This rebinds tX_test to the unprocessed matrix.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)





In [128]:
# Shape of the raw test feature matrix loaded in the previous cell.
tX_test.shape

(568238, 30)

In [134]:
# Apply the TRAINING standardization (mean_x, std_x returned by clean_data) to
# the raw test features, so that tx_test has the same layout as tX.

# clean_data moves every feature that contains -999 in the training matrix X
# behind the other features. The raw test columns are permuted the same way
# before the statistics are applied.
bad_mask = np.any(X == -999, axis=0)
column_order = np.concatenate((np.flatnonzero(~bad_mask), np.flatnonzero(bad_mask)))

# Fancy indexing + astype produce a float copy: tX_test itself is not modified
# (the previous `x = tX_test` alias rescaled tX_test in place).
x = tX_test[:, column_order].astype(float)
missing = (x == -999)

# NOTE(review): assumes standardize() from helpers returns one mean/std per
# feature column (no entry for the column of ones). The check fails loudly if
# that assumption is wrong.
assert mean_x.shape[0] == x.shape[1] and std_x.shape[0] == x.shape[1]

# Centre unconditionally: mean_x is always provided here, so the former
# `if mean_x is None:` guard silently skipped the centering step.
x = x - mean_x
x[:, std_x>0] = x[:, std_x>0] / std_x[std_x>0]

# Same imputation as standardize_badFeatures: missing entries become 0.
x[missing] = 0

# Prepend the column of ones (bias term), as in tX.
tx_test = np.hstack((np.ones((x.shape[0],1)), x))

tx_test.shape

(568238, 31)

In [135]:
OUTPUT_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op2.csv' # TODO: fill in desired name of output file for submission
# Method 2 predictions use the final SGD iterate, i.e. the last entry of
# gradient_ws returned by stochastic_gradient_descent. `ws` holds the batch
# gradient-descent weights of method 1 and would only reproduce op1.csv.
y_pred = predict_labels(gradient_ws[-1], tx_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## Method 3: Least squares regression using normal equations:
least_squares (y, tx)

In [136]:
def least_squares(y,tx):
    """Least-squares weights obtained from the normal equations.

    Solves (tx^T tx) w = tx^T y for w.

    Parameters
    ----------
    y  : targets, shape (N,) or (N, k)
    tx : design matrix, shape (N, D)

    Returns
    -------
    weights : solution of the normal equations, shape (D,) or (D, k)

    Raises
    ------
    numpy.linalg.LinAlgError
        If tx^T tx is singular.
    """
    gram = np.dot(tx.T, tx)
    moment = np.dot(tx.T, y)
    # Solve the linear system directly instead of forming the explicit inverse:
    # this is cheaper and numerically more accurate for ill-conditioned tx^T tx.
    weights = np.linalg.solve(gram, moment)
    return weights

## Generate predictions with method 3

In [137]:
# build_poly lives in the project module myFunctions. Import it here so that
# this cell does not depend on a later cell having been executed first.
import myFunctions as my

print(tX.shape)
# Fit the normal equations on the degree-3 polynomial expansion of tX.
weights_method_3 = least_squares(y,my.build_poly(tX,3))
print(weights_method_3.shape)

(250000, 31)
(91,)


In [139]:
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)


#Adding the column of ones (for the bias term)
ek = np.ones((tX_test.shape[0],1))
tX_test = np.c_[ ek , tX_test]
print(tX_test.shape)

# NOTE(review): the test features are used raw here (no standardization, -999
# left in place, original column order), whereas weights_method_3 was fitted
# on build_poly(tX, 3) with tX produced by clean_data - confirm that the two
# feature spaces match.
import myFunctions as my
tX_test=my.build_poly(tX_test,3)
tX_test.shape

(568238, 31)


(568238, 91)

In [141]:
OUTPUT_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op3.csv' # TODO: fill in desired name of output file for submission
# Label the polynomial test features with the normal-equation weights.
y_pred = predict_labels(weights_method_3, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## Method 4: Ridge regression using normal equations

In [None]:
import myFunctions as my
from plots import bias_variance_decomposition_visualization

def cross_validation(y, x, k_indices, k, lamb):
    """Train ridge regression on all folds but the k-th and score both parts.

    Parameters
    ----------
    y, x      : targets and feature matrix (indexed by row)
    k_indices : 2-D array of row indices, one row per fold
    k         : index of the fold held out for testing
    lamb      : ridge penalty passed to my.ridge_regression

    Returns
    -------
    (loss_tr, loss_te) : my.compute_loss on the training folds and on the
                         held-out fold, for the weights fitted on the
                         training folds
    """
    # Row k is the held-out fold; every other row, flattened, is the training set.
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, (k), axis=0).flatten()

    tx_tr, y_tr = x[train_rows], y[train_rows]
    tx_te, y_te = x[test_rows], y[test_rows]

    # Fit on the training folds only.
    w = my.ridge_regression(y_tr, tx_tr, lamb)

    # Score the same weights on both splits.
    return my.compute_loss(y_tr, tx_tr, w), my.compute_loss(y_te, tx_te, w)

def cross_validation_demo(y, X):
    """Grid search over polynomial degree and ridge penalty with k-fold CV.

    For every degree in [1, 2, 3] the features are expanded with
    my.build_poly, and for every lambda in np.logspace(-5, 2, 15) the
    train/test losses returned by `cross_validation` are averaged over the
    10 folds.

    Returns
    -------
    rmse_tr, rmse_te : arrays of shape (len(degrees), len(lambdas)) holding
        the fold-averaged training / held-out values returned by
        cross_validation (whatever my.compute_loss measures).
    var_tr, var_te : arrays of the same shape, returned as zeros. The spread
        computation is disabled, so they are placeholders only.
    degrees, lambdas : the grids that were explored.
    """
    # parameters
    seed = 56
    k_fold = 10

    # hyperparameters
    degrees = [1, 2, 3]
    lambdas = np.logspace(-5, 2, 15)

    # split data in k fold
    k_indices = my.build_k_indices(y, k_fold, seed)

    # result grids: one row per degree, one column per lambda
    rmse_tr = np.zeros((len(degrees),len(lambdas)))
    rmse_te = np.zeros((len(degrees),len(lambdas)))
    var_tr = np.zeros((len(degrees),len(lambdas)))
    var_te = np.zeros((len(degrees),len(lambdas)))

    # Hyperparameter 1: polynomial degree
    for ind, degree in enumerate(degrees):
        print(ind)
        # Build the polynomial expansion once per degree.
        tX_poly = my.build_poly(X, degree)

        # Hyperparameter 2: ridge penalty
        for ind2, lamb in enumerate(lambdas):

            # One slot per FOLD. The buffers used to be sized len(lambdas),
            # which left unused zero entries that were averaged in and
            # biased every mean towards 0.
            loss_tr = np.zeros((k_fold,))
            loss_te = np.zeros((k_fold,))

            # Cross-validation
            for k in range(k_fold):
                loss_tr[k], loss_te[k] = cross_validation(y, tX_poly, k_indices, k, lamb)

            rmse_tr[ind,ind2] = np.mean(loss_tr)
            rmse_te[ind,ind2] = np.mean(loss_te)

    return rmse_tr, rmse_te, var_tr, var_te, degrees, lambdas

# Run the degree/lambda grid search on the cleaned training matrix.
rmse_tr, rmse_te, var_tr, var_te, degrees, lambdas = cross_validation_demo(y, tX)


In [None]:
from plots import *

# Row 0 of each grid holds the results for degrees[0]. They are plotted
# against the lambda grid. (.T is a no-op on these 1-D rows.)
cross_validation_visualization(lambdas, rmse_tr[0].T, rmse_te[0].T) 

In [None]:
# Final ridge model with hand-picked hyper-parameters.
# NOTE(review): degree 12 lies outside the grid [1, 2, 3] explored by
# cross_validation_demo - confirm how this value was selected.
degree = 12
lamb = 0.0006
final_X = my.build_poly(tX, degree)
w = my.ridge_regression(y, final_X, lamb)
y_pred = predict_labels(w, final_X)
# Fraction of training samples whose predicted label differs from y.
# This is measured on the data the model was fitted on, not on held-out data.
loss = len(np.nonzero(y_pred-y)[0])/len(y)
loss

In [None]:
# Number of ridge weights, to compare with the width of the test design matrix.
w.shape

## Generate predictions with method 4

In [None]:
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
# NOTE(review): del_features is only assigned inside commented-out code
# earlier in this notebook, so this line raises NameError on a fresh kernel
# unless the name is defined elsewhere. The ridge weights w were also fitted
# on build_poly(tX, 12) with tX keeping all features (clean_data drops none),
# so dropping columns here would make the test width differ from w - confirm.
tX_test = np.delete(tX_test, del_features, axis=1)
print(tX_test.shape)

#Adding the column of ones (for the bias term)
ek = np.ones((tX_test.shape[0],1))
tX_test = np.c_[ ek , tX_test]
print(tX_test.shape)

# Same polynomial degree as the one used to fit w.
tX_test=my.build_poly(tX_test,12)
tX_test.shape

In [None]:
OUTPUT_PATH = 'results.csv' # TODO: fill in desired name of output file for submission
# Label the test features with the ridge weights w and write the submission file.
y_pred = predict_labels(w, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## Method 5: Logistic regression using gradient descent or SGD

In [None]:
def sigmoid(t):
    """Apply the logistic sigmoid 1 / (1 + exp(-t)) element-wise.

    The value is computed from exp(-|t|), which never exceeds 1, so large
    negative inputs no longer overflow inside np.exp:

        t >= 0 :  1 / (1 + exp(-t))
        t <  0 :  exp(t) / (1 + exp(t))

    Parameters
    ----------
    t : scalar or array-like

    Returns
    -------
    A NumPy scalar for scalar input, otherwise an array shaped like t.
    """
    t = np.asarray(t, dtype=float)
    decay = np.exp(-np.abs(t))
    result = np.where(t >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]

In [None]:
def calculate_loss(y, tx, w):
    """Compute the cost by negative log likelihood (logistic regression).

    loss = sum_i [ log(1 + exp(tx_i . w)) - y_i * (tx_i . w) ]

    Parameters
    ----------
    y  : labels in {0, 1}, shape (N,) or (N, 1)
    tx : design matrix, shape (N, D)
    w  : weights, shape (D,) or (D, 1)

    Returns
    -------
    Array holding the summed loss, of shape (1,) when w is 1-D and (1, 1)
    when w is a column vector (the shapes the per-sample loop used to return).
    """
    z = tx.dot(w)
    # Give y the layout of z so that the product below is element-wise even
    # when one of them is (N,) and the other (N, 1).
    labels = np.reshape(y, z.shape)
    # logaddexp(0, z) == log(1 + exp(z)) without overflowing for large z.
    per_sample = np.logaddexp(0, z) - labels * z
    return np.sum(per_sample, axis=0, keepdims=True)

In [None]:
def calculate_gradient(y, tx, w):
    """Compute the gradient of the logistic negative log likelihood.

    gradient = tx^T (sigmoid(tx . w) - y)

    Parameters
    ----------
    y  : labels in {0, 1}, shape (N,) or (N, 1)
    tx : design matrix, shape (N, D)
    w  : weights, shape (D,) or (D, 1)

    Returns
    -------
    Array with the same shape as w.
    """
    z = tx.dot(w)
    # Align y with the predictions before subtracting: an (N, 1) prediction
    # minus an (N,) label vector would otherwise broadcast to an (N, N) matrix.
    error = sigmoid(z) - np.reshape(y, z.shape)
    return tx.T.dot(error)

In [None]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Perform one gradient-descent step of logistic regression.

    The loss is evaluated at the incoming weights, i.e. before the step.
    Returns that loss together with the updated weights w - gamma * gradient.
    """
    # Cost at the current point.
    current_loss = calculate_loss(y, tx, w)

    # Displacement: step size times the gradient at the current point.
    step = gamma*calculate_gradient(y, tx, w)

    return current_loss, w - step

In [None]:
# Check the layout of the label vector before building the logistic model.
y.shape

In [None]:
# Logistic regression fitted by full-batch gradient descent on tX.

# init parameters
# The logistic loss expects labels in {0, 1}. A recoded copy is built instead
# of overwriting y, which other cells of the notebook read as loaded.
y_logistic = np.where(y == -1, 0, y)
max_iter = 10000
threshold = 1e-8
gamma = 0.001
losses = []

# 1-D weight vector: tX.dot(w) then has the same (N,) layout as the labels,
# so the gradient's subtraction stays element-wise.
w = np.zeros(tX.shape[1])

# start the logistic regression
for n_iter in range(max_iter):
    # get loss and update w.
    loss, w = learning_by_gradient_descent(y_logistic, tX, w, gamma)
    # log info (every 1000 iterations only, to keep the output readable)
    if n_iter % 1000 == 0:
        print("Current iteration={i}, the loss={l}".format(i=n_iter, l=loss))
    # converge criteria: stop once the loss changes by less than threshold
    losses.append(loss)
    if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
        break

# Final loss on the training matrix tX (the former `tx` was never defined).
print("The loss={l}".format(l=calculate_loss(y_logistic, tX, w)))

## Generate predictions with method 5

## Generate predictions and save output in csv format for submission:

In [None]:
# NOTE(review): `a` is not assigned anywhere in the visible notebook. This
# cell raises NameError on a fresh kernel unless `a` comes from a star import.
a.shape

In [None]:
# Batch gradient descent through the least_squares_GD wrapper, which draws its
# own random starting weights.
gamma=0.1
max_iters=1000
# This rebinds the notebook-level ws and losses to the results of this run.
final_w, ws, losses=least_squares_GD(y,tX,gamma,max_iters)

#### Loading testing data

In [None]:
DATA_TEST_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv' # TODO: download test data and supply path here
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
# NOTE(review): del_features is only assigned inside commented-out code
# earlier in this notebook, so this line raises NameError on a fresh kernel
# unless the name is defined elsewhere.
tX_test = np.delete(tX_test, del_features, axis=1)
print(tX_test.shape)
# Prepend the column of ones (bias term).
ek = np.ones((tX_test.shape[0],1))
tX_test = np.c_[ ek , tX_test]
print(tX_test.shape)
# NOTE(review): build_poly is called without the `my.` prefix used in the
# other cells - confirm that one of the star imports provides it.
tX_test=build_poly(tX_test,3)
tX_test.shape

In [None]:
# Normal-equation fit on the (non-polynomial) training matrix tX.
weights = least_squares(y,tX)

In [None]:
# Same normal equations solved directly with np.linalg.solve. This overwrites
# the `weights` computed in the previous cell.
weights = np.linalg.solve(np.dot(tX.T,tX),np.dot(tX.T,y))

In [None]:
OUTPUT_PATH = '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op0.csv' # TODO: fill in desired name of output file for submission
# NOTE(review): predictions use weights_method_3 (the degree-3 fit), not the
# `weights` variable computed in the cells just above - confirm this is intended.
y_pred = predict_labels(weights_method_3, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)