In [14]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [15]:
from proj1_helpers import *
import os

# Location of the training CSV. Set the DATA_TRAIN_PATH environment variable to point
# at your own copy of train.csv; when it is not set, the original local path is used.
DATA_TRAIN_PATH = os.environ.get(
    'DATA_TRAIN_PATH',
    '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/train.csv')

# Class labels, feature matrix and event ids of the training set
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [16]:
# Sanity check: print the shapes of the label vector y and the feature matrix tX
print(y.shape, tX.shape)

(250000,) (250000, 30)


In [17]:
# Missing measurements are encoded with the placeholder value -999, so we count,
# for every instance (row of tX), how many of its feature values are missing.
# The count is taken row-wise on tX itself, so it does not rely on the event ids
# being consecutive and starting at 100000.
# Kept as an (n_instances, 1) float column vector.
count_miss_instances = np.sum(tX == -999.0, axis=1, dtype=float).reshape(-1, 1)
print(np.median(count_miss_instances))
print(np.mean(count_miss_instances))
    

7.0
6.320208


As can be seen above, on average about 6 field/attribute values are missing per instance. We therefore perform a feature-wise check of the missing values.

In [18]:
# Count, for every feature (column of tX), how many instances carry the -999
# missing-value placeholder. Kept as an (n_features, 1) float column vector.
count_miss_features = np.sum(tX == -999.0, axis=0, dtype=float).reshape(-1, 1)
print(count_miss_features)

[[  38114.]
 [      0.]
 [      0.]
 [      0.]
 [ 177457.]
 [ 177457.]
 [ 177457.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [ 177457.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [      0.]
 [  99913.]
 [  99913.]
 [  99913.]
 [ 177457.]
 [ 177457.]
 [ 177457.]
 [      0.]]


Here we realize that only some (although not few) of the features have missing values. Since the number of instances where these features have missing values is quite a large fraction of the data, we decide to drop these features from our data for further analysis. 

In [19]:
# Recount the missing values (-999 placeholder) per feature first, so that this cell
# does not depend on a variable left over from a previously executed cell.
count_miss_features = np.sum(tX == -999.0, axis=0, dtype=float).reshape(-1, 1)

# Counting the number of features that have at least one missing value
print(np.sum(count_miss_features > 0, axis=0))

# We create an array del_features (since we plan to drop these features) ...
# ... to store the index of the attributes with missing values.
# np.flatnonzero yields integer indices, which np.delete requires; indices
# accumulated with np.r_ onto an empty list would become floats.
del_features = np.flatnonzero(count_miss_features.ravel() > 0)
print(del_features)

# The features having indices in del_features computed above are now dropped from the data ...
# ... (for the training set of this notebook: 11 of the 30 columns, leaving 19)

# Keep a reference to the full matrix: np.delete returns a new array, so the data
# including the missing values stays reachable through tX_with_missing once tX is rebound.
# NOTE: re-running this cell after tX has been reduced overwrites tX_with_missing
# with the reduced matrix; reload the data before re-running.
tX_with_missing = tX
tX = np.delete(tX, del_features, axis=1)
print(tX.shape)
    



[11]
[  0.   4.   5.   6.  12.  23.  24.  25.  26.  27.  28.]
(250000, 19)


In [20]:
# Trying to find outliers
#plt.scatter(tX1[:,4],tX1[:,3])

## Do your crazy machine learning thing here :) ...

#### Method 1: Linear regression using gradient descent: least_squares_GD (y, tx, gamma, max_iters) 

In [21]:
# Number of entries in the label vector y
len(y)


250000

In [22]:
# Linear regression using gradient descent

def least_squares_GD(y, tx, gamma, max_iters, initial_w=None):
    """Linear regression fitted with gradient descent.

    Parameters
    ----------
    y : labels / targets.
    tx : feature matrix; tx.shape[1] gives the number of weights.
    gamma : step size of each gradient-descent update.
    max_iters : number of gradient-descent iterations.
    initial_w : optional starting weights. When omitted, the weights are drawn
        with np.random.randn(tx.shape[1]) as before, so results then depend
        on the state of NumPy's global random generator.

    Returns
    -------
    final_w : the last weight vector produced by gradient descent.
    ws : list of all weight vectors (starting point included).
    losses : list of the losses recorded by gradient_descent.
    """
    if initial_w is None:
        initial_w = np.random.randn(tx.shape[1])
    else:
        initial_w = np.asarray(initial_w)

    losses, ws = gradient_descent(y, tx, initial_w, max_iters, gamma)
    final_w = ws[-1][:]

    return final_w, ws, losses
    
    

def compute_loss(y, tx, w):
    """Return the least-squares loss (1/(2N)) * e^T e, with e = y - tx.w and N = len(y)."""
    n_samples = len(y)
    residual = y - np.dot(tx, w)
    squared_norm = np.dot(residual.T, residual)
    return (1 / (2 * n_samples)) * squared_norm

def compute_gradient(y, tx, w):
    """Return the gradient of the least-squares loss, (-1/N) * tx^T e, with e = y - tx.w and N = len(y)."""
    n_samples = len(y)
    residual = y - np.dot(tx, w)
    projected = np.dot(tx.T, residual)
    return (-1 / n_samples) * projected

def gradient_descent(y, tx, initial_w, max_iters, gamma, verbose=True):
    """Gradient descent algorithm.

    Parameters
    ----------
    y : labels / targets.
    tx : feature matrix.
    initial_w : starting weights.
    max_iters : number of update steps to perform.
    gamma : step size applied to the gradient at every step.
    verbose : when True (default), print one progress line per iteration.

    Returns
    -------
    losses : list with one entry per iteration; each entry is the loss evaluated
        at the weights *before* that iteration's update.
    ws : list of max_iters + 1 weight vectors: initial_w followed by the weights
        obtained after every update.
    """
    # Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):

        # Compute loss and gradient at the current weights
        loss = compute_loss(y, tx, w)
        grad_L = compute_gradient(y, tx, w)

        # update w by gradient
        w = w - gamma * grad_L

        # store w and loss
        ws.append(np.copy(w))
        losses.append(loss)

        if verbose:
            # Report at most the first two weights; indexing w[1] unconditionally
            # would raise IndexError for a model with a single weight.
            shown = ", ".join(
                "w{i}={v}".format(i=i, v=w[i]) for i in range(min(2, len(w))))
            print("Gradient Descent({bi}/{ti}): loss={l}, {w}".format(
                  bi=n_iter, ti=max_iters - 1, l=loss, w=shown))

    return losses, ws

## Generate predictions and save output in csv format for submission:

In [23]:
# Shape of the training feature matrix tX that is passed to the training step
tX.shape

(250000, 19)

In [24]:
# Fix the seed of NumPy's global random generator: least_squares_GD draws its
# initial weights with np.random.randn, so without a seed every run of this cell
# starts from a different point.
np.random.seed(1)

gamma = 0.000002   # step size of the gradient-descent updates
max_iters = 1000   # number of gradient-descent iterations
final_w, ws, losses = least_squares_GD(y, tX, gamma, max_iters)

Gradient Descent(0/999): loss=104277.74827748955, w0=0.5331945156671677, w1=-1.892826192761536
Gradient Descent(1/999): loss=59693.89344263162, w0=0.5561225327707616, w1=-1.8445377386462873
Gradient Descent(2/999): loss=34962.69866796885, w0=0.5724560372222726, w1=-1.8082358119000954
Gradient Descent(3/999): loss=21238.2742435524, w0=0.5838865404996956, w1=-1.780852099878007
Gradient Descent(4/999): loss=13616.331494764829, w0=0.5916734254132643, w1=-1.7601021000638324
Gradient Descent(5/999): loss=9377.860473887842, w0=0.5967543176350527, w1=-1.7442849249245123
Gradient Descent(6/999): loss=7015.400359953048, w0=0.5998272671915122, w1=-1.7321342386790617
Gradient Descent(7/999): loss=5693.190177128206, w0=0.6014119399425585, w1=-1.722707266368582
Gradient Descent(8/999): loss=4947.868816526901, w0=0.6018951801112692, w1=-1.7153021511724786
Gradient Descent(9/999): loss=4522.538948754126, w0=0.6015649356577523, w1=-1.7093964195389204
Gradient Descent(10/999): loss=4274.769738633382, w0

In [25]:
# Alias for the weights returned by least_squares_GD; `weights` is the name used by the prediction step
weights=final_w

In [19]:
import os

# Location of the test CSV. Set the DATA_TEST_PATH environment variable to point
# at your own copy of test.csv; when it is not set, the original local path is used.
DATA_TEST_PATH = os.environ.get(
    'DATA_TEST_PATH',
    '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/test.csv')

# The first value returned by load_csv_data is discarded for the test set
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [20]:
# Shape of the test feature matrix as loaded
tX_test.shape

(568238, 30)

In [21]:
# Drop from the test matrix the columns listed in del_features (the ones removed
# from the training matrix). The indices are cast to int because np.delete only
# accepts integer indices, and del_features may be a float array when it was
# built by concatenating onto an empty list.
# NOTE: not idempotent: re-running this cell without reloading tX_test drops further columns.
tX_test = np.delete(tX_test, np.asarray(del_features, dtype=int), axis=1)

In [22]:
# Shape of the test feature matrix once the del_features columns have been removed
tX_test.shape


(568238, 19)

In [23]:
import os

# Destination of the submission CSV. Set the SUBMISSION_OUTPUT_PATH environment
# variable to choose another file; when it is not set, the original local path is used.
OUTPUT_PATH = os.environ.get(
    'SUBMISSION_OUTPUT_PATH',
    '/Users/akhileshgotmare/Desktop/Git_Junta/data-ml-course-project1/op0.csv')

# Predict labels for the test set with the trained weights and write them, with the
# test ids, to OUTPUT_PATH (predict_labels / create_csv_submission come from proj1_helpers).
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)