# Importing Libraries

In [1]:
import numpy as np

## Intitializing the values

In [2]:
m=150
d=75
X = np.random.rand(m,d)
true_theta = np.zeros(d)

for i in range(10):
    if(np.random.randint(2)) ==0:
        true_theta[i]=10
    else :
        true_theta[i]=-10
        
eps = 0.31622776601 * np.random.randn(m, 1) 
eps = eps.flatten()


## Evaluating train ,test ,val for the given dataset

In [3]:
y = np.matmul(X,true_theta) + eps
xtrain = X[:80]
ytrain = y[:80]
xval = X[80:100]
yval = y[80:100]
xtest = X[100:150]
ytest = y[100:150]


## Functions to minimize the objectives 
### These will be called from the inbuilt scipy.minimize function

In [4]:
def fun(param,x,y):
    theta = param[1:]
    lam = param[0]
    return sum(np.square(y - np.matmul(x,theta))) + lam*sum(np.square(theta))

def fun2(lam,theta,x,y):
    
    return sum(np.square(y - np.matmul(x,theta))) + lam*sum(np.square(theta))

def fun3(theta,lam,x,y):

    return sum(np.square(y - np.matmul(x,theta))) + lam*sum(np.square(theta))

def constrain(x):
    return x[0]

## Minimizing the loss function with l2 regularization 

### We choose optimal theta values from the train set  along with a dummy lamda value

In [5]:
from scipy.optimize import minimize
lam = 1
theta = np.ones(d)

param = np.insert(theta,0,lam)
res = minimize(fun,param,args=(xtrain,ytrain),constraints = {'type':'ineq','fun':constrain})


## We choose the best lambda from the validation set 

### We minimize the error on the validation set , and choose the corresponding lamda

In [6]:

lam = res.x[0]
theta = res.x[1:]
res = minimize(fun2,lam,args=(theta,xval,yval),constraints = {'type':'ineq','fun':constrain})

lamda_estimate = res.x


## We evaluate the theta values on the test set with the chosen lamda value 
### We get the corresponding theta values

In [7]:
res = minimize(fun3,theta,args=(lamda_estimate,xtest,ytest),constraints = {'type':'ineq','fun':constrain})
predicted_theta = res.x

## Checking the number of mismatched values without the threshold

### Setting the threshold and assigning all values below it to 0

In [8]:
# This function checks the discrepancy by the rule given in the assignment pdf
def discrepancy(a1,a2):
    count =0
    for i in range(a1.shape[0]):
        if((a1[i]==0 and a2[i]!=0) or (a1[i]!=0 and a2[i]==0)):
            count = count + 1
    return count

# This function assigns all the values below the threshold to 0, we have choosen 1 as a threshold.
def threshold(a1):
    for i in range(a1.shape[0]):
        if(a1[i]<=0.01):
            a1[i] =0
    return a1

## Discrepancy without the threshold for user defined implementation of Ridge

In [9]:
count1 = discrepancy(predicted_theta,true_theta)
print count1 # indicates number of mismatched values without the thershold

65


## Discrepancy with threshold for user defined implementation of Ridge

In [10]:
threshold_theta = threshold(predicted_theta)
count2 = discrepancy(threshold_theta,true_theta)
print count2  # indicates number of mismatched values with the threshold

38


## Debugging  using the inbuilt Ridge function

In [11]:
from sklearn import linear_model
clf = linear_model.Ridge(alpha=0.1)
clf.fit(xtrain,ytrain)
clf.predict(xtest)
clf.coef_

array([  9.19418957e+00,  -9.41515277e+00,   8.79131162e+00,
         1.00699244e+01,   1.02784643e+01,  -8.57423594e+00,
         1.01348909e+01,  -8.77206716e+00,  -9.77229906e+00,
         9.11664010e+00,  -3.70550429e-02,  -7.12839359e-01,
        -8.29771232e-01,  -1.92319782e-01,  -2.49461030e-01,
        -2.09566015e-01,  -1.39518319e-01,  -1.99264142e-01,
        -3.60916860e-02,   9.09092277e-02,   1.22747391e-01,
         2.03976306e-01,   1.55942149e-01,  -2.35711430e-01,
        -8.29489277e-01,   6.05773760e-01,   1.85144681e-01,
         1.24019575e-01,   2.75244902e-01,   6.89157084e-01,
         5.58328696e-01,   3.74268765e-02,   7.82787180e-01,
         7.70377985e-01,  -1.27142922e-01,  -3.28523626e-01,
        -2.09331754e-01,  -1.05270692e-01,  -3.25855016e-01,
        -6.36372388e-02,   6.35567987e-02,  -1.29876757e-01,
         3.19600551e-01,   2.01747532e-01,   7.06608009e-01,
         7.86635813e-01,  -3.15011263e-01,   8.09068814e-01,
         7.80645126e-01,

## Discrepancy without the threshold for inbuilt Ridge regression

In [12]:
count3 = discrepancy(true_theta,clf.coef_)
print count3

65


## Discrepancy with threshold for inbuilt Ridge regression

In [13]:
count4 = discrepancy(threshold(clf.coef_),true_theta)
print count4

32


## Running the inbuilt Lasso function to get the corresponding theta values

In [14]:
from sklearn import linear_model
clf = linear_model.Lasso(alpha=0.1)
clf.fit(xtrain,ytrain)
clf.predict(xtest)
clf.coef_

array([  8.26254924,  -8.15024583,   8.43647684,   8.55649595,
        10.14498126,  -8.2579131 ,   9.71550763,  -8.37300553,
        -7.56942777,   8.95469465,   0.        ,   0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.        ,
         0.        ,   0.        ,  -0.        ,  -0.        ,
        -0.        ,   0.        ,   0.        ,  -0.        ,
        -0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,  -0.        ,   0.        ,
         0.        ,   0.        ,  -0.        ,  -0.        ,
        -0.        ,   0.        ,   0.        ,   0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.        ,
        -0.        ,   0.        ,   0.        ,  -0.        ,
         0.        ,  -0.        ,   0.        ,  -0.        ,
        -0.        ,   0.        ,  -0.        ,   0.        ,
        -0.        ,   0.        ,  -0.        ,  -0.        ,
         0.        ,   0.        ,   0.        ,  -0.  

## Discrepancy without the threshold

In [15]:
count5 = discrepancy(clf.coef_,true_theta)
print count5

0


# Report

## Self implemented Ridge regression

Choosing the value of lamda through validation set indicates a good approximation of the true weights though the the discrepancy "count1" is high , if we threshold the value we get an extremely good approximation as can be seen from the discrepancy "count2" , which means we would get a good generalization of our model on future datasets.

A similar trend can be seen in the built in version of Ridge regression

## Built in Lasso regression

Lasso regression performs extremely well without thresholding as well, as can be seen in discreapncy "count5".
This means we get a good generallized approximation of the model even without thresholding the theta values.



# Conclusion

It is best to choose the hyperparameter through a validation set to get a better generallization for the model.