# Running a simple two-feature regression with an L1 penalty (lasso)

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Read the house data from the CSV file.
df = pd.read_csv(r"Data/kc_house_data.csv")

# Reproducible random 80% sample of the rows, used as the training set.
train_data = df.sample(frac=0.8, random_state=0)

# Target vector: the `price` column of the training rows.
output = train_data['price'].values

# Design matrix: a leading column of ones (intercept) followed by the two features.
simple_feature_matrix = np.column_stack(
    (np.ones(len(train_data)), train_data[['sqft_living', 'bedrooms']].values)
)

In [25]:
# Normalize features 
## In lasso, normalization is required: features on different scales get weights of very different magnitudes, so the same L1 penalty can unnecessarily push some of them to zero. Normalizing ensures all features are considered equally.
def normalize_features(features):
    """Scale every column of `features` to unit 2-norm.

    Parameters
    ----------
    features : array-like
        Feature matrix with one column per feature.

    Returns
    -------
    norm_features : numpy.ndarray
        `features` with each column divided by its 2-norm. A column whose
        norm is zero (all entries zero) is left as zeros instead of
        becoming NaN.
    norm : numpy.ndarray
        The 2-norm of each original column (zero for an all-zero column).
    """
    norm = np.linalg.norm(features, axis = 0) # This is 2-norm
    # Dividing an all-zero column by its zero norm would give 0/0 = NaN and
    # poison every later dot product; divide such columns by 1 instead.
    safe_norm = np.where(norm == 0, 1.0, norm)
    norm_features = features/safe_norm
    return norm_features, norm

## Example
# normalize_features(np.array([[3.,6.,9.],[4.,8.,12.]]))

In [26]:
# Scale every column to unit 2-norm; `norms` receives the original column norms.
# NOTE: this overwrites `simple_feature_matrix`, so running the line again would
# compute `norms` from the already-normalized matrix instead of the raw one.
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

In [28]:
## single co-ord descent step:
# Scratch values for trying a single coordinate-descent step by hand:
# the L1 penalty strength, all-zero weights, and the predictions they give.
l1_penalty = 1e7
theta = np.zeros(3)
prediction = simple_feature_matrix.dot(theta)
def lasso_coordinate_descent_step(i, simple_feature_matrix, output, theta, l1_penalty):
    """Return the updated value of weight `i` for one lasso coordinate step.

    Parameters
    ----------
    i : int
        Index of the weight (column) being updated; 0 is treated as the intercept.
    simple_feature_matrix : numpy.ndarray
        2-D feature matrix, one column per weight.
    output : numpy.ndarray
        Target values, one per row of the feature matrix.
    theta : array-like
        Current weights.
    l1_penalty : float
        L1 penalty strength; the threshold applied is `l1_penalty / 2`.

    Returns
    -------
    float
        `ro_i` for the intercept; otherwise `ro_i` moved toward zero by
        `l1_penalty / 2`, or 0 when `ro_i` lies within that distance of zero.
    """
    column = simple_feature_matrix[:, i]
    fitted = simple_feature_matrix.dot(theta)
    # Residual with feature i's own contribution added back in.
    partial_residual = output - fitted + theta[i] * column
    ro_i = (column * partial_residual).sum()

    # Intercept is never regularized.
    if i == 0:
        return ro_i

    half_penalty = l1_penalty / 2
    if ro_i < -half_penalty:
        return ro_i + half_penalty
    if ro_i > half_penalty:
        return ro_i - half_penalty
    return 0.

In [29]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Fit lasso weights by cycling coordinate-descent steps until they settle.

    Each sweep updates every weight in turn with
    `lasso_coordinate_descent_step`. Sweeps repeat until the sum of the
    absolute per-weight changes within one sweep is below `tolerance`.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D feature matrix, one column per weight.
    output : numpy.ndarray
        Target values, one per row of `feature_matrix`.
    initial_weights : array-like
        Starting weights. They are copied and never modified.
    l1_penalty : float
        L1 penalty strength passed through to each coordinate step.
    tolerance : float
        Stop once the summed absolute weight change of a sweep is below this.

    Returns
    -------
    numpy.ndarray
        The fitted weights, as a new float array.
    """
    # Private float copy: the caller's array must not be overwritten, and an
    # integer-dtype start vector must not truncate the float updates.
    weights = np.array(initial_weights, dtype=float)
    optimized = False
    while not optimized:
        weights_diff = []
        for i in range(len(weights)):
            old_weights_i = weights[i] # remember old value of weight[i], as it will be overwritten

            # the following line uses new values for weight[0], weight[1], ..., weight[i-1]
            #     and old values for weight[i], ..., weight[d-1]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            # use old_weights_i to compute change in coordinate
            weights_diff.append(abs(old_weights_i - weights[i]))

        if sum(weights_diff) < tolerance:
            optimized = True
    return weights

In [33]:
# `simple_feature_matrix` was already normalized, and `norms` captured, in the
# earlier normalization cell. Normalizing again here would leave the matrix
# essentially unchanged but overwrite `norms` with values of about 1, losing
# the real column norms, so the matrix is used as it is.
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0

weights = lasso_cyclical_coordinate_descent(simple_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)

print(weights)

[21755990.92839076 53962852.17886249        0.        ]


# Adding more features

In [35]:
# Columns used as features in the larger model.
all_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]

# Target vector: the `price` column of the training rows.
output = train_data['price'].values

# Design matrix: a leading column of ones (intercept) followed by every feature.
simple_feature_matrix = np.column_stack(
    (np.ones(len(train_data)), train_data[all_features].values)
)

In [42]:
# Scale each column of the full design matrix to unit 2-norm.
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

tolerance = 1.0
l1_penalty = 1e7
# One weight per feature plus one for the intercept column.
initial_weights = np.zeros(1 + len(all_features))

weights = lasso_cyclical_coordinate_descent(
    simple_feature_matrix, output, initial_weights, l1_penalty, tolerance
)

# Feature names next to their fitted weights; weights[0] belongs to the
# intercept column and is left out.
pd.DataFrame(data=[all_features, weights[1:]]).transpose()

Unnamed: 0,0,1
0,bedrooms,0.0
1,bathrooms,0.0
2,sqft_living,48405400.0
3,sqft_lot,0.0
4,floors,0.0
5,waterfront,2602710.0
6,view,7501070.0
7,condition,0.0
8,grade,0.0
9,sqft_above,0.0


In [47]:
l1_penalty = 2e7
tolerance = 1.0
# Start this run from zeros in a freshly allocated array instead of reusing the
# `initial_weights` object from the previous run, so the result does not depend
# on whatever that run left in it.
initial_weights = np.zeros(len(all_features)+1)
weights = lasso_cyclical_coordinate_descent(simple_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)
pd.DataFrame(data = [all_features, weights[1:]]).T

Unnamed: 0,0,1
0,bedrooms,0.0
1,bathrooms,0.0
2,sqft_living,19084100.0
3,sqft_lot,0.0
4,floors,0.0
5,waterfront,0.0
6,view,6576310.0
7,condition,0.0
8,grade,0.0
9,sqft_above,0.0


# Notes:
1. In lasso, normalization is required: features on different scales get weights of very different magnitudes, so the same L1 penalty can unnecessarily push some of them to zero. Normalizing ensures all features are considered equally.
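2. Compute $\rho_i$ for each weight from the formula $\rho_i = \sum_j x_{ij}\,(y_j - \hat{y}_j + w_i x_{ij})$, then check it against the threshold $\lambda/2$ to get the new weight: $w_i = \rho_i + \lambda/2$ if $\rho_i < -\lambda/2$, $w_i = \rho_i - \lambda/2$ if $\rho_i > \lambda/2$, and $w_i = 0$ otherwise (the intercept is not regularized, so $w_0 = \rho_0$). Typically the weights keep shrinking and many are set to exactly zero.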
3. Tolerance is used to check how much the weights change between iterations; when the weights stop changing, the iterations stop. The sum of the absolute changes of all weights over one full sweep is compared against the tolerance.
4. The coordinate descent algorithm is used for the updates because the L1 norm is not differentiable at zero, so it has no proper derivative there. The problem can be solved either with a subgradient approach or with coordinate descent. The core of the algorithm is that you update one parameter at a time while keeping all other parameters fixed. As an analogy: to get from place A to place B you move along one axis at a time, as along a Manhattan (city-block) route, rather than along the straight Euclidean line.