## Week five - Lasso Regression
### Assignment One

<p>First, import necessary libraries. Then load test and train data into them.</p>

In [144]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [145]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype = dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype = dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

### Now do some feature engineering

In [146]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

### And train a lasso model on the entire set of sales data

In [147]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']


In [148]:
model_all = linear_model.Lasso(alpha=5e2, normalize=True)
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [149]:
for index, feature in enumerate(all_features):
    print feature + ': ' + str(model_all.coef_[index])

bedrooms: 0.0
bedrooms_square: 0.0
bathrooms: 0.0
sqft_living: 134.439313955
sqft_living_sqrt: 0.0
sqft_lot: 0.0
sqft_lot_sqrt: 0.0
floors: 0.0
floors_square: 0.0
waterfront: 0.0
view: 24750.0045856
condition: 0.0
grade: 61749.1030907
sqft_above: 0.0
sqft_basement: 0.0
yr_built: -0.0
yr_renovated: 0.0


### Engineer features on our data splits

In [150]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

### Train and validate the sets for multiple l1 penalties

In [151]:
def calculate_l1(training_data, validation_data, l1_penalties, feature_list, output):
    for index, value in enumerate(l1_penalties):
        model = linear_model.Lasso(alpha=value, normalize=True)
        model.fit(training_data[feature_list], training_data[output])
        #calculate RSS:
        rss = sum((model.predict(validation_data[feature_list].values) - validation_data[output].values) ** 2)
        print 'L1 penalty - ' + str(value) + ': ' + str(rss)

In [152]:
penalties = np.logspace(1, 7, num=13)
calculate_l1(training, validation, penalties, all_features, 'price')

L1 penalty - 10.0: 3.982133273e+14
L1 penalty - 31.6227766017: 3.99041900253e+14
L1 penalty - 100.0: 4.29791604073e+14
L1 penalty - 316.227766017: 4.63739831045e+14
L1 penalty - 1000.0: 6.45898733634e+14
L1 penalty - 3162.27766017: 1.22250685943e+15
L1 penalty - 10000.0: 1.22250685943e+15
L1 penalty - 31622.7766017: 1.22250685943e+15
L1 penalty - 100000.0: 1.22250685943e+15
L1 penalty - 316227.766017: 1.22250685943e+15
L1 penalty - 1000000.0: 1.22250685943e+15
L1 penalty - 3162277.66017: 1.22250685943e+15
L1 penalty - 10000000.0: 1.22250685943e+15


### Now compute RSS for test data on the best L1 - 10.0

In [153]:
calculate_l1(training, testing, [10.0], all_features, 'price')

L1 penalty - 10.0: 9.84674025527e+13


In [154]:
model = linear_model.Lasso(alpha=10.0, normalize=True)
model.fit(training[all_features], training_data['price'])
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

15

### Finally, we will learn models with the number of nonzero weights we specify

In [155]:
max_nonzero = 7

In [156]:
penalties = np.logspace(1, 4, num=20)

In [176]:
nonzero_dict = {}

for index, value in enumerate(penalties):
    model = linear_model.Lasso(alpha=value, normalize=True)
    model.fit(training[all_features], training['price'])
    nonzeros = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    nonzero_dict[value] = nonzeros
for key, value in nonzero_dict.iteritems():
    print str(key) + ': ' + str(value)

42.8133239872: 13
1128.83789168: 3
6951.92796178: 1
1623.77673919: 3
14.3844988829: 15
2335.72146909: 2
10.0: 15
127.42749857: 10
61.5848211066: 12
784.759970351: 5
4832.93023857: 1
183.298071083: 7
20.6913808111: 15
263.665089873: 6
3359.81828628: 1
88.586679041: 11
29.7635144163: 15
379.269019073: 6
545.559478117: 6
10000.0: 1


In [173]:
range_max = 545.55947811685144
range_min = 127.42749857031335
rss_list = []
min_val = (0,0)
for value in np.linspace(range_min, range_max):
    model = linear_model.Lasso(alpha=value, normalize=True)
    model.fit(training[all_features], training['price'])
    rss = sum((model.predict(validation[all_features].values) - validation['price'].values) ** 2)
    if np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_) == max_nonzero:
        rss_list.append(rss)
        if rss == min(rss_list):
            min_val = (value, rss, model)
print 'L1 penalty - ' + str(min_val[0]) + ': ' + str(min_val[1])
for index,feature in enumerate(all_features):
    print feature + ': ' + str(min_val[2].coef_[index])

L1 penalty - 161.56072139: 4.40595813867e+14
bedrooms: -0.0
bedrooms_square: -0.0
bathrooms: 9447.62911292
sqft_living: 163.752369128
sqft_living_sqrt: 0.0
sqft_lot: -0.0
sqft_lot_sqrt: -0.0
floors: 0.0
floors_square: 0.0
waterfront: 502105.185559
view: 41868.3046036
condition: 0.0
grade: 115681.834862
sqft_above: 0.0
sqft_basement: 0.0
yr_built: -2572.38912262
yr_renovated: 0.0
