In [1]:
import pandas as pd

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
sales = sales.sort(['sqft_living','price'])



In [2]:
l2_small_penalty = 1.5e-5

In [3]:
def polynomial_dataframe(feature, degree): # feature is pandas.Series type
    # assume that degree >= 1
    # initialize the dataframe:
    poly_dataframe = pd.DataFrame()
    # and set poly_dataframe['power_1'] equal to the passed feature
    poly_dataframe['power_1'] = feature
    # first check if degree > 1
    if degree > 1:
        # then loop over the remaining degrees:
        for power in range(2, degree+1):
            # first we'll give the column a name:
            name = 'power_' + str(power)
            # assign poly_dataframe[name] to be feature^power; use apply(*)
            tmp = feature.apply(lambda x : x**power)
            poly_dataframe[name] = tmp
    return poly_dataframe

In [5]:
from sklearn import linear_model
import numpy as np

poly15_data = polynomial_dataframe(sales['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model.fit(poly15_data, sales['price'])

Ridge(alpha=1.5e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [6]:
model.coef_

array([  1.24873306e+02,  -4.77376011e-02,   3.01446238e-05,
        -2.44419942e-09,  -1.94153675e-13,   8.54085686e-18,
         1.51142121e-21,   8.27979094e-26,   6.52603100e-31,
        -3.27895017e-34,  -3.87962315e-38,  -2.72437650e-42,
        -1.07790800e-46,   3.78242694e-51,   1.39790296e-54])

In [8]:
poly15_data.head()

Unnamed: 0,power_1,power_2,power_3,power_4,power_5,power_6,power_7,power_8,power_9,power_10,power_11,power_12,power_13,power_14,power_15
19452,290.0,84100.0,24389000.0,7072810000.0,2051115000000.0,594823300000000.0,1.724988e+17,5.002464e+19,1.450715e+22,4.207072e+24,1.220051e+27,3.5381479999999994e+29,1.026063e+32,2.9755819999999998e+34,8.629189e+36
15381,370.0,136900.0,50653000.0,18741610000.0,6934396000000.0,2565726000000000.0,9.493188e+17,3.512479e+20,1.299617e+23,4.808584e+25,1.779176e+28,6.582952e+30,2.4356920000000002e+33,9.012060999999999e+35,3.3344630000000003e+38
860,380.0,144400.0,54872000.0,20851360000.0,7923517000000.0,3010936000000000.0,1.144156e+18,4.347792e+20,1.652161e+23,6.278212e+25,2.385721e+28,9.065738e+30,3.44498e+33,1.3090930000000001e+36,4.9745520000000005e+38
18379,384.0,147456.0,56623104.0,21743270000.0,8349416000000.0,3206176000000000.0,1.231172e+18,4.727699e+20,1.815436e+23,6.971275e+25,2.67697e+28,1.027956e+31,3.9473530000000003e+33,1.515783e+36,5.8206080000000005e+38
4868,390.0,152100.0,59319000.0,23134410000.0,9022420000000.0,3518744000000000.0,1.37231e+18,5.352009e+20,2.087284e+23,8.140406e+25,3.174758e+28,1.2381560000000002e+31,4.828807e+33,1.883235e+36,7.344616000000001e+38


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
19452,3980300371,20140926T000000,142000.0,0.0,0.0,290.0,20875,1.0,0,0,...,1,290,0,1963,0,98024,47.5308,-121.888,1620.0,22850.0
15381,2856101479,20140701T000000,276000.0,1.0,0.75,370.0,1801,1.0,0,0,...,5,370,0,1923,0,98117,47.6778,-122.389,1340.0,5000.0
860,1723049033,20140620T000000,245000.0,1.0,0.75,380.0,15000,1.0,0,0,...,5,380,0,1963,0,98168,47.481,-122.323,1170.0,15000.0
18379,1222029077,20141029T000000,265000.0,0.0,0.75,384.0,213444,1.0,0,0,...,4,384,0,2003,0,98070,47.4177,-122.491,1920.0,224341.0
4868,6896300380,20141002T000000,228000.0,0.0,1.0,390.0,5900,1.0,0,0,...,4,390,0,1953,0,98118,47.526,-122.261,2170.0,6000.0


In [11]:

set_1 = pd.read_csv('wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
set_2 = pd.read_csv('wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
set_3 = pd.read_csv('wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
set_4 = pd.read_csv('wk3_kc_house_set_4_data.csv', dtype=dtype_dict)

In [12]:
l2_small_penalty=1e-9

In [14]:
poly15_data1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model1.fit(poly15_data1, set_1['price'])

poly15_data2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model2.fit(poly15_data2, set_2['price'])

poly15_data3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model3.fit(poly15_data3, set_3['price'])

poly15_data4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model4.fit(poly15_data4, set_4['price'])

Ridge(alpha=1e-09, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [15]:
model1.coef_

array([  5.44669398e+02,  -3.55447604e-01,   1.22446380e-04,
        -1.17175308e-08,  -3.90512634e-13,  -1.39075978e-17,
         1.47860259e-20,   6.87492250e-25,  -7.57204275e-29,
        -1.04097316e-32,  -3.71843896e-37,   3.39989310e-41,
         5.56591963e-45,   2.53761354e-49,  -3.35152890e-53])

In [16]:
model2.coef_

array([  8.59362634e+02,  -8.18118252e-01,   4.28879964e-04,
        -9.12770587e-08,  -2.69604533e-12,   3.73980307e-15,
        -1.42711886e-19,  -6.30794673e-23,  -1.44559614e-27,
         7.44321256e-31,   9.25865747e-35,   3.28056306e-41,
        -1.29543524e-42,  -1.38781282e-46,   1.66546461e-50])

In [17]:
model3.coef_

array([ -7.55395888e+02,   9.75579449e-01,  -4.58945950e-04,
         7.77957921e-08,   7.15013787e-12,  -2.88602042e-15,
        -2.13677557e-20,   3.38085227e-23,   2.19178144e-27,
        -1.97067765e-31,  -4.15993095e-35,  -1.80196224e-39,
         3.19071106e-43,   5.08456989e-47,  -3.93304285e-51])

In [18]:
model4.coef_

array([  1.11944576e+03,  -9.83760301e-01,   3.38770980e-04,
         3.60376781e-08,  -4.37813901e-11,   5.77191576e-15,
         7.66795056e-19,  -9.49297212e-23,  -1.96030831e-26,
        -2.10883053e-32,   3.31005042e-34,   3.47733855e-38,
        -2.43039036e-42,  -8.79553560e-46,   6.44569785e-50])

In [24]:
l2_large_penalty=1.23e2
poly15_data1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model1.fit(poly15_data1, set_1['price'])

poly15_data2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model2.fit(poly15_data2, set_2['price'])

poly15_data3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model3.fit(poly15_data3, set_3['price'])

poly15_data4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model4.fit(poly15_data4, set_4['price'])

Ridge(alpha=123.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [25]:
model1.coef_

array([  2.32806803e+00,   3.53621608e-04,   3.31969692e-08,
         2.00082477e-12,   1.11492559e-16,   6.57786122e-21,
         4.12939525e-25,   2.70393755e-29,   1.81614763e-33,
         1.23824277e-37,   8.51872481e-42,   5.89455598e-46,
         4.09542560e-50,   2.85464889e-54,   1.99547476e-58])

In [26]:
model2.coef_

array([  2.09756903e+00,   3.90817483e-04,   6.67189944e-08,
         8.90002997e-12,   9.72639877e-16,   9.69733682e-20,
         9.50564475e-24,   9.44491031e-28,   9.57191338e-32,
         9.86945155e-36,   1.03101115e-39,   1.08729784e-43,
         1.15453748e-47,   1.23211305e-51,   1.31986696e-55])

In [27]:
model3.coef_

array([  2.28906258e+00,   4.12472190e-04,   6.08835345e-08,
         6.58572163e-12,   6.15278155e-16,   5.64446634e-20,
         5.28834396e-24,   5.07091402e-28,   4.94657273e-32,
         4.88043809e-36,   4.85009106e-40,   4.84161534e-44,
         4.84635021e-48,   4.85883628e-52,   4.87558469e-56])

In [28]:
model4.coef_

array([  2.08596194e+00,   4.05035772e-04,   7.46864647e-08,
         1.13096608e-11,   1.45864442e-15,   1.73561251e-19,
         2.01609632e-23,   2.34605255e-27,   2.75636073e-31,
         3.27043069e-35,   3.91046855e-39,   4.70118041e-43,
         5.67212304e-47,   6.85958087e-51,   8.30843630e-55])

In [30]:
train_valid_shuffled = pd.read_csv('wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [32]:
n = len(train_valid_shuffled)
k = 10 # 10-fold cross-validation

for i in xrange(k):
    start = (n*i)/k
    end = (n*(i+1))/k-1
    print i, (start, end)

0 (0, 1938)
1 (1939, 3878)
2 (3879, 5817)
3 (5818, 7757)
4 (7758, 9697)
5 (9698, 11636)
6 (11637, 13576)
7 (13577, 15515)
8 (15516, 17455)
9 (17456, 19395)


In [34]:
train_valid_shuffled1 = train_valid_shuffled[0:0].append(train_valid_shuffled[1938+1:n])
train_valid_shuffled2 = train_valid_shuffled[0:1939].append(train_valid_shuffled[3878+1:n])
train_valid_shuffled3 = train_valid_shuffled[0:3879].append(train_valid_shuffled[5817+1:n])
train_valid_shuffled4 = train_valid_shuffled[0:5818].append(train_valid_shuffled[7757+1:n])
train_valid_shuffled5 = train_valid_shuffled[0:7758].append(train_valid_shuffled[9697+1:n])
train_valid_shuffled6 = train_valid_shuffled[0:9698].append(train_valid_shuffled[11636+1:n])
train_valid_shuffled7 = train_valid_shuffled[0:11637].append(train_valid_shuffled[13576+1:n])
train_valid_shuffled8 = train_valid_shuffled[0:13577].append(train_valid_shuffled[15515+1:n])
train_valid_shuffled9 = train_valid_shuffled[0:15516].append(train_valid_shuffled[17455+1:n])
train_valid_shuffled10 = train_valid_shuffled[0:17456].append(train_valid_shuffled[19395+1:n])


In [66]:
def k_fold_cross_validation(k, l2_penalty, data, output):
    RSS_list= []
    for i in xrange(k):
        start = (n*i)/k
        end = (n*(i+1))/k-1
        # print i, (start, end)
        train_set = data[start:end+1]
        valid_set = data[0:start].append(data[end+1:n])
        model = linear_model.Ridge(alpha=l2_penalty, normalize=True)
        model.fit(data, output)
        RSS = np.sum((model.predict(data) -output)**2)
        RSS_list.append(RSS)
    average_validation_error = np.mean(RSS_list)
    print average_validation_error
    return average_validation_error

In [67]:
np.logspace(3, 9, num=13)

array([  1.00000000e+03,   3.16227766e+03,   1.00000000e+04,
         3.16227766e+04,   1.00000000e+05,   3.16227766e+05,
         1.00000000e+06,   3.16227766e+06,   1.00000000e+07,
         3.16227766e+07,   1.00000000e+08,   3.16227766e+08,
         1.00000000e+09])

In [68]:
poly_poly = polynomial_dataframe(train_valid_shuffled['sqft_living'], 15)

In [69]:
for l2_penalty in l2_pens:
    k_fold_cross_validation(10, l2_penalty, poly_poly, train_valid_shuffled['price'])

2.65052019507e+15
2.65701202728e+15
2.65908020854e+15
2.65973577141e+15
2.65994323382e+15
2.66000885474e+15
2.66002960745e+15
2.66003617019e+15
2.66003824553e+15
2.66003890181e+15
2.66003910934e+15
2.66003917497e+15
2.66003919573e+15


In [75]:
train_data = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
#train_data = train_data.sort(['sqft_living','price'])
train_data_poly = polynomial_dataframe(train_data['sqft_living'], 15)

In [76]:
model = linear_model.Ridge(alpha=l2_pens[0], normalize=True)
model.fit(train_data_poly, train_data['price'])

Ridge(alpha=1000.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [77]:
test_data = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
test_data_poly = polynomial_dataframe(test_data['sqft_living'], 15)

In [79]:
rss_test = np.sum((model.predict(test_data_poly) - test_data['price']) **2)