In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../GM2/kc_house_data.csv")

In [3]:
price_arr = np.array(df['price'])

In [4]:
df = df.drop(columns=['id', 'price'])
df.shape

(21613, 19)

In [5]:
df.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,20141013T000000,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,20141209T000000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,20150225T000000,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,20141209T000000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,20150218T000000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [6]:
def unix_date(dataframe):
    unix_times = np.zeros(len(dataframe))
    for index, row in dataframe.iterrows():
        human_time = row[1]
        unix_time = pd.Timestamp(human_time).value
        unix_times[index] += unix_time
    return unix_times

unix_times = unix_date(df)
df['date'] = pd.DataFrame(unix_times)

In [7]:
df.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,3.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,3.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,2.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,4.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,3.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# De-mean each data
data = df.as_matrix()
for i in range(data.shape[1]):
    column = data[:,i]
    mean = np.mean(column)
    column -= mean
    
data = scale(data)

In [9]:
data.shape

(21613, 19)

In [10]:
pca = PCA(n_components=10)
X_new = pca.fit_transform(data)

In [11]:
X_new.shape

(21613, 10)

In [12]:
print(pca.explained_variance_ratio_)
print()
print(np.sum(pca.explained_variance_ratio_))

[0.29482985 0.12065187 0.09711362 0.08406549 0.06386613 0.05499871
 0.04963488 0.04436009 0.04100893 0.03285118]

0.8833807462853775


In [13]:
pca.singular_values_

array([347.95257515, 222.58779819, 199.69831258, 185.79892293,
       161.94577552, 150.28325507, 142.76699961, 134.96790844,
       129.76977074, 116.14748471])

In [14]:
# Directions of max variance

principal_components = pca.components_
principal_components.shape

(10, 19)

In [15]:
# first_pc = principal_components[0]
# second_pc = principal_components[1]

In [16]:
lin_reg = LinearRegression()

x_NO_PCA = data
x_PCA = X_new

y = price_arr

In [17]:
x_NO_PCA.shape, x_PCA.shape, y.shape

((21613, 19), (21613, 10), (21613,))

In [18]:
# Linear regression WITHOUT using PCA

X_train, X_test, y_train, y_test = train_test_split(x_NO_PCA, y, test_size=0.3, random_state=0)
print (X_train.shape,  y_train.shape)
print()
print (X_test.shape, y_test.shape)
start_time = time.time()
lin_reg.fit(X_train, y_train)
end_time = time.time()
print()
print (end_time - start_time)

(15129, 19) (15129,)

(6484, 19) (6484,)

0.00935816764831543


In [19]:
NO_PCA_predictions = lin_reg.predict(X_test)

NO_PCA_prediction_error = mse(y_test, NO_PCA_predictions) / len(y_test)
print (NO_PCA_prediction_error)

6614379.9182441635


In [20]:
# Linear regression using PCA

X_train, X_test, y_train, y_test = train_test_split(x_PCA, y, test_size=0.3, random_state=0)
print (X_train.shape,  y_train.shape)
print
print (X_test.shape, y_test.shape)

(15129, 10) (15129,)
(6484, 10) (6484,)


In [21]:
start_time = time.time()

lin_reg.fit(X_train, y_train)

end_time = time.time()
print (end_time - start_time)
print (end_time / start_time)

0.0065038204193115234
1.000000000004277


In [22]:
PCA_predictions = lin_reg.predict(X_test)

PCA_prediction_error = mse(y_test, PCA_predictions) / len(y_test)
print (PCA_prediction_error)

7398106.588726568


In [23]:
from __future__ import division
print (PCA_prediction_error / NO_PCA_prediction_error)

1.1184883058078785
