In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

In [3]:
## Read data
ic = pd.read_csv("IowaCityHomeSales.csv")

## Split to training and testing sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(ic, test_size=0.2, random_state=7)

## Create target
train_y = train['sale.amount']

## Create predictor matrix (numeric predictors only for simplicity, but we could use OHE if we wanted to)
train_X = train.select_dtypes("number").drop('sale.amount',axis=1)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline 

## Simple pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('model', LinearRegression())
])

In [5]:
## Fit the pipeline to the training data
lin_mod = pipe.fit(train_X, train_y)

## print the model's coefficients
lin_mod.named_steps['model'].coef_

array([-1.25805804e+03,  1.92884548e+03,  6.72865255e+02, -1.92085650e+03,
       -7.95482914e+01, -3.12181937e+03,  1.97731725e+03,  8.24129242e+02,
        1.82409979e+03,  3.74282983e+02,  2.29862180e+00,  8.78235016e+04])

In [6]:
#Question 1a
## Create target
train_y_assessed = train['assessed']

## Create predictor matrix (numeric predictors only for simplicity, but we could use OHE if we wanted to)
train_X_assessed = train.select_dtypes("number").drop('sale.amount',axis=1)

train_X_assessed = train_X_assessed[['bedrooms', 'area.lot', 'area.garage1']]

In [7]:
from sklearn.preprocessing import PowerTransformer
pipe = Pipeline([
    ('powertransformer', PowerTransformer(method = 'yeo-johnson')),
    ('scaler', MinMaxScaler()),
    ('model', LinearRegression())
])



In [8]:
from sklearn.model_selection import cross_validate
cv_res = cross_validate(pipe, train_X_assessed, train_y_assessed, cv = 5, scoring='neg_mean_squared_error')
rmse = np.sqrt(-np.mean(cv_res['test_score']))
print(f'rmse is {rmse}')

rmse is 70679.75801136302


In [9]:
# Question 2a
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

pipe = Pipeline([
('scaler', StandardScaler()),
('regressor', KNeighborsRegressor())
])

parms = {
         'regressor__n_neighbors': [3, 5, 10, 20, 30],
         'regressor__weights': ['uniform','distance'],
        }

mod_comp = GridSearchCV(pipe, parms, cv=5, scoring='neg_mean_squared_error').fit(train_X_assessed, train_y_assessed)
print(mod_comp.best_estimator_)

Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor',
                 KNeighborsRegressor(n_neighbors=20, weights='distance'))])


In [10]:
# Question 2b
pipe = Pipeline([
('scaler', StandardScaler()),
('regressor', KNeighborsRegressor(n_neighbors=20, weights='distance'))
])

cv_res = cross_validate(pipe, train_X_assessed, train_y_assessed, cv = 5, scoring='neg_mean_squared_error')
rmse = np.sqrt(-np.mean(cv_res['test_score']))
print(f'rmse is {rmse}')

# knn rmse is lower than that of linear regression

rmse is 56843.41479618382


# Question 3a

Cost function: 
$$
\text{Cost} = \frac{1}{n}\sum^n _{i=0} (y_i-x_i \cdot w)^2
$$

Gradient: 
$$
\text{Gradient} = \frac{-2}{n} \sum^n _{i=0} (x_i^2 \cdot w)
$$

In [28]:
# Qeustion 3b
## Define cost function
def cost_function(x, y, w):
    n = len(y)
    pred_y = x * w
    err = y - pred_y
    cost = (1./n)* err ** 2               
    return sum(cost)    

def grad_descent(x, y, w, alpha, n_iter):
    costs = np.zeros(n_iter)
    n = len(y) 
    
    for i in range(n_iter):
        grad = (-2./n) * x**2 * w
        w = w - alpha*grad
        costs[i] = cost_function(x, y, w)
        
    return w, costs

In [37]:
cost_function(x_assess, y_sale_price, w_init)

40735272624.85346

In [34]:
#Question 3c

## Standardized version of the only predictor "assessed"
x_assess = (train_X['assessed'] - np.average(train_X['assessed']))/np.std(train_X['assessed'])
y_sale_price = train['sale.amount'].to_numpy()
w_init = np.zeros(len(x_assess))


In [35]:
## Run gradient descent
weight, costs = grad_descent(x = x_assess, 
                             y = y_sale_price, 
                             w = w_init, 
                             alpha = 0.3, 
                             n_iter = 100)

In [36]:
costs

array([4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
       4.07352726e+10, 4.07352726e+10, 4.07352726e+10, 4.07352726e+10,
      