In [None]:
import pandas as pd
import numpy as np

## Load data from csv file

In [None]:
# Column names for the 14 fields of housing.data. The last attribute is called
# MEDV in the dataset description below; it is loaded here under the name 'PRICE'.
names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','PRICE']

# The file has no header row and separates fields with runs of whitespace.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
# Any '?' entries are read as NaN.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
                 header=None, names=names, sep=r'\s+', na_values='?')

"""
Attribute Information:
    1.  CRIM      per capita crime rate by town
    2.  ZN        proportion of residential land zoned for lots over 
                  25,000 sq.ft.
    3.  INDUS     proportion of non-retail business acres per town
    4.  CHAS      Charles River dummy variable (= 1 if tract bounds 
                  river; 0 otherwise)
    5.  NOX       nitric oxides concentration (parts per 10 million)
    6.  RM        average number of rooms per dwelling
    7.  AGE       proportion of owner-occupied units built prior to 1940
    8.  DIS       weighted distances to five Boston employment centres
    9.  RAD       index of accessibility to radial highways
    10. TAX       full-value property-tax rate per $10,000
    11. PTRATIO   pupil-teacher ratio by town
    12. B         1000(Bk - 0.63)^2 where Bk is the proportion of blocks by town
    13. LSTAT     % lower status of the population
    14. MEDV      Median value of owner-occupied homes in $1000's
"""



In [None]:
# Sanity-check the load: report the container type, then preview the first
# five rows and the (rows, columns) shape.
frame_type = type(df)
print('df is an object of ', frame_type)
print('\n')

preview = df.head(5)
print(preview)
print(df.shape)

### Store values in the pandas dataframe as numpy arrays
- we want to use the average number of rooms to predict the housing price
- we need to extract the data from df and convert them to numpy arrays

In [None]:
# Pull the columns of interest out of the DataFrame as plain numpy arrays:
# x is the feature (RM), y is the target (PRICE), crime is the CRIM column.
x, y = df['RM'].values, df['PRICE'].values
crime = df['CRIM'].values

print('both x and y are now objects of', type(x))
print(crime.shape)

### Plot the housing price against the average number of rooms

In [None]:
import matplotlib.pyplot as plt

# Scatter of price (y) against average number of rooms (x); 'o' draws markers only.
plt.plot(x, y, 'o')
plt.ylabel('Price')
plt.xlabel('Average Number of Rooms')
plt.grid()  # kept last: it returns None, so the cell shows no text repr

# Guess a line to fit the data

In [None]:
# Three hand-picked candidate lines y = slope * x + intercept, evaluated on a
# common grid of 100 points between 3 and 9 rooms.
xplt = np.linspace(3, 9, 100)

w1, w0 = 9, -30
yplt = w1 * xplt + w0
yplt1 = 12 * xplt - 53
yplt2 = xplt * 13 - 60

# Data points first, then each candidate line on top, in the same order as before.
plt.plot(x, y, 'o')
for candidate in (yplt, yplt1, yplt2):
    plt.plot(xplt, candidate, '-', linewidth=3)

plt.xlabel('Average number of rooms in a region')
plt.ylabel('Price')
plt.grid()


## Calculate the Mean Squared Error (MSE) and Mean Absolute Error (MAE) to determine goodness of fit

### Reminder :

Given :
- a dataset : $(x_i, y_i)$, $i = 1, 2, 3, ..., N$
- a model : $\hat{y} = w_1x + w_0$

We can compute the following two error functions, where $\hat{y}_i = w_1 x_i + w_0$ is the model's prediction for sample $i$:
- Mean Squared Error: $\displaystyle MSE = \frac{1}{N}\sum_{i=1}^N (y_i - \hat{y}_i)^2$
- Mean Absolute Error: $\displaystyle MAE = \frac{1}{N}\sum_{i=1}^N |y_i - \hat{y}_i|$

In [None]:
## To-do
yhat = 13*x -60
mse_ = np.mean((y - yhat)**2)
mae_ = np.mean(np.abs(y - yhat))

print(mse_)
print(mae_)

In [None]:
# Least-squares fit of y on x.
# Build the design matrix X = [1, x] so the intercept is learned together with
# the slope: w[0] is the intercept, w[1] is the slope.
ones_v = np.ones((x.shape[0], 1))
X = np.hstack([ones_v, x.reshape((-1, 1))])

# Solve the normal equations (X^T X) w = X^T y directly instead of forming the
# explicit inverse of X^T X; same solution, better numerical behaviour.
w = np.linalg.solve(X.T @ X, X.T @ y)
print(w)

# Predictions of the fitted line at the observed x values.
y_hat = w[1] * x + w[0]

plt.plot(x, y, 'o')
plt.plot(x, y_hat, '-', linewidth=3)
plt.xlabel('Average number of rooms in a region')
plt.ylabel('Price')
plt.grid()

# Errors of the FITTED line. This must use y_hat (computed above), not yhat,
# which is the hand-guessed line from the earlier cell; using yhat here would
# just re-report the guess's errors.
mse_ = np.mean((y - y_hat) ** 2)
mae_ = np.mean(np.abs(y - y_hat))

print(mse_)
print(mae_)