### What is Machine Learning?

>Field of study that gives computers the ability to learn without being explicitly programmed (Arthur Samuel)

### Let's install some popular Python tools for Machine Learning

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import sklearn as sklearn

In [4]:
import matplotlib.pyplot as plt

In [5]:
%matplotlib inline

### Let's load and explore some data

In [6]:
sales = pd.read_csv('home_data.csv')

In [7]:
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


### Simple Linear Regression,  simplification one

In [8]:
simple_features=['sqft_living']
sales[simple_features+['price']].head()

Unnamed: 0,sqft_living,price
0,1180,221900
1,2570,538000
2,770,180000
3,1960,604000
4,1680,510000


In [1]:
f, ax = plt.subplots()

NameError: name 'plt' is not defined

### Simple Linear Regression, simplification two

$$y = w_0+w_1\times x_1$$

### Linear Regression as optimization problem

<img src="rss.png">

$$\sum_{i=1}^{N}(y_i-(w_0+w_1\times x_i)^{2}$$

### digression: training vs test data

In [4]:
from sklearn.cross_validation import train_test_split

In [5]:
train_data, test_data = train_test_split(sales, test_size=0.2, random_state=42)

NameError: name 'sales' is not defined

In [12]:
len(test_data)/len(sales)

0.2000185073798177

In [13]:
from sklearn import linear_model

In [14]:
simple_model=linear_model.LinearRegression()

In [1]:
X=train_data[simple_features]

NameError: name 'train_data' is not defined

In [2]:
y=train_data[['price']]

NameError: name 'train_data' is not defined

In [17]:
simple_model.fit(train_features, train_target)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
print(simple_model.intercept_)
print(simple_model.coef_)

[-41999.18454229]
[[ 279.55477851]]


In [None]:
plt.scatter(train_features, train_target, color='gray')
plt.plot(train_features,simple_model.predict(train_features), color='red')

### Let's look at few houses

In [19]:
houseA = sales[sales['id'] == 5309101200]
houseB = sales[sales['id'] == 1925069082]

In [20]:
print ('Predicted price (house A, simple model):', simple_model.predict(houseA[simple_features])[0])
print ('Actual price (house A):',houseA['price'].values[0])

Predicted price (house A, simple model): [ 628932.28388041]
Actual price (house A): 620000


In [21]:
print ('Predicted price (house B, simple model):', simple_model.predict(houseB[simple_features])[0])
print ('Actual price (house B):',houseB['price'].values[0])

Predicted price (house B, simple model): [ 1255134.9877416]
Actual price (house B): 2200000


In [22]:
house_Gates = pd.DataFrame(data={'bedrooms':[8], 
              'bathrooms':[25], 
              'sqft_living':[50000], 
              'sqft_lot':[225000],
              'floors':[4], 
              'zipcode':['98039'], 
              'condition':[10], 
              'grade':[10],
              'waterfront':[1],
              'view':[4],
              'sqft_above':[37500],
              'sqft_basement':[12500],
              'yr_built':[1994],
              'yr_renovated':[2010],
              'lat':[47.627606],
              'long':[-122.242054],
              'sqft_living15':[5000],
              'sqft_lot15':[40000]})

In [23]:
price = simple_model.predict(house_Gates[simple_features])[0,0]
print("Predicted price for Bill Gates' house, according to the simple model:", price)

Predicted price for Bill Gates' house, according to the simple model: 13935739.7409


>All models are wrong, but some are useful (George Box)

### Let's try another, more complicated model

In [24]:
other_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

In [25]:
train_features=train_data[other_features]

In [26]:
train_target=train_data['price']

In [27]:
other_model=linear_model.LinearRegression()

In [28]:
other_model.fit(train_features, train_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [29]:
print ('Predicted price (house A, other model):', other_model.predict(houseA[other_features])[0])
print ('Actual price (house A):',houseA['price'].values[0])

Predicted price (house A, other model): 632032.391385
Actual price (house A): 620000


In [30]:
print ('Predicted price (house B, other model):', other_model.predict(houseB[other_features])[0])
print ('Actual price (house B):',houseB['price'].values[0])

Predicted price (house B, other model): 1263455.2931
Actual price (house B): 2200000


In [35]:
price = other_model.predict(house_Gates[other_features])[0]
print("Predicted price for Bill Gates' house, according to the other model:", price)

Predicted price for Bill Gates' house, according to the other model: 15553462.1759


### Evaluate models

In [31]:
def rss(model, data, target):
    predictions = model.predict(data)
    return np.sum((predictions-target)**2)

In [32]:
simple_rss = rss(simple_model, test_data[simple_features], test_data['price'].values)

In [33]:
other_rss = rss(other_model, test_data[other_features], test_data['price'].values)

In [34]:
print('Simple model RSS:',simple_rss)
print('Other model RSS:',other_rss)

Simple model RSS: 4.17519829097e+18
Other model RSS: 3.16823143923e+14
