# Machine Learning using Linear Regression

In [None]:
from IPython.display import Image

In [None]:
Image('https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Linear_regression.svg/400px-Linear_regression.svg.png') 

In [None]:
Image('https://image.slidesharecdn.com/8-1209490505240696-9/95/multiple-linear-regression-16-638.jpg')

In [None]:
Image('http://3.bp.blogspot.com/-N5Rl3a87jgE/UTNCaRNMl3I/AAAAAAAAAcs/nJZVjAEiR3g/s1600/multi+regression+equation.png') 

# Housing Data Set

http://archive.ics.uci.edu/ml/datasets/Housing

#### Attribute Information:

    1. CRIM      per capita crime rate by town
    2. ZN        proportion of residential land zoned for lots over 
                 25,000 sq.ft.
    3. INDUS     proportion of non-retail business acres per town
    4. CHAS      Charles River dummy variable (= 1 if tract bounds 
                 river; 0 otherwise)
    5. NOX       nitric oxides concentration (parts per 10 million)
    6. RM        average number of rooms per dwelling
    7. AGE       proportion of owner-occupied units built prior to 1940
    8. DIS       weighted distances to five Boston employment centres
    9. RAD       index of accessibility to radial highways
    10. TAX      full-value property-tax rate per $10,000
    11. PTRATIO  pupil-teacher ratio by town
    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
                 by town
    13. LSTAT    % lower status of the population
    14. MEDV     Median value of owner-occupied homes in $1000's
    


#### Source:

*Origin:* 

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. 

*Creator:* 

Harrison, D. and Rubinfeld, D.L. 
'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.




In [None]:
import sklearn
import pandas as pd
import numpy as np

In [None]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import seaborn as sns
%matplotlib inline

In [None]:
# Load the whitespace-separated housing table straight from the UCI archive.
# `names` is passed without `header`, so the first line of the file is read as
# data and the columns are labelled with the attribute names listed above.
df = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    sep=r"\s+",
    names=[
        "CRIM",
        "ZN",
        "INDUS",
        "CHAS",
        "NOX",
        "RM",
        "AGE",
        "DIS",
        "RAD",
        "TAX",
        "PTRATIO",
        "B",
        "LSTAT",
        "MEDV",
    ],
)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Show every row that has at least one missing value (an empty frame means none).
df.loc[df.isnull().any(axis=1)]

In [None]:
# Feature matrix: the 13 explanatory columns, i.e. everything except the MEDV target.
X = df.loc[
    :,
    [
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
        "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
    ],
]

In [None]:
X.shape

In [None]:
# Target vector: the MEDV column.
y = df.loc[:, "MEDV"]

In [None]:
y.shape

In [None]:
type(X)

In [None]:
type(y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
Image('https://i.stack.imgur.com/8RlJk.png')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
# Linear regression estimator from scikit-learn, constructed with its default settings.
reg = linear_model.LinearRegression()

In [None]:
# Fit the model on the training split only; the test split stays unseen.
reg.fit(X=X_train, y=y_train)

In [None]:
reg.intercept_

In [None]:
reg.coef_

In [None]:
# Predictions for the held-out feature rows; these are compared against y_test
# in the plot and metric cells that follow.
y_pred = reg.predict(X_test)

In [None]:
type(y_pred)

In [None]:
type(y_test)

In [None]:
# Convert the test targets to a plain NumPy array so they can be plotted by
# position alongside y_pred. `Series.as_matrix()` was removed in pandas 1.0;
# `to_numpy()` is the supported replacement.
y_test_m = y_test.to_numpy()

In [None]:
type(y_test_m)

In [None]:
# Overlay actual and predicted MEDV for the held-out rows, in test-set order.
# Labels are attached to each line so the legend cannot drift out of sync with
# the order of the plot calls.
fig, ax = plt.subplots(figsize=(15, 15))
ax.plot(y_test_m, label='y_test_m')
ax.plot(y_pred, label='y_pred')
ax.set_xlabel('Test sample (position in test set)')
ax.set_ylabel("MEDV ($1000's)")
ax.set_title('Actual vs. predicted MEDV on the test set')
# Trailing semicolon suppresses the Legend repr in the cell output.
ax.legend(fontsize=25, loc=4);

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
r2_score(y_test,y_pred)

In [None]:
# Predict MEDV for one hand-entered observation. The model was fitted on a
# DataFrame, so the row is wrapped in a DataFrame carrying the same feature
# columns: this keeps each value tied to its column name and avoids
# scikit-learn's "X does not have valid feature names" warning that a bare
# nested list triggers.
new_observation = pd.DataFrame(
    [[1.23247, 0.00, 6.140, 0, 0.5380, 6.1420, 85.70, 3.9769, 4, 307.0, 21.00, 406.90, 18.72]],
    columns=X.columns,
)
reg.predict(new_observation)