# Linear Regression

    1. CRIM      per capita crime rate by town
    2. ZN        proportion of residential land zoned for lots over 
                 25,000 sq.ft.
    3. INDUS     proportion of non-retail business acres per town
    4. CHAS      Charles River dummy variable (= 1 if tract bounds 
                 river; 0 otherwise)
    5. NOX       nitric oxides concentration (parts per 10 million)
    6. RM        average number of rooms per dwelling
    7. AGE       proportion of owner-occupied units built prior to 1940
    8. DIS       weighted distances to five Boston employment centres
    9. RAD       index of accessibility to radial highways
    10. TAX      full-value property-tax rate per $10,000
    11. PTRATIO  pupil-teacher ratio by town
    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
                 by town
    13. LSTAT    % lower status of the population
    14. MEDV     Median value of owner-occupied homes in $1000's


In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# load csv file
df = pd.read_csv('./dataset/housing.csv', header=None, delim_whitespace=True)

y = df[13]
X = df.drop([13], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print("x_train: {}\n".format(X_train.shape))
print("x_test: {}\n".format(X_test.shape))

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error
print("Mean squared error: %.2f".format(mean_squared_error(y_test, y_pred)))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))



x_train: (455, 13)

x_test: (51, 13)

Coefficients: [ -1.15902489e-01   4.38372231e-02   2.08592006e-03   2.83835630e+00
  -1.93610545e+01   3.46575843e+00   5.52454431e-03  -1.55658844e+00
   2.90907102e-01  -1.05398181e-02  -9.71060523e-01   8.40192032e-03
  -5.43886263e-01]

Mean squared error: %.2f
Variance score: 0.86


# Polynomial Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline



for count, degree in enumerate([3, 4, 5]):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    y_plot = model.predict(X_plot)
    plt.plot(x_plot, y_plot, color=colors[count], linewidth=lw,
             label="degree %d" % degree)

# Logistic Regression