# Linear Regression Example

This example uses only a single feature of the diabetes dataset (the column at index 2), in order to illustrate a two-dimensional plot of this regression technique.

http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Note: the sklearn.datasets module includes a built-in diabetes dataset, which we will use in this demonstration.

In [97]:
# Load the diabetes dataset bundled with scikit-learn (sklearn.datasets)
diabetes = datasets.load_diabetes()

In [98]:
# Note: load_diabetes() returns a Bunch container (see the output below),
# not a NumPy array or a pandas DataFrame
type(diabetes)

sklearn.utils.Bunch

In [99]:
# The feature matrix is a NumPy array, exposed through the `data` attribute
type(diabetes.data)

numpy.ndarray

## Let's view the contents of the loaded diabetes dataset

In [100]:
# Peek at the feature matrix; NumPy abbreviates the middle rows and columns with "..."
diabetes.data

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ..., 
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [101]:
# View the shape of the array: 442 rows by 10 columns
diabetes.data.shape

(442, 10)

## For this simple linear regression example, we will use only one feature (independent variable) from the dataset.

In [102]:
# Keep a single feature: every row of column index 2 (zero-based).
# Slicing with 2:3 instead of indexing with 2 preserves the column axis,
# so the result is already a 2D array with one column per sample row.
diabetes_X1 = diabetes.data[:, 2:3]

In [103]:
# Display the contents of the single-feature column
diabetes_X1

array([[ 0.06169621],
       [-0.05147406],
       [ 0.04445121],
       [-0.01159501],
       [-0.03638469],
       [-0.04069594],
       [-0.04716281],
       [-0.00189471],
       [ 0.06169621],
       [ 0.03906215],
       [-0.08380842],
       [ 0.01750591],
       [-0.02884001],
       [-0.00189471],
       [-0.02560657],
       [-0.01806189],
       [ 0.04229559],
       [ 0.01211685],
       [-0.0105172 ],
       [-0.01806189],
       [-0.05686312],
       [-0.02237314],
       [-0.00405033],
       [ 0.06061839],
       [ 0.03582872],
       [-0.01267283],
       [-0.07734155],
       [ 0.05954058],
       [-0.02129532],
       [-0.00620595],
       [ 0.04445121],
       [-0.06548562],
       [ 0.12528712],
       [-0.05039625],
       [-0.06332999],
       [-0.03099563],
       [ 0.02289497],
       [ 0.01103904],
       [ 0.07139652],
       [ 0.01427248],
       [-0.00836158],
       [-0.06764124],
       [-0.0105172 ],
       [-0.02345095],
       [ 0.06816308],
       [-0

In [104]:
# Extract the target vector y (the labels the model will be trained to predict)
diabetes_y = diabetes.target

In [105]:
# Display the contents of the target (y) array
diabetes_y

array([ 151.,   75.,  141.,  206.,  135.,   97.,  138.,   63.,  110.,
        310.,  101.,   69.,  179.,  185.,  118.,  171.,  166.,  144.,
         97.,  168.,   68.,   49.,   68.,  245.,  184.,  202.,  137.,
         85.,  131.,  283.,  129.,   59.,  341.,   87.,   65.,  102.,
        265.,  276.,  252.,   90.,  100.,   55.,   61.,   92.,  259.,
         53.,  190.,  142.,   75.,  142.,  155.,  225.,   59.,  104.,
        182.,  128.,   52.,   37.,  170.,  170.,   61.,  144.,   52.,
        128.,   71.,  163.,  150.,   97.,  160.,  178.,   48.,  270.,
        202.,  111.,   85.,   42.,  170.,  200.,  252.,  113.,  143.,
         51.,   52.,  210.,   65.,  141.,   55.,  134.,   42.,  111.,
         98.,  164.,   48.,   96.,   90.,  162.,  150.,  279.,   92.,
         83.,  128.,  102.,  302.,  198.,   95.,   53.,  134.,  144.,
        232.,   81.,  104.,   59.,  246.,  297.,  258.,  229.,  275.,
        281.,  179.,  200.,  200.,  173.,  180.,   84.,  121.,  161.,
         99.,  109.,

## Next we split the X and Y datasets into train and test data.

In [106]:
# Hold out 20% of the samples as test data; the fixed random_state pins the
# shuffle so the same split is produced on every run.
(
    diabetes_X_train,
    diabetes_X_test,
    diabetes_y_train,
    diabetes_y_test,
) = train_test_split(
    diabetes_X1,
    diabetes_y,
    test_size=0.2,
    random_state=101,
)

In [107]:
# Display the shape of each train/test split: (rows, columns) for X, (rows,) for y.
# The first element of every shape is the number of rows in that split.
print("X_train rows ", diabetes_X_train.shape)
print("X_test  rows ", diabetes_X_test.shape)
print("y_train rows ", diabetes_y_train.shape)
print("y_test  rows", diabetes_y_test.shape)

X_train rows  (353, 1)
X_test  rows  (89, 1)
y_train rows  (353,)
y_test  rows (89,)


## Next we create an instance of the linear regression model and fit it to the training data

In [108]:
# Create the linear regression estimator
model = linear_model.LinearRegression()

# Fit the model to the training data.
# fit() is the last expression in the cell, so its return value (the fitted
# estimator) is what the notebook displays as the cell output.
model.fit(diabetes_X_train, diabetes_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Now we use the test data to make predictions using the model we trained

In [109]:
# Predict targets for the held-out test features using the fitted model
diabetes_y_pred = model.predict(diabetes_X_test)

In [110]:
# Display the predicted target values for the test set
diabetes_y_pred

array([ 124.5468645 ,  134.64510677,  209.37209952,  156.86123975,
        208.36227529,  152.82194284,  196.24438458,  119.49774337,
        126.56651296,   80.11459854,   93.24231349,  200.28368148,
        113.43879801,  167.96930624,  116.46827069,  131.61563409,
        127.57633718,  170.99877892,  221.48999024,  100.31108307,
        173.01842737,  128.58616141,  203.31315416,  101.3209073 ,
         97.28161039,  122.52721605,  153.83176707,  149.79247016,
        147.77282171,   67.99670783,  120.5075676 ,  108.38967688,
        193.2149119 ,  112.42897379,   90.21284081,  241.68647476,
        118.48791915,  164.93983356,   87.18336813,  170.99877892,
        122.52721605,  111.41914956,  115.45844647,  209.37209952,
        218.46051756,  108.38967688,  123.53704028,  302.27592835,
        119.49774337,  191.19526344,  123.53704028,  141.71387635,
        120.5075676 ,  130.60580986,   89.20301658,  157.87106397,
        129.59598564,  207.35245107,  185.13631809,  209.37209

## Now we calculate how close the predictions (diabetes_y_pred) are to the known correct values (diabetes_y_test)

In [124]:
# NOTE: accuracy_score is a classification metric; it does not apply to the
# continuous targets predicted by this regression model, so the call is left
# disabled. The regression metrics in the following cell are used instead.
#accuracy_score(diabetes_y_test, diabetes_y_pred)

In [94]:
# The fitted coefficients (a single slope here, since the model was trained on one feature)
print('Coefficients: \n', model.coef_)
# The mean squared error between the known test targets and the predictions
print("Mean squared error: %.2f"
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# r2_score is the coefficient of determination (R^2): 1 is perfect prediction.
# NOTE: the printed label says "Variance score", but the value reported is R^2.
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

Coefficients: 
 [ 936.92049926]
Mean squared error: 3720.09
Variance score: 0.42
