#### Basic linear regression practice notebook on the California housing dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Seed NumPy's legacy global RNG once, up front, so that anything drawing from
# np.random later in the notebook is repeatable from a fresh kernel.
RANDOM_SEED = 306
np.random.seed(RANDOM_SEED)

# Optional plot styling, intentionally left disabled. Matplotlib 3.6 deprecated
# the bare 'seaborn' style name in favour of 'seaborn-v0_8' -- check the
# installed Matplotlib version before re-enabling this line.
# plt.style.use('seaborn')

In [3]:
# Cross-validation strategy reused further down: ten random partitions, each
# holding out 20% of the rows, with a fixed random_state so they are repeatable.
shuffle_split_cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=0,
)


### Loading the dataset

In [4]:
# return_X_y=True yields a (data, target) pair instead of a Bunch, and
# as_frame=True returns them as pandas objects rather than NumPy arrays.
features, labels = fetch_california_housing(
    return_X_y=True,
    as_frame=True,
)

In [5]:
# Report the dimensions of the feature table and of the target column.
feature_shape, label_shape = features.shape, labels.shape
print('shape of the features is:', feature_shape)
print('shape of output is:', label_shape)

shape of the features is: (20640, 8)
shape of output is: (20640,)


In [6]:
# Sanity check: every feature row must have exactly one label. The message
# reports both counts so a failure is diagnosable without re-running anything.
assert features.shape[0] == labels.shape[0], (
    f"row count mismatch: {features.shape[0]} feature rows "
    f"vs {labels.shape[0]} labels"
)

In [7]:
# Hold out a test split for the final evaluation. test_size=0.25 is
# scikit-learn's default, written out here so the 75/25 ratio is visible;
# random_state pins the shuffle so the split is reproducible.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Show the (rows, columns) shape of each feature split.
train_shape = train_features.shape
test_shape = test_features.shape
print('# training samples is:', train_shape)
print('# test samples is:', test_shape)

# training samples is: (15480, 8)
# test samples is: (5160, 8)


Sanity check: the train and test splits must each contain as many labels as feature rows.

In [9]:
# Each split must pair every feature row with exactly one label.
assert len(train_features) == len(train_labels)
assert len(test_features) == len(test_labels)

### Preprocessing and model pipeline
* STEP 1: `StandardScaler` (feature scaling)
* STEP 2: `LinearRegression` (model fitting)

In [10]:
# Two-stage model: standardise the features, then fit a linear regression.
# Because the scaler lives inside the pipeline, it is fitted only on whatever
# data is passed to fit() -- here, the training split.
lin_reg_pipeline = Pipeline(
    steps=[
        ('features_scaling', StandardScaler()),
        ('lin_regressor', LinearRegression()),
    ]
)
lin_reg_pipeline.fit(train_features, train_labels)

### Checking the estimated weight vector

In [11]:
# The fitted LinearRegression is the pipeline's final step; fetch it by its
# step name and show the learned intercept and per-feature coefficients.
print('intercept (w_0)', lin_reg_pipeline.named_steps['lin_regressor'].intercept_)
print('weight vectors:', lin_reg_pipeline.named_steps['lin_regressor'].coef_)

intercept (w_0) 2.0703489205426377
weight vectors: [ 0.85210815  0.12065533 -0.30210555  0.34860575 -0.00164465 -0.04116356
 -0.89314697 -0.86784046]


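### STEP 4: Model evaluation

The pipeline's `score` method reports the coefficient of determination ($R^2$) of the final `LinearRegression` step, so higher is better.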

In [12]:
# Evaluate the fitted pipeline on both splits first, then report them.
# A training score well above the test score would suggest overfitting.
test_score = lin_reg_pipeline.score(test_features, test_labels)
train_score = lin_reg_pipeline.score(train_features, train_labels)

print('model performance on test set:', test_score)
print('model performance in train set:', train_score)

model performance on test set: 0.5910509795491354
model performance in train set: 0.609873031052925


In [13]:
# Cross-validate the pipeline on the training split only, using the
# shuffle_split_cv partitions. With scoring='neg_mean_squared_error'
# scikit-learn reports MSE with its sign flipped (so that greater is always
# better), which is why every value is negative.
lin_reg_score = cross_val_score(
    lin_reg_pipeline,
    train_features,
    train_labels,
    scoring='neg_mean_squared_error',
    cv=shuffle_split_cv,
)
print(lin_reg_score)

# Mean and spread over the validation partitions. These are not test-set
# scores: the held-out test split is never passed to cross_val_score.
print("\nCross-validated negative MSE of the linear regression model (training split):\n",
      f"{lin_reg_score.mean():.3f} +/- {lin_reg_score.std():.3f}")


[-0.50009976 -0.52183352 -0.55931218 -0.52110499 -0.56059203 -0.50510767
 -0.52386194 -0.54775518 -0.5007161  -0.54713448]

Score of linear regression model on the test set:
 -0.529 +/- 0.022


### Other values accepted by the `scoring` parameter for regression:
* explained_variance
* max_error
* neg_mean_absolute_error
* neg_root_mean_squared_error
* neg_mean_squared_log_error
* neg_median_absolute_error
* neg_mean_absolute_percentage_error
* r2

In [None]:
def plot_learning_curve(train_size, train_scores, test_scores, fit_times, score_times):

  train_score_mean = np.mean(-train_scores, axis=1)
  train_scores_std = np.std(-train_scores, axis=1)
  test_scores_mean = np.mean(-test_scores, axis=1)
  test_scores_std = np.std(-test_scores, axis=1)
  fit_times_mean = np.mean(fit_times, axis=1)
  fit_times_std = np.std(fit_times, axis=1)

  plt.fill_between(
    train_size,
    train_score_mean - train_scores_std,
    train_score_mean + train_scores_std,
    alpha= 0.1,
    color= 'r' ,
  )
  plt.fill_between(
    train_size,
    test_scores_mean - test_scores_std,
    test_scores_mean + test_scores_std,
    alpha= 0.1,
    color='g',
  )
  plt.plot(train_size, train_score_mean, 
           'o-')
