# Import software libraries and load the dataset #

In [None]:
import sys                                             # Read system parameters.
import numpy as np                                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                                    # Manipulate and analyze data.
import matplotlib as mpl                               # Create 2D charts.
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sb                                   # Perform data visualization.
import sklearn                                         # Perform data mining and analysis.
from sklearn import datasets

# Report the interpreter and library versions in use.
print('Libraries used in this project:')
print(f'- Python {sys.version}')
print(f'- NumPy {np.__version__}')
print(f'- pandas {pd.__version__}')
print(f'- Matplotlib {mpl.__version__}')
print(f'- Seaborn {sb.__version__}')
print(f'- scikit-learn {sklearn.__version__}\n')

# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston() was deprecated in scikit-learn 1.0 and removed
# in 1.2 -- confirm the environment pins a release that still provides it.
boston = datasets.load_boston()
print(f'Loaded {len(boston.data)} records.')

# Get acquainted with the dataset

In [None]:
# Convert the feature array to a pandas DataFrame, naming the columns after
# the dataset's feature names, then append the labels as a 'target' column.
data_raw = pd.DataFrame(boston['data'], columns = boston['feature_names'])
data_raw['target'] = boston['target']

# info() writes its report straight to stdout and returns None, so it is
# called bare: wrapping it in print() would add a stray "None" line.
data_raw.info()             # View data types and see if there are missing entries.
data_raw.head(10)           # View first 10 records.

# Examine the distribution of various features

In [None]:
# Use Matplotlib to plot figures.
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

data_raw.hist(figsize=(20,15));
plt.figure();

# Examine a general summary of statistics

In [None]:
# Print summary statistics with floats temporarily shown to two decimals.
with pd.option_context('display.float_format', '{:.2f}'.format):
    print(data_raw.describe())

# Look for columns that correlate with `target` (median house value)

In [None]:
# Correlation of every column with 'target', listed from highest to lowest.
print('Correlations with median house value')
print(data_raw.corr().loc[:, 'target'].sort_values(ascending = False))

# Split the label from the dataset

In [None]:
# 'target' is the value to be predicted, so it is kept apart from the
# features: X takes the columns from 'CRIM' through 'LSTAT', and y takes the
# label column as a one-column DataFrame.
label_columns = ['target']

X = data_raw.loc[:, 'CRIM':'LSTAT']
y = data_raw[label_columns]

# Print the shapes so the original data can be compared with the two new sets.
print('Original set:    {}'.format(data_raw.shape))
print('------------------------------')
print('Training data:   {}'.format(X.shape))
print('Training labels: {}'.format(y.shape))

# Drop columns that won't be used for training

In [None]:
# Drop columns from the dataset that show weak correlation.
def drop_unused(dataset, columns=('CHAS',)):
    """Return a copy of ``dataset`` without the given columns.

    The column names before and after the drop are printed. The DataFrame
    passed in is left unmodified.

    Args:
        dataset: pandas DataFrame to remove columns from.
        columns: Label, or iterable of labels, of the columns to drop.
            Defaults to ``('CHAS',)``, the column this function previously
            hard-coded, so existing one-argument calls behave as before.

    Returns:
        A new DataFrame without ``columns``.

    Raises:
        KeyError: If a requested label is not a column of ``dataset``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(columns, str):
        columns = [columns]

    print('Columns before drop:\n\n{}\n'.format(list(dataset.columns)))

    reduced = dataset.drop(columns=list(columns))

    print('Columns after drop:\n\n{}\n'.format(list(reduced.columns)))
    return reduced

# Rebind X to the reduced feature set; drop_unused() is handed a copy of X.
X = drop_unused(X.copy())

# Standardize the features

In [None]:
def standardize(X):
    """Return a z-scored copy of the DataFrame ``X``.

    Each column is centred on its mean and divided by its standard deviation
    (pandas' default ``std()``, i.e. the sample standard deviation). The
    input DataFrame is not modified.

    Args:
        X: pandas DataFrame of numeric feature columns.

    Returns:
        A new DataFrame with the same index and columns as ``X`` holding the
        standardized values. A column whose standard deviation is exactly 0
        comes back as all zeros rather than NaN.
    """
    means = X.mean()
    spreads = X.std()

    # A constant column has a standard deviation of 0, and 0 / 0 would turn
    # the entire column into NaN. Dividing by 1 instead leaves the centred
    # values (all 0) in place.
    spreads = spreads.where(spreads != 0, 1.0)

    return (X - means) / spreads  # z-score formula, applied column-wise.

# Replace the feature values in X with their standardized versions.
X = standardize(X)

print('The features have been standardized.')

In [None]:
# Print summary statistics of the standardized features, two decimals each.
with pd.option_context('display.float_format', '{:.2f}'.format):
    print(X.describe())

# Train a model and calculate its scores

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error as mse

# Score a model with 5-fold cross-validation on the notebook-level X and y.
def model_train(model):
    """Cross-validate ``model`` on the global ``X``/``y`` and print its scores.

    Two figures are printed: the mean of the per-fold scores returned by
    ``cross_val_score`` (as a rounded percentage), and the mean squared error
    between ``y`` and the predictions returned by ``cross_val_predict``.
    Nothing is returned.

    Args:
        model: Estimator to hand to the scikit-learn cross-validation helpers.
    """
    labels = np.ravel(y)  # Flatten the one-column label frame to a 1-D array.

    predict = cross_val_predict(model, X, labels, cv = 5)
    score = cross_val_score(model, X, labels, cv = 5).mean()
    cost = mse(y, predict)

    print(f'Mean variance score on test set: {np.round(score * 100):.0f}%')
    print(f'Cost (mean squared error): {cost:.2f}')

print('The function to train the model has been defined.')

# Evaluate several regularized linear regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# Compare plain linear regression with its regularized variants.
def model_eval(a, l1):
    """Run ``model_train`` on four linear models and print a report for each.

    Args:
        a: Value passed as ``alpha`` to Ridge, Lasso and ElasticNet.
        l1: Value passed as ``l1_ratio`` to ElasticNet.
    """
    candidates = {
        'None': LinearRegression(),
        'Ridge': Ridge(alpha = a, solver = 'cholesky'),
        'Lasso': Lasso(alpha = a),
        'Elastic net': ElasticNet(alpha = a, l1_ratio = l1),
    }

    for name, model in candidates.items():
        print(f'Regularization: {name}')
        print('--------------------')
        model_train(model)
        print('\n')

print('The function to evaluate the linear regression models has been defined.')

In [None]:
# alpha = 1 for the regularized models; l1_ratio = 0.5 for the elastic net.
model_eval(a = 1, l1 = 0.5)

In [None]:
# alpha = 0.1 for the regularized models; l1_ratio = 0.3 for the elastic net.
model_eval(a = 0.1, l1 = 0.3)

# Plot lines of best fit for the `RM` (average number of rooms) feature

In [None]:
# Predictions from 5-fold cross-validation with a non-regularized linear model.
lin_reg = LinearRegression()
predict_no_reg = cross_val_predict(lin_reg, X, np.ravel(y), cv = 5)

# The same, using an elastic net model.
lin_reg_enet = ElasticNet(alpha = 0.1, l1_ratio = 0.3)
predict_enet = cross_val_predict(lin_reg_enet, X, np.ravel(y), cv = 5)

# Fit-line colours: red for the non-regularized model, black for elastic net.
line_color_1 = {'color': 'red'}
line_color_2 = {'color': 'black'}

fig, ax = plt.subplots(1, 1, figsize = (8, 5))

# x and y are passed by keyword because seaborn 0.12+ only accepts them that
# way (positional use raises a TypeError); keywords also work on older
# releases. Both plots are drawn on the axes created above.
sb.regplot(x = X['RM'], y = np.ravel(predict_no_reg), line_kws = line_color_1, ax = ax)
sb.regplot(x = X['RM'], y = np.ravel(predict_enet), line_kws = line_color_2, ax = ax)
plt.ylabel('Price')

# Compare predicted values to actual values

In [None]:
# Put the actual values next to both sets of cross-validated predictions.
predict_df = y.assign(**{
    'PredictedMEDV-NoReg': predict_no_reg,
    'PredictedMEDV-Enet': predict_enet,
})

N = 5  # Plot every Nth value to save time and space
predict_df = predict_df.sort_values(by = 'target').iloc[::N]

# Residual and scaled relative error of the non-regularized model; 'recnum'
# numbers the kept rows 0..len-1 and serves as the x coordinate below.
predict_df['diff-noreg'] = predict_df['target'].sub(predict_df['PredictedMEDV-NoReg'])
predict_df['recnum'] = np.arange(predict_df.shape[0])
predict_df['error_pct-noreg'] = predict_df['diff-noreg'].div(predict_df['target']).abs().mul(150)

# The same two quantities for the elastic net model.
predict_df['diff-enet'] = predict_df['target'].sub(predict_df['PredictedMEDV-Enet'])
predict_df['error_pct-enet'] = predict_df['diff-enet'].div(predict_df['target']).abs().mul(150)

# NOTE: plt.figure() returns a Figure, so `ax` holds a Figure here and the
# legend call further down is the figure-level legend.
ax = plt.figure(figsize = (18, 10))
plt.xlabel('House')
plt.ylabel('Median House Value')

# Actual values as a line; each model's predictions as points whose marker
# size comes from that model's 'error_pct-*' column.
plt.plot(predict_df['recnum'], predict_df['target'], color = 'blue');
plt.scatter(predict_df['recnum'],
            predict_df['PredictedMEDV-NoReg'],
            s = predict_df['error_pct-noreg'],
            color = 'red');
plt.scatter(predict_df['recnum'],
            predict_df['PredictedMEDV-Enet'],
            s = predict_df['error_pct-enet'],
            color = 'green');

legend_labels = ['Actual', 'Predicted (NoReg)', 'Predicted (Enet)']
ax.legend(legend_labels,
          loc = 'lower center',
          ncol = 3,
          title = 'Median house value predicted using linear regression')

plt.show()