# Import software libraries and load the dataset #

In [None]:
import sys                                             # Read system parameters.
import numpy as np                                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                                    # Manipulate and analyze data.
import sklearn                                         # Perform data mining and analysis.
from sklearn import datasets
from time import time                                  # Calculate training time.

# Report the versions of the libraries this project relies on.
print('Libraries used in this project:')
print(f'- Python {sys.version}')
print(f'- NumPy {np.__version__}')
print(f'- pandas {pd.__version__}')
print(f'- scikit-learn {sklearn.__version__}\n')

# Load the dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm the
# pinned scikit-learn version still provides it.
boston = datasets.load_boston()
record_count = boston.data.shape[0]
print(f'Loaded {record_count} records.')

# Build a pandas DataFrame from the feature array and append the target
# values as an extra 'target' column.
data_raw = pd.DataFrame(boston.data, columns=boston.feature_names).assign(target=boston.target)

# Split the dataset into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# 'target' is the dependent variable (the value to be predicted), so it is
# kept out of the feature data and placed in its own DataFrame of labels.
label_columns = ['target']

# Features are every column from 'CRIM' through 'LSTAT' (inclusive).
feature_frame = data_raw.loc[:, 'CRIM':'LSTAT']
label_frame = data_raw[label_columns]

# Split features and labels into training and test sets; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(feature_frame,
                                                    label_frame,
                                                    random_state=2)

# Show the shape of the original data next to the shapes of the splits.
print(f'Original set:        {data_raw.shape}')
print('------------------------------')
print(f'Training features:   {X_train.shape}')
print(f'Test features:       {X_test.shape}')
print(f'Training labels:     {y_train.shape}')
print(f'Test labels:         {y_test.shape}')

# Drop columns that won't be used for training

In [None]:
# Drop columns from the dataset that show weak correlation.
def drop_unused(dataset, columns=('CHAS',)):
    """Return ``dataset`` without the given columns, printing the column
    names before and after the drop.

    Args:
        dataset: pandas DataFrame to drop columns from. It is not modified;
            ``DataFrame.drop`` returns a new frame.
        columns: Names of the columns to remove. Defaults to ``('CHAS',)``,
            the single column this function originally dropped.

    Returns:
        A new DataFrame without the requested columns.

    Raises:
        KeyError: If any requested column is not present in ``dataset``.
    """
    print(f'Columns before drop:\n\n{list(dataset.columns)}\n')

    dataset = dataset.drop(columns=list(columns))

    print(f'Columns after drop:\n\n{list(dataset.columns)}\n')
    return dataset

# Remove the unused column from both splits: training first, then test.
X_train = drop_unused(X_train.copy())
X_test = drop_unused(X_test.copy())

# Standardize the features

In [None]:
def standardize(X):
    """Return a z-score standardized copy of ``X``.

    Each column is shifted by its own mean and divided by its own sample
    standard deviation (pandas' default, ``ddof=1``).

    Args:
        X: pandas DataFrame of numeric features. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``X``.
    """
    center = X.mean()
    spread = X.std()

    # A constant column has zero standard deviation; dividing by it would
    # turn the whole column into NaN. Dividing by 1 instead leaves such a
    # column at 0 after centering.
    spread = spread.replace(0, 1)

    return (X - center) / spread  # z-score formula.

# Capture the scaling statistics from the training features BEFORE they are
# overwritten. The test set must be scaled with these same values: scaling
# it with its own mean/std would put the two sets on different scales and
# leak test-set information into the evaluation.
train_mean = X_train.mean()
train_std = X_train.std()

X_train = standardize(X_train)
X_test = (X_test - train_mean) / train_std  # z-score using training statistics.

print('The features have been standardized.')

# Train a model and calculate its cost

In [None]:
from sklearn.metrics import mean_squared_error as mse

def model_train(model):
    """Fit ``model`` on the training split, then print its fit time and test cost.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The fit time is printed in milliseconds and the cost is the
    mean squared error of the test-set predictions.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; wall-clock time() can be too coarse for a fit that
    # takes only a few milliseconds.
    from time import perf_counter

    start = perf_counter()
    model.fit(X_train, np.ravel(y_train))  # Flatten the one-column label frame to 1-D.
    train_time = (perf_counter() - start) * 1000  # Seconds to milliseconds.

    predict = model.predict(X_test)

    cost = mse(y_test, predict)

    print('Linear regression model took {:.2f} milliseconds to fit.'.format(train_time))
    print('Cost (mean squared error): {:.2f}'.format(cost))

print('The function to train the model and calculate its cost has been defined.')

# Evaluate linear regression models using both closed-form and iterative solutions

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

# Build a closed-form and an iterative ridge regression model, then train and score each.
def model_eval(eta):
    """Train and report on two ridge regression models.

    Args:
        eta: Constant learning rate passed to the SGD regressor as ``eta0``.
    """
    closed_form = Ridge(alpha=0.1, solver='cholesky')
    iterative = SGDRegressor(penalty='l2',
                             alpha=0.1,
                             tol=1e-3,
                             learning_rate='constant',
                             eta0=eta,
                             random_state=2)

    # Dict insertion order keeps the closed-form model first.
    candidates = {
        'Ridge regression (closed form)': closed_form,
        'Ridge regression (gradient descent)': iterative,
    }

    for label, estimator in candidates.items():
        print(f'Model: {label}')
        print('--------------------')
        model_train(estimator)
        print('\n')

print('The function to evaluate the linear regression models has been defined.')

In [None]:
# Evaluate both models with the SGD regressor's constant learning rate (eta0) set to 0.09.
model_eval(0.09)

In [None]:
# Evaluate both models with the SGD regressor's constant learning rate (eta0) set to 0.08.
model_eval(0.08)

In [None]:
# Evaluate both models with the SGD regressor's constant learning rate (eta0) set to 0.05.
model_eval(0.05)

In [None]:
# Evaluate both models with the SGD regressor's constant learning rate (eta0) set to 0.01.
model_eval(0.01)