In [1]:
from sklearn import model_selection, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, validation_curve, cross_val_score
from sklearn. linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from get_data import split_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Set random seed
seed = 42

## Import Cleaned Data

In [None]:
X = pd.read_csv('rhs_cleaned_dataset.csv')
y = X.pop('max ultimate height')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=seed)

In [10]:
X.head()

Unnamed: 0,min ultimate height,min ultimate spread,max ultimate spread,min time to ultimate height,max time to ultimate height
0,1.5,1.5,2.5,10.0,20.0
1,12.0,4.0,8.0,20.0,50.0
2,12.0,4.0,8.0,50.0,50.0
3,12.0,4.0,8.0,20.0,50.0
4,1.0,1.0,1.5,5.0,10.0


In [3]:
# Normalise data
scaler = preprocessing.StandardScaler()
X_train_scaler= scaler.fit(X_train)
X_scaled = X_train_scaler.transform(X_train)

## Fit Linear Model

In [7]:
# Fit model
linear_regression_model = LinearRegression(normalize=True)
linear_regression_model.fit(X_train, y_train)

print(f'Score on the training set is: {linear_regression_model.score(X_train, y_train)}')
print(f'Score on the validation set is: {linear_regression_model.score(X_val, y_val)}')
print(f'Linear regression coefficients are: {linear_regression_model.coef_}')


# Check cross validation score on Validation set
lin_reg_scores = cross_val_score(linear_regression_model, X_train, y_train, cv=10)
print(f'cross validation scores: {lin_reg_scores}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (lin_reg_scores.mean(), lin_reg_scores.std()))

y_pred = linear_regression_model.predict(X)
print(f'predicted values: {y_pred}')

Score on the training set is: 0.9389720331472529
Score on the validation set is: 0.9299984636179803
Linear regression coefficients are: [ 0.84891818 -0.39056198  0.6814394  -0.00662977  0.00492183]
cross validation scores: [0.93211118 0.95823971 0.91782921 0.93000995 0.94281591 0.94680839
 0.9079411  0.94636185 0.91660568 0.9517921 ]
0.94 accuracy with a standard deviation of 0.02
predicted values: [ 2.74070312 14.50721297 14.30831997 ...  2.08831669  4.20514896
  1.23939851]


In [None]:
# Mean squared error score
def mse_score(y_pred, y):
    mse_score = round(mean_squared_error(y, y_pred, squared=True),2)
    print(f'The mean squared error is {mse_score}')
mse_score(y_pred, y)

def calculate_loss(y_pred, y):
    return np.mean((y_pred - y) ** 2)

# Plot predictions with true labels
def plot_predictions(y_pred, y):
    samples = len(y_pred)
    plt.figure(figsize=(8,6))
    plt.scatter(np.arange(samples), y_pred, c='r', label='predictions')
    plt.scatter(np.arange(samples), y, c='b', label='true labels', marker='x')
    plt.legend()
    plt.xlabel('Sample numbers')
    plt.ylabel('Values')
    plt.show()
    
calculate_loss(y_pred, y)
plot_predictions(y_pred, y)

In [None]:
def plot_linear_model(y_pred, y):
    sample_y_pred = y_pred[:60]
    sample_y = y[:60]
    plt.figure(figsize=(8,6))
    plt.plot(sample_y, label='True values')
    plt.plot(sample_y_pred, label='Predicted values')
    plt.xlabel('Number of samples')
    plt.ylabel('Values')
    plt.legend(prop=dict(size=10))
    plt.title('Linear regression: comparison of predicted values with true values')
plot_linear_model(y_pred, y)

In [None]:
from sklearn.linear_model import BayesianRidge

bayesian_model = BayesianRidge(compute_score=True)
bayesian_model.fit(X_train, y_train)
bayesian_y_pred = bayesian_model.predict(X)
print(f'Predicted values: {bayesian_y_pred}')

In [None]:
# Check cross validation score on TRAINING set
bayesian_scores = cross_val_score(bayesian_model, X, y, cv=10)
print('Bayesian regression cross validation scores:', bayesian_scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (bayesian_scores.mean(), bayesian_scores.std()))
mse_score(bayesian_y_pred, y)

In [None]:
def plot_bayesian_model(y_pred, y): 
    plt.figure(figsize=(8,6))
    plt.plot(y[:60], label='True values')
    plt.plot(y_pred[:60], label='Predicted values')
    plt.xlabel('Number of samples')
    plt.ylabel('Values')
    plt.legend(prop=dict(size=10))
    plt.title('Bayesian ridge regression: comparison of predicted values with true values')
    plt.show()
plot_bayesian_model(bayesian_y_pred, y)


In [None]:
def plot_weights(bayesian_model_weights, linear_regression_model_weights):
    lw = 2
    plt.figure(figsize=(6, 5))
    plt.title("Weights of the model")
    plt.plot(bayesian_model.coef_, color='lightgreen', linewidth=lw, label='Bayesian Ridge estimate')
    plt.plot(linear_regression_model.coef_, color='navy', linestyle='--', label='OLS estimate')
    plt.xlabel('Features')
    plt.ylabel('Value of the weights')
    plt.legend(loc='best', prop=dict(size=12))
    plt.show()
plot_weights(bayesian_model.coef_, linear_regression_model.coef_)

In [None]:
# Compare scores
def compare_regression_scores(bayesian_scores, lin_reg_scores):
    plt.figure(figsize=(8,6))
    plt.title('Compare scores')
    plt.plot(bayesian_scores, label='Bayesian ridge regression')
    plt.plot(lin_reg_scores, label='Linear regression')
    plt.xlabel('Number of scores')
    plt.ylabel('Score')
    plt.legend(loc='best', prop=dict(size=10))
    plt.show()
compare_regression_scores(bayesian_scores, lin_reg_scores)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Apply polynomial degree of 2 features to Linear Regression mode
def PolynomialRegression(degree=2, **kwargs):
    polynomial_features = PolynomialFeatures(degree=2)
    X_poly = polynomial_features.fit_transform(X)
    linear_regression_model.fit(X_poly, y)
    y_poly_pred = linear_regression_model.predict(X_poly)
    mse_score(y_poly_pred, y)
    return y_poly_pred, make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))

PolynomialRegression(X)
polynomial_regression_scores = cross_val_score(linear_regression_model, X, y, cv=10)
print(f'Polynomial regression scores: {polynomial_regression_scores}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (polynomial_regression_scores.mean(), polynomial_regression_scores.std()))

plt.figure(figsize=(8,6))
plt.plot(y[:60], label='True values')
plt.plot(y_poly_pred[:60], label='Predicted values')
plt.xlabel('Number of samples')
plt.ylabel('Values')
plt.legend(prop=dict(size=10))
plt.title('Polynomial regression: comparison of predicted values with true values')
plt.show()

In [None]:
# Compare scores between 3 different model outputs
def compare_regression_scores(polynomial_regression_scores, bayesian_scores, lin_reg_scores):
    plt.figure(figsize=(8,6))
    plt.title('Compare scores')
    plt.plot(polynomial_regression_scores, label='Polynomial regression')
    plt.plot(bayesian_scores, label='Bayesian ridge regression')
    plt.plot(lin_reg_scores, label='Linear regression')
    plt.xlabel('Number of scores')
    plt.ylabel('Score')
    plt.legend(loc='best', prop=dict(size=10))
    plt.show()
compare_regression_scores(polynomial_regression_scores, bayesian_scores, lin_reg_scores)

In [None]:
# Check cross validation score on VALIDATION set
lin_reg_test_scores = cross_val_score(linear_regression_model, X_val, y_val, cv=10)
param_range = np.logspace(-7, 3, 3)
train_scores, validation_scores = validation_curve(BayesianRidge(), X, y, param_name='alpha_1', param_range=param_range, cv=10)
indexed_train_scores = train_scores[1]

train_scores_mean = np.mean(indexed_train_scores)
train_scores_std = np.std(indexed_train_scores)
test_scores_mean = np.mean(lin_reg_test_scores)
test_scores_std = np.std(lin_reg_test_scores)
print(f'Train scores mean: {train_scores_mean}. Train scores std: {train_scores_std}. Test scores mean: {test_scores_mean}. Test scores std: {test_scores_std}.')
plt.title("Validation Curve with Linear Regression")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")

In [None]:
from sklearn.model_selection import GridSearchCV

param = {
    'polynomialfeatures__degree': np.arange(10), 'linearregression__fit_intercept': [True, False], 'linearregression__normalize': [True, False]
}

bayesian_scores = cross_val_score(s(), X, y, cv=10)
# poly_grid = GridSearchCV(bayesian_model, param)