In [1]:
from sklearn import model_selection, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, validation_curve, cross_val_score
from sklearn. linear_model import LinearRegression
from sklearn.metrics import r2_score
from get_data import split_data
import numpy as np
import pandas as pd

# Set random seed
seed = 42

## Import Cleaned Data

In [2]:
X = pd.read_csv('rhs_cleaned_dataset.csv')
y = X.pop('Max Time To Ultimate Height')
X = X[['Full Sun', 'Sheltered', 'Generally pest free']]

In [3]:
# Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=seed)

In [4]:
# Normalise data
scaler = preprocessing.StandardScaler()
X_train_scaler= scaler.fit(X_train)
X_scaled = X_train_scaler.transform(X_train)

## Fitting Linear Model
### Setting the baseline

In [5]:
# Fit model
linear_regression_model = LinearRegression(normalize=True)
linear_regression_model.fit(X_train, y_train)
lin_reg_train_score = linear_regression_model.score(X_train, y_train)
lin_reg_val_score = linear_regression_model.score(X_val, y_val)

# Check scores on train and validation
print(f'Score on the training set is: {lin_reg_train_score}')
print(f'Score on the validation set is: {lin_reg_val_score}')
print(f'Linear regression coefficients are: {linear_regression_model.coef_}')

# Check cross validation score on validation set
lin_reg_cross_val_scores = cross_val_score(linear_regression_model, X_val, y_val, cv=5)
print(f'cross validation scores: {lin_reg_cross_val_scores}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (lin_reg_cross_val_scores.mean(), lin_reg_cross_val_scores.std()))

y_pred = linear_regression_model.predict(X)
print(f'predicted values: {y_pred}')

Score on the training set is: 0.020828962483377178
Score on the validation set is: -0.029735847724890352
Linear regression coefficients are: [ 3.19076528 -1.9370705  -3.25195479]
cross validation scores: [-0.00668128 -0.0082786  -0.00301021 -0.00376647 -0.01441289]
-0.01 accuracy with a standard deviation of 0.00
predicted values: [12.7042661  14.7025261  15.95622089 ...  9.51350082 14.7025261
 14.7025261 ]


In [6]:
X.columns

Index(['Full Sun', 'Sheltered', 'Generally pest free'], dtype='object')

### Analysis
These resulting coefficients tell us that the weighting of the full exposure of sunlight have a positive correlation to the target variable of predicting maximum height of the plants, whilst the sheltered plants and whether they are generally pest-free have negative weighting to the predictor. 

In [7]:
# Calculate R2 score
print(f'The R2 score for our baseline is: {r2_score(y, y_pred)}')

The R2 score for our baseline is: -0.01878579531821356


In [8]:
from sklearn.tree import DecisionTreeRegressor

# Fit model
regressor = DecisionTreeRegressor(random_state=seed)
regressor.fit(X_train, y_train)
regressor_train_score = regressor.score(X_train, y_train)
regressor_val_score = regressor.score(X_val, y_val)

# Check scores on train and validation
print(f'Score on the training set is: {regressor_train_score}')
print(f'Score on the validation set is: {regressor_val_score}')

# Check cross validation score on validation set
regessor_cross_val_score = cross_val_score(regressor, X_val, y_val, cv=5)
print(f'cross validation scores: {regessor_cross_val_score}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (regessor_cross_val_score.mean(), regessor_cross_val_score.std()))

# Predict
regressor_y_pred = regressor.predict(X)
print(f'predicted values: {regressor_y_pred}')

Score on the training set is: 0.028705013037113702
Score on the validation set is: -0.020742472662614864
cross validation scores: [ 6.39066992e-05 -1.63624536e-02  1.86080767e-02  2.98665014e-02
 -2.85383412e-02]
0.00 accuracy with a standard deviation of 0.02
predicted values: [13.04166667 14.5625     14.78571429 ... 13.20833333 14.5625
 14.5625    ]


In [9]:
from sklearn.neighbors import KNeighborsRegressor

# Fit model
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, y_train)
neigh_train_score = neigh.score(X_train, y_train)
neigh_val_score = neigh.score(X_val, y_val)

# Check scores on train and validation
print(f'Score on the training set is: {neigh_train_score}')
print(f'Score on the validation set is: {neigh_val_score}')

# Check cross validation score on validation set
neigh_cross_val_scores = cross_val_score(neigh, X_val, y_val, cv=5)
print(f'cross validation scores: {neigh_cross_val_scores}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (neigh_cross_val_scores.mean(), neigh_cross_val_scores.std()))

# Predict
neigh_y_pred = neigh.predict(X)
print(f'predicted values: {neigh_y_pred}')

Score on the training set is: -0.4969942145889821
Score on the validation set is: -0.5089283546522478
cross validation scores: [-0.17651631 -0.15580016 -0.62018555 -0.34942463 -0.18788172]
-0.30 accuracy with a standard deviation of 0.18
predicted values: [15.   5.   5.  ... 27.5  5.   5. ]


### Hyperparameter search

In [30]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define hyperparameters
param_grid = {
    'alpha': np.array([1,0.1,0.01,0.001,0.0001,0]),
}

# Fit grid search
ridge_reg = GridSearchCV(ridge_model, param_grid, scoring='neg_mean_squared_error', cv=5)
ridge_reg.fit(X_train, y_train)

# Best estimator
print(f'Best estimator: {ridge_reg.best_estimator_}')

# Best model
best_model = ridge_reg.best_estimator_
best_model.fit(X_train, y_train)

best_model_train = best_model.score(X_train, y_train)
best_model_val = best_model.score(X_val, y_val)

# Check scores on train and validation
print(f'Score on the training set is: {best_model_train}')
print(f'Score on the validation set is: {best_model_val}')

# Check cross validation score on validation set
best_model_scores = cross_val_score(best_model, X_val, y_val, cv=5)
print(f'cross validation scores: {best_model_scores}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (best_model_scores.mean(), best_model_scores.std()))

# Predict
best_model_y_pred = best_model.predict(X)
print(f'predicted values: {best_model_y_pred}')

Best estimator: Ridge()
Score on the training set is: 0.020826324790739204
Score on the validation set is: -0.02917097006685543
cross validation scores: [-0.00665355 -0.00827174 -0.00300594 -0.0037527  -0.0143959 ]
-0.01 accuracy with a standard deviation of 0.00
predicted values: [12.72923546 14.69124112 15.93846354 ...  9.56834034 14.69124112
 14.69124112]
