In [27]:
from sklearn import model_selection, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, validation_curve, cross_val_score
from sklearn. linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from get_data import split_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Set random seed
seed = 42

## Import Cleaned Data

In [28]:
X = pd.read_csv('rhs_cleaned_dataset.csv')
y = X.pop('Max Time To Ultimate Height')

In [29]:
# Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=seed)

In [30]:
# Normalise data
scaler = preprocessing.StandardScaler()
X_train_scaler= scaler.fit(X_train)
X_scaled = X_train_scaler.transform(X_train)

## Fitting Linear Model
### Setting the baseline

In [32]:
# Fit model
linear_regression_model = LinearRegression(normalize=True)
linear_regression_model.fit(X_train, y_train)
lin_reg_train_score = linear_regression_model.score(X_train, y_train)
lin_reg_val_score = linear_regression_model.score(X_val, y_val)

# Check scores on train and validation
print(f'Score on the training set is: {lin_reg_train_score}')
print(f'Score on the validation set is: {lin_reg_val_score}')
print(f'Linear regression coefficients are: {linear_regression_model.coef_}')

# Check cross validation score on validation set
lin_reg_cross_val_scores = cross_val_score(linear_regression_model, X_val, y_val, cv=10)
print(f'cross validation scores: {lin_reg_cross_val_scores}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (lin_reg_cross_val_scores.mean(), lin_reg_cross_val_scores.std()))

y_pred = linear_regression_model.predict(X)
print(f'predicted values: {y_pred}')

Score on the training set is: 0.7306164095469609
Score on the validation set is: 0.7341862387967846
Linear regression coefficients are: [ 1.63378209 -0.70512515 -0.39970037  1.52343794  0.19635033  0.22356054
  2.5414895 ]
cross validation scores: [0.66293946 0.67164717 0.72693639 0.83923137 0.77730913 0.74275462
 0.7457816  0.74006211 0.66380927 0.74166417]
0.73 accuracy with a standard deviation of 0.05
predicted values: [13.81170045 45.68126394 46.60992089 ...  8.14212358 19.13650209
  7.42898566]


In [34]:
X.columns

Index(['Full Sun', 'Sheltered', 'Generally pest free', 'Min Ultimate Height',
       'Max Ultimate Height', 'Min Ultimate Spread', 'Max Ultimate Spread'],
      dtype='object')

### Analysis
These resulting coefficients tell us that the weighting of the full exposure of sunlight have a positive correlation to the target variable of predicting maximum height of the plants, whilst the sheltered plants and whether they are generally pest-free have negative weighting to the predictor. 

In [38]:
# Calculate R2 score
print(f'The R2 score for our baseline is: {r2_score(y, y_pred)}')

The R2 score for our baseline is: 0.7335148333840158


In [47]:
from sklearn.tree import DecisionTreeRegressor

# Fit model
regressor = DecisionTreeRegressor(random_state=seed)
regressor.fit(X_train, y_train)
regressor_train_score = regressor.score(X_train, y_train)
regressor_val_score = regressor.score(X_val, y_val)

# Check scores on train and validation
print(f'Score on the training set is: {regressor_train_score}')
print(f'Score on the validation set is: {regressor_val_score}')

# Check cross validation score on validation set
regessor_cross_val_score = cross_val_score(regressor, X_val, y_val, cv=10)
print(f'cross validation scores: {regessor_cross_val_score}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (regessor_cross_val_score.mean(), regessor_cross_val_score.std()))

regressor_y_pred = regressor.predict(X)
print(f'predicted values: {regressor_y_pred}')

Score on the training set is: 0.856941360664495
Score on the validation set is: 0.6442495327992072
cross validation scores: [0.65388424 0.71175702 0.72483935 0.84267534 0.72218291 0.75153061
 0.70319817 0.71756827 0.62448281 0.72833501]
0.72 accuracy with a standard deviation of 0.05
predicted values: [10.  50.  20.  ...  5.  12.5  5. ]


In [54]:
from sklearn.neighbors import KNeighborsRegressor

# Fit model
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, y_train)
neigh_train_score = neigh.score(X_train, y_train)
neigh_val_score = neigh.score(X_val, y_val)

# Check scores on train and validation
print(f'Score on the training set is: {neigh_train_score}')
print(f'Score on the validation set is: {neigh_val_score}')

# Check cross validation score on validation set
neigh_cross_val_scores = cross_val_score(neigh, X_val, y_val, cv=10)
print(f'cross validation scores: {neigh_cross_val_scores}')
print("%0.2f accuracy with a standard deviation of %0.2f" % (neigh_cross_val_scores.mean(), neigh_cross_val_scores.std()))

neigh_y_pred = neigh.predict(X)
print(f'predicted values: {neigh_y_pred}')

Score on the training set is: 0.7943498483312859
Score on the validation set is: 0.637771756420163
cross validation scores: [0.49758661 0.64008145 0.64119027 0.75014302 0.63130527 0.67423374
 0.49115764 0.59007685 0.56526853 0.62790331]
0.61 accuracy with a standard deviation of 0.07
predicted values: [11.  50.  20.  ...  5.  12.5  5. ]
