# Create model to predict Active Power for Wind Turbines at Windy Hill Wind Turbine farm

In [1]:
from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core import Run
import os 

In [2]:
# get workspace configuration
from azureml.core import Workspace
ws = Workspace.from_config()

In [None]:
ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep='\n')

In [4]:
# get experiemnt
from azureml.core import Experiment
experiment = Experiment(workspace=ws, name="turbines")

In [None]:
print(ws.datasets)

In [None]:
ws.datasets.get('turbines')

In [7]:
# load dataset
from azureml.core import Dataset
datastore = ws.get_default_datastore()
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'UI/03-29-2020_090748_UTC/turbines_filtered.csv')])
df_turbines = dataset.to_pandas_dataframe()

In [8]:
# verify
df_turbines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20831 entries, 0 to 20830
Data columns (total 11 columns):
Timestamp           20831 non-null datetime64[ns]
Turbine             20831 non-null object
Blade1 Actual       20831 non-null float64
Blade2 Actual       20831 non-null float64
Blade3 Actual       20831 non-null float64
Rotor Speed         20831 non-null float64
State               20831 non-null int64
Active Power        20831 non-null float64
Nacelle Position    20831 non-null float64
Air Temperature     20831 non-null float64
Wind Speed          20831 non-null float64
dtypes: datetime64[ns](1), float64(8), int64(1), object(1)
memory usage: 1.7+ MB


In [9]:
#Prepare the training & testing/scoring data sets, and split them randomly
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
#define the target variable to be predicted
y = df_turbines['Active Power'].values

# second part of this is done below in a loop to allow multiple models to be tested with different features sets

In [10]:
# create function to create a polynomial regression model based upon recommendations from various scientific papers including: https://www.hindawi.com/journals/jen/2016/8519785
# function below care of Animesh Agarwal:
# https://towardsdatascience.com/polynomial-regression-bbe8b9d97491
# https://github.com/animesh-agarwal/Machine-Learning-Datasets/blob/master/boston-housing/Polynomial_Regression.ipynb

def create_polynomial_regression_model(degree):
  "Creates a polynomial regression model for the given degree"
  poly_features = PolynomialFeatures(degree=degree)
  
  # transform the features to higher degree features.
  X_train_poly = poly_features.fit_transform(X_train)
  
  # fit the transformed features to Linear Regression
  poly_model = LinearRegression()
  poly_model.fit(X_train_poly, y_train)
  
  # predicting on training data-set
  y_train_predicted = poly_model.predict(X_train_poly)
  
  # predicting on test data-set
  y_test_predict = poly_model.predict(poly_features.fit_transform(X_test))
  
  # evaluating the model on training dataset
  rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
  r2_train = r2_score(y_train, y_train_predicted)
  
  # evaluating the model on test dataset
  rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))
  r2_test = r2_score(y_test, y_test_predict)

  return {'model':poly_model,'rmse_train':rmse_train,'r2_train':r2_train,'rmse_test':rmse_test,'r2_test':r2_test}

In [11]:
# Run a set of models and log information to the azure ml studio

import joblib # this is required, because there is also a joblib in sklearn

logging_azure = False # to log or not to log, for testing
debug = False # output more information

turbine_models = [] # results

# names for each feature set we want to test
features = ["Wind","Wind_Air"]
# features for each of those feature sets
feature_set = (['Wind Speed'],['Air Temperature','Wind Speed'])

# run for each set of features
for feature in range(0,len(features)):
    # split data based upon featues for this run
    X_train, X_test, y_train, y_test = train_test_split(df_turbines[feature_set[feature]].values, 
                                                         y, test_size=0.25, random_state=42)
    # for each degree in relation to polynomial regression
    for degree in (2,3,4):
        if logging_azure:
            run = experiment.start_logging()
            run.log("features", features[feature])
            run.log("degree",degree)
        # run the model and store the output
        turbine_models.append(create_polynomial_regression_model(degree))
        if logging_azure:
            # log model errors
            for error in 'rmse_train','r2_train','rmse_test','r2_test':
                if debug:
                    print(f'{error},{turbine_models[-1][error]}')
                run.log(f'{error}',f'{turbine_models[-1][error]}')
        if debug:
            print(f"""{turbine_models[-1]['rmse_train']}
                      {turbine_models[-1]['r2_train']}
                      {turbine_models[-1]['rmse_test']}
                      {turbine_models[-1]['r2_test']}
                      """)
        if logging_azure:
            # save the model to azure ml
            model_name = f'{features[feature]}_{degree}.pkl'
            filename = f'outputs/{model_name}'
            joblib.dump(value=turbine_models[-1]['model'], filename=filename)
            run.upload_file(name=model_name, path_or_stream=filename)
            run.complete()

In [None]:
# get a link to the azure ml portal for this experiement
experiment

In [13]:
# run review metrics

minimum_rmse_runid = None
minimum_rmse = None

for run in experiment.get_runs():
    run_metrics = run.get_metrics()
    run_details = run.get_details()
    # each logged metric becomes a key in this returned dict
    try:
        run_rmse = run_metrics["rmse_test"]
        run_id = run_details["runId"]

        if minimum_rmse is None:
            minimum_rmse = run_rmse
            minimum_rmse_runid = run_id
        else:
            if run_rmse < minimum_rmse:
                minimum_rmse = run_rmse
                minimum_rmse_runid = run_id
    except:
        # most likely here if a run does not have a metric: rmse
        pass
print("Best run_id: " + minimum_rmse_runid)
print("Best run_id rmse: " + str(minimum_rmse))  

Best run_id: fd3e84ae-7c00-4b8b-bac4-6539cac430a3
Best run_id rmse: 128.5577883507858


## save and register model

In [14]:
# identify model

best_run = Run(experiment=experiment, run_id=minimum_rmse_runid)
model_file_name = best_run.get_file_names()[0]
model_file_name

'Wind_Air_4.pkl'

In [15]:
# save the model to the filesystem
joblib.dump(value=turbine_models[-1]['model'], filename=model_file_name)

['Wind_Air_4.pkl']

In [None]:
# run model from filesystem - joblib created file

print(os.getcwd(),model_file_name)
turbine_model = joblib.load(os.path.join(os.getcwd(), model_file_name))

print(type(turbine_model))
new_input = [[45, 6.6]] #Temp=45 F, Wind Speed = 6.6 m/s
poly_features = PolynomialFeatures(degree=degree)
turbine_model.predict(poly_features.fit_transform(new_input))

In [17]:
# register model 
model = run.register_model(model_name='turbineActivePower', model_path=model_file_name)
print(model.name, model.id, model.version, sep='\t')

turbineActivePower	turbineActivePower:15	15


## load and test model

In [None]:

model_file_name = 'Wind_Air_4.pkl'
ws = Workspace.from_config()
model=Model(ws, 'turbineActivePower')

print(f'model version: {model.version}')
# download the model
print(model.download(target_dir=os.getcwd(), exist_ok=True))

# verify the downloaded model file
file_path = os.path.join(os.getcwd(), model_file_name)
os.stat(file_path)

In [None]:
# run model from filesystem - downloaded

print(os.getcwd(),model_file_name)
turbine_model = joblib.load(os.path.join(os.getcwd(), model_file_name))

print(type(turbine_model))
new_input = [[45, 6.6]] #Temp=45 F, Wind Speed = 6.6 m/s
poly_features = PolynomialFeatures(degree=degree)
turbine_model.predict(poly_features.fit_transform(new_input))