```yaml
titan: v1
service:
  image: scipy
  machine:
    cpu: 2
    memory: 1024MB
```

In [17]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import json
from sklearn.tree               import DecisionTreeRegressor
from sklearn.ensemble           import ExtraTreesRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network     import MLPRegressor
from sklearn.svm                import SVR
from sklearn.linear_model       import Ridge, Lasso, SGDRegressor, BayesianRidge
from sklearn.neighbors          import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.experimental       import enable_hist_gradient_boosting
from sklearn.ensemble           import HistGradientBoostingRegressor
from sklearn import pipeline      
from sklearn import preprocessing 
from sklearn import impute
from sklearn import compose
from sklearn import metrics
import numpy as np
import time

#from lightgbm                   import LGBMRegressor
from sklearn.model_selection import train_test_split
tree_regressors = {
    "Decision_tree_regressor": DecisionTreeRegressor(),
    #"AdaBoost_regressor": AdaBoostRegressor(),
    "Extra_trees_regressor": ExtraTreesRegressor(),
    "Random_forest_regressor": RandomForestRegressor(),
    "GBM_regressor": GradientBoostingRegressor(), 
    "HGB_regressor": HistGradientBoostingRegressor(),
    #"CATBoost_regressor": CatBoostRegressor(verbose=0),
    #"lightgbm_regressor": LGBMRegressor(),
        }
mult_regressors = {
    "Linear_regression": LinearRegression(), 
    "Ridge_regressor": Ridge(),
    "SVM_regressor": SVR(), 
    "MLP_regressor": MLPRegressor(learning_rate="adaptive", max_iter=10000),
    "SGD_regressor": SGDRegressor(),
    "KNN_regressor": KNeighborsRegressor(),
    "BR_regressor" : BayesianRidge(),
    "RNN_regressor": RadiusNeighborsRegressor(), 
        }


In [2]:
# Reading the dataset from a Gitlab repo
url = "https://storage.googleapis.com/tutorial-datasets/weather_data_GER_2016.csv"
weather = pd.read_csv(url)

In [3]:
weather.head()

Unnamed: 0,timestamp,cumulated hours,lat,lon,v1,v2,v_50m,h1,h2,z0,SWTDN,SWGDN,T,rho,p
0,2016-01-01T00:00:00Z,0,47.5,5.625,0.81,1.88,3.36,2,10,0.052526,0.0,0.0,277.350159,1.236413,99282.710938
1,2016-01-01T01:00:00Z,1,47.5,5.625,0.77,1.61,2.63,2,10,0.05251,0.0,0.0,277.025665,1.23939,99300.164062
2,2016-01-01T02:00:00Z,2,47.5,5.625,0.66,1.22,1.89,2,10,0.052495,0.0,0.0,277.223755,1.243861,99310.992188
3,2016-01-01T03:00:00Z,3,47.5,5.625,0.96,1.35,1.62,2,10,0.05248,0.0,0.0,277.13324,1.24739,99314.773438
4,2016-01-01T04:00:00Z,4,47.5,5.625,1.14,1.56,1.83,2,10,0.05248,0.0,0.0,276.867767,1.248869,99324.796875


Next, we read the weather data for Germany in 2016 by reading the full csv file.

The data in the file contains the following:

* wind
  * v1: velocity [m/s] @ height h1 (2 meters above displacement height)
  * v2: velocity [m/s] @ height h2 (10 meters above displacement height)
  * v_50m: velocity [m/s] @ 50 meters above ground
  * h1: height above ground [m] (h1 = displacement height +2m)
  * h2: height above ground [m] (h2 = displacement height +10m)
  * z0: roughness length [m]
* solar parameters:
  * SWTDN: total top-of-the-atmosphere horizontal radiation [W/m²]
  * SWGDN: total ground horizontal radiation [W/m²]
* temperature data
  * T: Temperature [K] @ 2 meters above displacement height (see h1)
* air data
  * Rho: air density [kg/m³] @ surface
  *p: air pressure [Pa] @ surface

In [4]:
# Reading the dataset from a Gitlab repo
url = "https://storage.googleapis.com/tutorial-datasets/time_series_60min_singleindex_filtered.csv"
production = pd.read_csv(url)


In [5]:
production.head()


Unnamed: 0,utc_timestamp,cet_cest_timestamp,DE_wind_generation_actual
0,2015-12-31T23:00:00Z,2016-01-01T00:00:00+0100,8638
1,2016-01-01T00:00:00Z,2016-01-01T01:00:00+0100,8579
2,2016-01-01T01:00:00Z,2016-01-01T02:00:00+0100,8542
3,2016-01-01T02:00:00Z,2016-01-01T03:00:00+0100,8443
4,2016-01-01T03:00:00Z,2016-01-01T04:00:00+0100,8295


In [6]:
# Merge datasets
weather_by_day = weather.groupby(weather.index).mean()
combined = pd.merge(production, weather_by_day, how='left', left_index=True, right_index=True)

print(combined.isna().sum())
print(len(combined))

utc_timestamp                0
cet_cest_timestamp           0
DE_wind_generation_actual    0
cumulated hours              0
lat                          0
lon                          0
v1                           0
v2                           0
v_50m                        0
h1                           0
h2                           0
z0                           0
SWTDN                        0
SWGDN                        0
T                            0
rho                          0
p                            0
dtype: int64
8784


In [7]:
combined.head()

Unnamed: 0,utc_timestamp,cet_cest_timestamp,DE_wind_generation_actual,cumulated hours,lat,lon,v1,v2,v_50m,h1,h2,z0,SWTDN,SWGDN,T,rho,p
0,2015-12-31T23:00:00Z,2016-01-01T00:00:00+0100,8638,0,47.5,5.625,0.81,1.88,3.36,2,10,0.052526,0.0,0.0,277.350159,1.236413,99282.710938
1,2016-01-01T00:00:00Z,2016-01-01T01:00:00+0100,8579,1,47.5,5.625,0.77,1.61,2.63,2,10,0.05251,0.0,0.0,277.025665,1.23939,99300.164062
2,2016-01-01T01:00:00Z,2016-01-01T02:00:00+0100,8542,2,47.5,5.625,0.66,1.22,1.89,2,10,0.052495,0.0,0.0,277.223755,1.243861,99310.992188
3,2016-01-01T02:00:00Z,2016-01-01T03:00:00+0100,8443,3,47.5,5.625,0.96,1.35,1.62,2,10,0.05248,0.0,0.0,277.13324,1.24739,99314.773438
4,2016-01-01T03:00:00Z,2016-01-01T04:00:00+0100,8295,4,47.5,5.625,1.14,1.56,1.83,2,10,0.05248,0.0,0.0,276.867767,1.248869,99324.796875


In [8]:
# Apply linear regression
lr = LinearRegression()
X_wind = combined[['v1', 'v2', 'v_50m', 'z0']]
#X_wind = combined[['v1', 'v2', 'v_50m']]
#X_wind = combined[['v1', 'v2']]
y_wind = combined['DE_wind_generation_actual']
num_vars = ['v1', 'v2', 'v_50m', 'z0']
cat_vars = []

#model = lr.fit(X_wind, y_wind)

In [9]:
x_train, x_val, y_train, y_val = train_test_split(X_wind, y_wind, test_size=0.3, random_state=909)

In [10]:
num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False))]) # mean, median

cat_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan))])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),],
    remainder='drop')
                          

In [12]:
all_tree_pipes = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_regressors.items()}
tree_results = pd.DataFrame({'Model': [],"EVS":[], 'ME': [],'Time': []})
for model_name, model in all_tree_pipes.items():
    print(f"Trying {model_name}")
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    tree_results = tree_results.append({"Model":    model_name,
#                               "Accuracy": metrics.accuracy_score(y_val, pred)*100,
#                               "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "EVS":metrics.explained_variance_score(y_val, pred),
                              "ME": metrics.mean_squared_error(y_val, pred, squared=False),
                              #"MSE": metrics.mean_squared_error(y_val, pred, squared=True),
                              "Time":     total_time},
                              ignore_index=True)
                              
tree_results_ord = tree_results.sort_values(by=['ME'], ascending=True, ignore_index=True)
tree_results_ord.index += 1 
print(tree_results_ord)

Trying Decision_tree_regressor
Trying Extra_trees_regressor
Trying Random_forest_regressor
Trying GBM_regressor
Trying HGB_regressor
                     Model       EVS           ME           MSE      Time
1  Random_forest_regressor  0.656548  3885.513445  1.509721e+07  2.354658
2    Extra_trees_regressor  0.552572  4439.660601  1.971059e+07  1.055529
3            HGB_regressor  0.428973  5012.481389  2.512497e+07  0.553248
4  Decision_tree_regressor  0.375471  5233.059864  2.738492e+07  0.082865
5            GBM_regressor  0.343818  5369.059698  2.882680e+07  0.655833


In [13]:
num_4_multModels = pipeline.Pipeline(steps=[
    ('transformer',preprocessing.QuantileTransformer(n_quantiles=10)),
    #('imputer', impute.SimpleImputer(strategy='median', add_indicator=False))
]) # mean, median

cat_4_multModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan))])

mult_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_multModels, num_vars),
    ('cat', cat_4_multModels, cat_vars),],
    remainder='drop')


In [18]:
all_mult_pipes = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in mult_regressors.items()}
mult_results = pd.DataFrame({'Model': [],"EVS":[], 'ME': [], 'MSE': [], 'Time': []})
for model_name, model in all_mult_pipes.items():
    print(f"Trying {model_name}")
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    mult_results = mult_results.append({"Model":    model_name,
#                               "Accuracy": metrics.accuracy_score(y_val, pred)*100,
#                               "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "EVS":metrics.explained_variance_score(y_val, pred),
                              "ME": metrics.mean_squared_error(y_val, pred, squared=False),
                              "MSE": metrics.mean_squared_error(y_val, pred, squared=True),
                              "Time":     total_time},
                              ignore_index=True)
                              
mult_results_ord = mult_results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
mult_results_ord.index += 1 
print(mult_results_ord)


Trying Linear_regression
Trying Ridge_regressor
Trying SVM_regressor
Trying MLP_regressor
Trying SGD_regressor
Trying KNN_regressor
Trying BR_regressor
Trying RNN_regressor
               Model       EVS           ME           MSE        Time
1      MLP_regressor  0.054839  6442.526369  4.150615e+07  217.592916
2      RNN_regressor  0.045768  6474.962142  4.192513e+07    0.140601
3  Linear_regression  0.029177  6530.957860  4.265341e+07    0.038852
4    Ridge_regressor  0.029132  6531.111671  4.265542e+07    0.023890
5       BR_regressor  0.028593  6532.937102  4.267927e+07    0.009210
6      SGD_regressor  0.027499  6537.387668  4.273744e+07    0.014462
7      KNN_regressor  0.003658  6611.389137  4.371047e+07    0.020170
8      SVM_regressor  0.000901  6791.269208  4.612134e+07    3.151960


In [35]:
model = all_tree_pipes["Random_forest_regressor"]
#print(pipe)
model.fit(X_wind, y_wind)
# print(pipe.named_steps['clf'].coef_)



Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['v1', 'v2', 'v_50m', 'z0']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('ordinal',
                                                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                  unknown_value=nan))]),
                 

In [53]:
# y = pd.DataFrame([[1.0,3.1,1,0.1]], columns=['v1', 'v2', 'v_50m', 'z0'])
# x = model.predict(y)
# print(x)

[8143.3]


In [39]:
# Mock request object for local API testing
headers = {
'content-type': 'application/json'
}
body = json.dumps({
  "data": [[1.44, 1.77, 2, 0.054]]
})
REQUEST = json.dumps({ 'headers': headers, 'body': body })

In [56]:
# POST /prediction
body = json.loads(REQUEST)['body']
# predict the cluster for new samples. Function to be exposed through Titan
input_params = json.loads(body)['data']

#input_params = [[0.44, 1.77, 2, 0.054]]
d = pd.DataFrame(input_params, columns=['v1', 'v2', 'v_50m', 'z0'])
print(model.predict(d))

[13497.21]
