# This notebook

This is the continuation of IrrToProduction but by focusing onn a single plant at a time.

## Goals

- Predict AC_POWER with good accuracy and without offerfitting
- Forecast AC_POWER

### Predict AC_POWER

In [2]:
# Import

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import  make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error

In [3]:
# Import & Transforms data
def preprocess(d, plant_nb):
    d["generator"] = d["generator"].drop(columns=["SOURCE_KEY", "PLANT_ID", "DC_POWER", "TOTAL_YIELD", "DAILY_YIELD"])
    d['generator']['DATE_TIME'] = pd.to_datetime(d['generator']['DATE_TIME'], format='%d-%m-%Y %H:%M') if plant_nb==1 else pd.to_datetime(d['generator']['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
    d["sensor"] = d["sensor"].drop(columns=["time", "date"])
    X = d['generator'].merge(d["sensor"], on="DATE_TIME").drop(columns=["DATE_TIME"])
    y = X.pop("AC_POWER")
    return X, y 


p1_X, p1_y = preprocess({"generator": pd.read_csv("dataset/Plant_1_Generation_Data.csv"), 
           "sensor": pd.read_parquet("dataset/parquets/plant_1_updated_sensor")}, 1)

p2_X, p2_y = preprocess({"generator": pd.read_csv("dataset/Plant_2_Generation_Data.csv"), 
           "sensor": pd.read_parquet("dataset/parquets/plant_2_updated_sensor")}, 2)

x_train, x_test, y_train, y_test = train_test_split(p1_X, p1_y, test_size=.5)

In [4]:
# Train
param_dist = {
    'n_estimators': np.arange(50, 200, 5),
    'max_depth': np.arange(2, 10, 1),
    'learning_rate': np.linspace(0.01, 1, 25),
    'subsample': np.arange(0.1, 1.0, 0.25), 
    'max_features': np.linspace(0.1, 1.0, 5),
    'lambda_l2': np.linspace(0.01, 0.5, 5),
}

model_search = RandomizedSearchCV(
    lgb.LGBMRegressor(), 
    param_dist, 
    n_iter=500,
    cv=None, 
    n_jobs=-1,
)

model_search.fit(x_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000469 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 861
[LightGBM] [Info] Number of data points in the train set: 34389, number of used features: 4
[LightGBM] [Info] Start training from score 306.373374


In [5]:
print(model_search.best_params_)

{'subsample': 0.6, 'n_estimators': 155, 'max_features': 0.775, 'max_depth': 8, 'learning_rate': 0.17500000000000002, 'lambda_l2': 0.01}


In [11]:
# Test

metrics = {
    "Name":[],
    "R2":[], 
    "MAE":[], 
    "MSE":[],
    "RMSE":[],
}

In [12]:
preds = model_search.predict(x_test)
metrics['Name'].append("Plain")
metrics['R2'].append(r2_score(preds, y_test))
metrics['MAE'].append(mean_absolute_error(preds, y_test))
metrics['MSE'].append(mean_squared_error(preds, y_test, squared=True))
metrics['RMSE'].append(mean_squared_error(preds, y_test, squared=False))



In [14]:
pd.DataFrame(metrics)

Unnamed: 0,Name,R2,MAE,MSE,RMSE
0,Plain,0.984504,16.898603,2387.16098,48.858581


### Forecast AC_POWER