# Exercise: Power demand forecasting 
https://github.com/abulbasar/data/blob/master/Combined_Cycle_Power_Plant.csv?raw=true
    
    Predict EP using the other variables as features. Calculate R2 and RMSE for the training and test data. Create the training and test sets with a 70/30 ratio and random seed = 1. Improve your model using a log transformation of the output and a polynomial transformation of the features.

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, pipeline,\
 model_selection, metrics, linear_model

In [3]:
# Read the Combined Cycle Power Plant CSV into a DataFrame and preview the first rows.
csv_path = "/data/Combined_Cycle_Power_Plant.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,AT,V,AP,RH,EP
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [4]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47840 entries, 0 to 47839
Data columns (total 5 columns):
AT    47840 non-null float64
V     47840 non-null float64
AP    47840 non-null float64
RH    47840 non-null float64
EP    47840 non-null float64
dtypes: float64(5)
memory usage: 1.8 MB


In [5]:
# Baseline model: standardized features fed to ordinary least squares,
# fitted on the untransformed target.
target = "EP"
y = df[target]
features = df.drop(columns=[target])
# get_dummies one-hot encodes any non-numeric columns and passes numeric ones through.
X = pd.get_dummies(features, drop_first=True).values.astype("float")

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)

steps = [
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression()),
]
pipe = pipeline.Pipeline(steps)
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# (label, actual, predicted) per split; R2 is reported for both splits first,
# then RMSE, matching the order of the printed report.
scored = [
    ("training", y_train, y_train_pred),
    ("test", y_test, y_test_pred),
]

for split_name, actual, predicted in scored:
    print(split_name + " r2", metrics.r2_score(actual, predicted))

for split_name, actual, predicted in scored:
    rmse = np.sqrt(metrics.mean_squared_error(actual, predicted))
    print(split_name + " rmse:", rmse)


training r2 0.928778403256508
test r2 0.9284502782189372
training rmse: 4.552608777072398
test rmse: 4.569084030756969


In [6]:
# Improved model: degree-2 polynomial features -> standardization -> ordinary
# least squares, fitted on log(EP) instead of EP.
target = "EP"
X = df.drop(columns=[target])
# get_dummies one-hot encodes any non-numeric columns and passes numeric ones through.
X = pd.get_dummies(X, drop_first=True).values.astype("float")
# NOTE(review): np.log assumes EP > 0 for every row — confirm against the data.
y = np.log(df[target])

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                           test_size = 0.3, random_state = 1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression())
])


pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Metrics on the log scale: y_* and the predictions are all log(EP) here, so
# this RMSE is in log units and is NOT comparable with an RMSE computed on EP.
print("training r2", metrics.r2_score(y_train, y_train_pred))
print("test r2", metrics.r2_score(y_test, y_test_pred))

print("training rmse:", np.sqrt(metrics.mean_squared_error(y_train
                                                , y_train_pred)))
print("test rmse:", np.sqrt(metrics.mean_squared_error(y_test
                                            , y_test_pred)))

# Metrics on the original EP scale: undo the log with exp on both the targets
# and the predictions so the scores can be compared with a model fitted on EP.
ep_train, ep_test = np.exp(y_train), np.exp(y_test)
ep_train_pred, ep_test_pred = np.exp(y_train_pred), np.exp(y_test_pred)

print("training r2 (EP scale)", metrics.r2_score(ep_train, ep_train_pred))
print("test r2 (EP scale)", metrics.r2_score(ep_test, ep_test_pred))

print("training rmse (EP scale):", np.sqrt(metrics.mean_squared_error(
    ep_train, ep_train_pred)))
print("test rmse (EP scale):", np.sqrt(metrics.mean_squared_error(
    ep_test, ep_test_pred)))


training r2 0.9371394353763669
test r2 0.9369188951192259
training rmse: 0.00936796330181621
test rmse: 0.00939992074304685
