In [1]:
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
import warnings
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor

In [2]:
# Load the prepared KS10 dataset from a path relative to the notebook's
# working directory. The bare expression on the last line renders a
# (truncated) preview of the frame as the cell output.
FINAL_DATA = pd.read_csv(filepath_or_buffer="data/KS VERI/KS10_FINAL_DATA.csv")
FINAL_DATA

Unnamed: 0,Timestamp,PUE,I_KOMP1_HIZ,I_KOMP1_SAAT,I_KOMP2_HIZ,I_KOMP2_SAAT,I_SIC_SET,I_NEM_SET,II_KOMP1_HIZ,II_KOMP1_SAAT,...,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
0,2023-04-01 00:00:00,1.593893,25.000000,8968.0,0.0,3.0,45.0,23.0,36.200001,12770.0,...,0,0,1,0,0,0,0,0,0,0
1,2023-04-01 00:05:00,1.623764,45.200001,8968.0,0.0,3.0,45.0,23.0,34.000000,12770.0,...,0,0,1,0,0,0,0,0,0,0
2,2023-04-01 00:10:00,1.634981,25.700001,8968.0,0.0,3.0,45.0,23.0,34.700001,12770.0,...,0,0,1,0,0,0,0,0,0,0
3,2023-04-01 00:15:00,1.608970,28.400000,8968.0,0.0,3.0,45.0,23.0,39.900002,12770.0,...,0,0,1,0,0,0,0,0,0,0
4,2023-04-01 00:20:00,1.554008,28.000000,8968.0,0.0,3.0,45.0,23.0,36.900002,12770.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59625,2023-10-25 00:45:00,1.611181,26.400000,13761.0,0.0,3.0,45.0,22.0,0.000000,17241.0,...,0,0,0,0,0,0,0,0,1,0
59626,2023-10-25 00:50:00,1.831158,39.200001,13761.0,0.0,3.0,45.0,22.0,49.700001,17241.0,...,0,0,0,0,0,0,0,0,1,0
59627,2023-10-25 00:55:00,1.627907,44.000000,13761.0,0.0,3.0,45.0,22.0,0.000000,17241.0,...,0,0,0,0,0,0,0,0,1,0
59628,2023-10-25 01:00:00,1.590126,0.000000,13761.0,0.0,3.0,45.0,22.0,29.400000,17241.0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
# Report columns that hold one single value in every row.
# NOTE: this cell only *lists* them; nothing is dropped from FINAL_DATA here,
# so these columns are still present for any later cell that uses the frame.
# `nunique(dropna=False)` counts NaN as a value, matching `len(unique())`.
constant_columns = [
    col for col in FINAL_DATA.columns
    if FINAL_DATA[col].nunique(dropna=False) == 1
]

print("Columns that has the same value for all rows:")
for col in constant_columns:
    print(col)

Columns that has the same value for all rows:
I_KOMP2_HIZ
I_KOMP2_SAAT
I_SIC_SET
II_KOMP2_HIZ
II_KOMP2_SAAT
II_SIC_SET
III_KOMP2_HIZ
III_SIC_SET
IV_KOMP2_HIZ
IV_NEM_SET
month_1
month_2
month_3
month_11


In [4]:
# Add autoregressive features for the PUE target:
#   * PUE_lag_1       - PUE of the immediately preceding row
#   * PUE_lag_one_day - PUE from SAMPLES_PER_DAY rows earlier
# Both lags are row-based shifts, so "one day" is only exact if the rows are
# evenly spaced with no gaps -- TODO confirm for the full Timestamp column
# (the preview shows 5-minute steps, i.e. 12 rows per hour).
SAMPLES_PER_DAY = 24 * 12

# Built as a single non-mutating chain so that re-running the cell always
# starts from FINAL_DATA and gives the same result. `dropna()` removes every
# row containing a NaN in *any* column, which includes the leading rows whose
# lag values are undefined; the index is then renumbered from 0.
DATA = (
    FINAL_DATA
    .assign(
        PUE_lag_1=FINAL_DATA["PUE"].shift(1),
        PUE_lag_one_day=FINAL_DATA["PUE"].shift(SAMPLES_PER_DAY),
    )
    .dropna()
    .reset_index(drop=True)
)
DATA

Unnamed: 0,Timestamp,PUE,I_KOMP1_HIZ,I_KOMP1_SAAT,I_KOMP2_HIZ,I_KOMP2_SAAT,I_SIC_SET,I_NEM_SET,II_KOMP1_HIZ,II_KOMP1_SAAT,...,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,PUE_lag_1,PUE_lag_one_day
0,2023-04-02 00:00:00,1.576731,25.000000,8991.0,0.0,3.0,45.0,23.0,40.000000,12794.0,...,0,0,0,0,0,0,0,0,1.571346,1.593893
1,2023-04-02 00:05:00,1.621673,25.700001,8992.0,0.0,3.0,45.0,23.0,36.200001,12794.0,...,0,0,0,0,0,0,0,0,1.576731,1.623764
2,2023-04-02 00:10:00,1.645333,31.000000,8992.0,0.0,3.0,45.0,23.0,27.200001,12794.0,...,0,0,0,0,0,0,0,0,1.621673,1.634981
3,2023-04-02 00:15:00,1.603257,29.500000,8992.0,0.0,3.0,45.0,23.0,30.200001,12794.0,...,0,0,0,0,0,0,0,0,1.645333,1.608970
4,2023-04-02 00:20:00,1.588868,37.500000,8992.0,0.0,3.0,45.0,23.0,35.799999,12794.0,...,0,0,0,0,0,0,0,0,1.603257,1.554008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59337,2023-10-25 00:45:00,1.611181,26.400000,13761.0,0.0,3.0,45.0,22.0,0.000000,17241.0,...,0,0,0,0,0,0,1,0,1.752532,1.566098
59338,2023-10-25 00:50:00,1.831158,39.200001,13761.0,0.0,3.0,45.0,22.0,49.700001,17241.0,...,0,0,0,0,0,0,1,0,1.611181,1.681818
59339,2023-10-25 00:55:00,1.627907,44.000000,13761.0,0.0,3.0,45.0,22.0,0.000000,17241.0,...,0,0,0,0,0,0,1,0,1.831158,1.637288
59340,2023-10-25 01:00:00,1.590126,0.000000,13761.0,0.0,3.0,45.0,22.0,29.400000,17241.0,...,0,0,0,0,0,0,1,0,1.627907,1.822105


In [5]:
# Left-hand side of the regression formula (the response). Despite the "_x"
# suffix this is the dependent variable; the name is kept unchanged because
# other cells may refer to it.
# The log is only finite where PUE > 1 -- presumably true for all rows; verify.
formula_x = "np.log(PUE-1)"

# Right-hand side: every column of DATA except the timestamp and the response
# itself, joined with " + ". Columns are excluded by name rather than by
# position so a reordered frame cannot leak Timestamp/PUE into the predictors.
NON_PREDICTOR_COLUMNS = ("Timestamp", "PUE")
formula_y = " + ".join(
    col for col in DATA.columns if col not in NON_PREDICTOR_COLUMNS
)

formula = formula_x + " ~ " + formula_y
print(formula)

np.log(PUE-1) ~ I_KOMP1_HIZ + I_KOMP1_SAAT + I_KOMP2_HIZ + I_KOMP2_SAAT + I_SIC_SET + I_NEM_SET + II_KOMP1_HIZ + II_KOMP1_SAAT + II_KOMP2_HIZ + II_KOMP2_SAAT + II_SIC_SET + II_NEM_SET + III_KOMP1_HIZ + III_KOMP1_SAAT + III_KOMP2_HIZ + III_KOMP2_SAAT + III_SIC_SET + III_NEM_SET + IV_KOMP1_HIZ + IV_KOMP1_SAAT + IV_KOMP2_HIZ + IV_KOMP2_SAAT + IV_NEM_SET + CH1_CIKIS_SIC + CH1_GIRIS_SIC + CH2_CIKIS_SIC + CH2_GIRIS_SIC + CH3_CIKIS_SIC + CH3_GIRIS_SIC + SENSOR_I_TEMP + SENSOR_II_TEMP + OUTLET_TEMP + OUTLET_HUMIDITY + Mon + Tue + Wed + Thu + Fri + Sat + hour_0 + hour_1 + hour_2 + hour_3 + hour_4 + hour_5 + hour_6 + hour_7 + hour_8 + hour_9 + hour_10 + hour_11 + hour_12 + hour_13 + hour_14 + hour_15 + hour_16 + hour_17 + hour_18 + hour_19 + hour_20 + hour_21 + hour_22 + hour_23 + month_1 + month_2 + month_3 + month_4 + month_5 + month_6 + month_7 + month_8 + month_9 + month_10 + month_11 + PUE_lag_1 + PUE_lag_one_day


In [6]:
# Fit an ordinary-least-squares model for the R-style `formula` string using
# the columns of DATA, then print the statsmodels summary table.
# (`sm` here is the alias this notebook gives to statsmodels.formula.api.)
reg_model = sm.ols(
    data=DATA,
    formula=formula,
).fit()
print(reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log(PUE - 1)   R-squared:                       0.685
Model:                            OLS   Adj. R-squared:                  0.685
Method:                 Least Squares   F-statistic:                     2151.
Date:                Sun, 26 Nov 2023   Prob (F-statistic):               0.00
Time:                        06:13:11   Log-Likelihood:                 73822.
No. Observations:               59342   AIC:                        -1.475e+05
Df Residuals:                   59281   BIC:                        -1.470e+05
Df Model:                          60                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0002   3.11e-05     