In [47]:
# Basic libraries 
import pandas as pd
import numpy as np

# Data Visualization
import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio
from IPython.display import display
from plotly.offline import init_notebook_mode
# Statsmodels for Linear Regression
import statsmodels.api as sm

# Hiding warnings 
import warnings
warnings.filterwarnings("ignore")

In [55]:
# Load the day-level dataset from CSV.
df = pd.read_csv('esb_day_level.csv')

# Keep only the predictors plus the target ('energy') and drop every row that
# has a missing value in any of them.
# The result is built with a chained .dropna() rather than dropna(inplace=True)
# on a column selection of `df`: the in-place form mutates a derived object and
# can trigger SettingWithCopyWarning (hidden here by the warnings filter).
df_to_use = (
    df[['dist', 'idling_mins', 'speed', 'region_NC', 'region_SC', 'region_RM', 'energy']]
    .dropna()
)
df_to_use.head(2)

Unnamed: 0,dist,idling_mins,speed,region_NC,region_SC,region_RM,energy
0,6.265156,0.1,11.089115,1,0,0,33.618298
1,19.925346,6.066667,9.504779,1,0,0,149.258159


In [56]:
# Separate the cleaned frame into the response and the predictors.
y = df_to_use['energy']                 # target variable: the 'energy' column
X = df_to_use.drop(columns=['energy'])  # independent features: every other column

In [57]:
# Partition the data into a training set and a held-out testing set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,     # 30% of the rows are reserved for testing
    shuffle=True,     # rows are shuffled before being split
    random_state=42,  # fixed seed so the same split is produced on every run
)

In [58]:
# Add an explicit constant column to both design matrices so the regression
# can estimate an intercept term.
# The keyword values below are add_constant's defaults, written out for
# clarity: the column is placed first, and it is skipped when the data already
# contains a constant column.
X_train = sm.add_constant(X_train, prepend=True, has_constant='skip')
X_test = sm.add_constant(X_test, prepend=True, has_constant='skip')

In [59]:
# Inspect the training design matrix after sm.add_constant has been applied.
X_train # Training features, now including the added 'const' column

Unnamed: 0,const,dist,idling_mins,speed,region_NC,region_SC,region_RM
1831,1.0,50.327691,15.200000,16.854997,0,1,0
444,1.0,19.574233,24.033333,17.072813,1,0,0
2688,1.0,87.000000,102.000000,9.987592,0,0,0
1532,1.0,63.050000,48.600000,15.752686,0,1,0
856,1.0,69.440000,28.800000,19.336563,0,1,0
...,...,...,...,...,...,...,...
1638,1.0,41.140000,54.600000,11.239606,0,1,0
1095,1.0,89.970000,10.200000,29.155774,0,1,0
1130,1.0,72.470000,24.600000,26.880881,0,1,0
1294,1.0,91.530000,16.800000,17.628602,0,1,0


In [60]:
# Fit an ordinary least squares regression of the target on the training
# design matrix; missing='drop' tells statsmodels to drop observations that
# contain NaN values.
model = sm.OLS(endog=y_train, exog=X_train, missing='drop').fit()

# Print the regression table; alpha=0.05 sets the significance level used for
# the reported confidence intervals.
print(model.summary(alpha=0.05))

                            OLS Regression Results                            
Dep. Variable:                 energy   R-squared:                       0.808
Model:                            OLS   Adj. R-squared:                  0.808
Method:                 Least Squares   F-statistic:                     1265.
Date:                Sun, 29 Dec 2024   Prob (F-statistic):               0.00
Time:                        16:38:59   Log-Likelihood:                -8279.9
No. Observations:                1806   AIC:                         1.657e+04
Df Residuals:                    1799   BIC:                         1.661e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const         -52.1681      2.801    -18.626      

In [61]:
# Predict the dependent variable on the testing set and score the predictions.

# mean_squared_error is not imported in any cell visible above, so this cell
# raised NameError on a fresh kernel; import it here, next to its only use.
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)  # Running predictions on the test design matrix

# Take the square root explicitly instead of passing squared=False: that
# keyword is deprecated in scikit-learn 1.4 and removed in 1.6, while
# sqrt(MSE) yields the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'\nRoot Mean Squared Error for Baseline Model: {rmse:.2f}')


Root Mean Squared Error for Baseline Model: 21.87


In [65]:
import plotly.express as px

# Scatter plot comparing the observed test-set targets (x axis) with the
# model's predictions for the same rows (y axis).
fig = px.scatter(
    x=y_test,
    y=y_pred,
    title='Actual vs Predicted Values',
    labels={'x': 'Actual Values', 'y': 'Predicted Values'},
)
fig.show()

In [None]:
# Display the predictor frame X (built before the train/test split; the
# constant column is added only to X_train / X_test, not to X itself).
X

In [62]:
# Inspect the test-set predictions returned by model.predict(X_test).
y_pred

809      93.086147
1446     60.126092
761     139.969014
318      41.325972
961     173.456705
           ...    
158      45.550849
1609    136.265829
331      32.840296
1825     85.846776
2623     13.306832
Length: 775, dtype: float64