In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns


In [None]:
# Load Dataset
# Read the uploaded CSV into the notebook-wide DataFrame `df`.
DATASET_PATH = '/content/complete_dataset.csv'
df = pd.read_csv(DATASET_PATH)


In [None]:
# Basic Data Exploration
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()
# Summary statistics of the numeric columns, then the first five rows.
print(df.describe())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2106 entries, 0 to 2105
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             2106 non-null   object 
 1   demand           2106 non-null   float64
 2   RRP              2106 non-null   float64
 3   demand_pos_RRP   2106 non-null   float64
 4   RRP_positive     2106 non-null   float64
 5   demand_neg_RRP   2106 non-null   float64
 6   RRP_negative     2106 non-null   float64
 7   frac_at_neg_RRP  2106 non-null   float64
 8   min_temperature  2106 non-null   float64
 9   max_temperature  2106 non-null   float64
 10  solar_exposure   2105 non-null   float64
 11  rainfall         2103 non-null   float64
 12  school_day       2106 non-null   object 
 13  holiday          2106 non-null   object 
dtypes: float64(11), object(3)
memory usage: 230.5+ KB
None
              demand          RRP  demand_pos_RRP  RRP_positive  \
count    2106.000000  2106.000

## Data Insights
1. The dataset contains information on electricity demand, pricing (RRP), weather conditions (temperature, solar exposure, rainfall), and special event indicators (holidays, school days).
2. Some missing values were found in `solar_exposure` (1 missing) and `rainfall` (3 missing). These may require imputation.
3. The demand and RRP exhibit seasonal variations, which can be leveraged for forecasting.

In [None]:
# Demand and Price Analysis
# Draw demand and RRP as two line traces against the date column in one figure.
axis_labels = {'value': 'Value', 'date': 'Date'}
fig = px.line(
    df,
    x='date',
    y=['demand', 'RRP'],
    labels=axis_labels,
    title='Electricity Demand vs RRP Over Time',
)
fig.show()

In [12]:
#Hourly Demand Trend
# 'Hour' is only added to df by the later time-feature cell, so on a fresh
# kernel this cell would fail. Derive it here on a copy so the cell runs in
# notebook order and leaves df untouched.
# NOTE(review): the dates printed elsewhere in this notebook carry no time of
# day; if the data is daily, every row lands in hour 0 -- confirm granularity.
hourly_view = df.assign(Hour=pd.to_datetime(df['date']).dt.hour)
fig = px.box(hourly_view, x='Hour', y='demand', title='Hourly Electricity Demand Distribution')
fig.show()

## Demand & Price Analysis
- Electricity demand exhibits periodic fluctuations, with noticeable peaks and troughs.
- High demand correlates with higher RRP, indicating price surges during peak periods.
- Hourly trends reveal significant variations in electricity consumption.
- Demand fluctuates daily, weekly, and seasonally.





In [None]:
# Correlation Heatmap
import plotly.figure_factory as ff
# Calculate correlation only for numeric columns
corr_matrix = df.select_dtypes(include=np.number).corr()
# Label each cell with the coefficient rounded to two decimals; without an
# explicit annotation_text the full-precision floats are printed, which makes
# the labels hard to read. Cell colours still use the unrounded values.
fig = ff.create_annotated_heatmap(z=corr_matrix.values,
                                 x=list(corr_matrix.columns),
                                 y=list(corr_matrix.index),
                                 annotation_text=corr_matrix.round(2).values,
                                 colorscale='Viridis', showscale=True)
fig.update_layout(title_text='Feature Correlation Heatmap')
fig.show()

## Correlation Analysis

* Electricity demand is positively correlated with temperature.

* RRP and demand show moderate correlation, influencing pricing strategies.

* Weather factors like solar exposure and rainfall may indirectly affect demand.


In [13]:
# Weather Impact Analysis
# One point per row: maximum temperature on x, electricity demand on y.
temperature_labels = {
    'max_temperature': 'Max Temperature',
    'demand': 'Electricity Demand',
}
fig = px.scatter(
    df,
    x='max_temperature',
    y='demand',
    title='Max Temperature vs Electricity Demand',
    labels=temperature_labels,
)
fig.show()



In [14]:
#Seasonal Trends
# 'Month' is only added to df by the later time-feature cell, so derive it
# here on a copy; the cell then runs in notebook order and leaves df untouched.
# Plotting every raw row against its month would join all rows with a single
# zig-zag line, so aggregate to the mean demand per calendar month first.
monthly_demand = (
    df.assign(Month=pd.to_datetime(df['date']).dt.month)
    .groupby('Month', as_index=False)['demand']
    .mean()
)
fig = px.line(monthly_demand, x='Month', y='demand', title='Monthly Electricity Demand Trends', markers=True)
fig.show()

## Weather & Seasonal Impact

*  Higher temperatures increase electricity demand, likely due to cooling needs.

*  Seasonal trends indicate peak demand periods, aiding in resource optimization.





In [16]:
# Special Events Impact Analysis
# For each event flag that exists in the frame, report the mean demand per
# flag value. Each section is skipped when its column is absent.
available_columns = df.columns
if 'holiday' in available_columns:
    holiday_demand = df['demand'].groupby(df['holiday']).mean()
    print("Average Demand on Holidays vs Non-Holidays:\n", holiday_demand)
if 'school_day' in available_columns:
    school_day_demand = df['demand'].groupby(df['school_day']).mean()
    print("Average Demand on School Days vs Non-School Days:\n", school_day_demand)

Average Demand on Holidays vs Non-Holidays:
 holiday
N    120698.665414
Y    102560.018052
Name: demand, dtype: float64
Average Demand on School Days vs Non-School Days:
 school_day
N    117513.024946
Y    121169.104078
Name: demand, dtype: float64


## Special Events Impact

* Holidays generally show lower demand, while school days contribute to higher consumption.

* Adjusting forecasting models for holidays can enhance accuracy.

In [17]:
#Anomaly Detection using Z-score
from scipy.stats import zscore

# Absolute standardised demand, stored on df as the 'Z_Score' column.
df['Z_Score'] = np.abs(zscore(df['demand']))
# Rows whose absolute z-score exceeds 3 are reported as anomalies.
is_anomaly = df['Z_Score'] > 3
anomalies = df[is_anomaly]
print("Detected Anomalies:\n", anomalies[['date', 'demand']])

# Scatter of demand over time, coloured by the anomaly flag.
fig = px.scatter(
    df,
    x='date',
    y='demand',
    title='Anomalies in Electricity Demand',
    color=is_anomaly,
    labels={'date': 'Date', 'demand': 'Electricity Demand'},
)
fig.show()

Detected Anomalies:
            date      demand
1114 2018-01-19  165070.595
1485 2019-01-25  168894.845
1520 2019-03-01  163682.040
1856 2020-01-31  170653.840


## Anomaly Detection

* Detected demand spikes using Z-score analysis.

* These anomalies may be due to extreme weather, operational issues, or special events.

* Handling anomalies in forecasting models can improve accuracy.


In [18]:
#Creating Time-based Features

# Parse the date column once, store the parsed values back on df, then read
# each calendar component off the parsed values via the .dt accessor.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
date_parts = {
    'Hour': 'hour',
    'DayOfWeek': 'dayofweek',
    'Month': 'month',
    'Year': 'year',
}
for column_name, accessor_attr in date_parts.items():
    df[column_name] = getattr(parsed_dates.dt, accessor_attr)

In [19]:
#Lag Features for Forecasting

# Shift the demand column down by 1 and by 7 rows to expose earlier values on
# each row; the first 1 / 7 rows have no earlier value and are left missing.
demand_series = df['demand']
df['demand_Lag_1'] = demand_series.shift(periods=1)
df['demand_Lag_7'] = demand_series.shift(periods=7)

## Feature Engineering

* Added time-based features (Hour, DayOfWeek, Month, Year).

* Introduced lag variables (demand_Lag_1, demand_Lag_7) for time dependencies.

* Seasonal adjustments and anomaly detection help refine forecasting models.


In [20]:
# Save Processed Data
# Write the enriched frame (original columns plus every column added above)
# to CSV without the row index, then confirm on stdout.
PROCESSED_PATH = '/content/processed_electricity_data.csv'
df.to_csv(PROCESSED_PATH, index=False)

print("Analysis complete. Processed data saved.")

Analysis complete. Processed data saved.
