In [47]:
# Importing Libraries:

import pandas as pd
import numpy as np
from datetime import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# Load the raw solar power plant measurements from the CSV file:

csv_path = "Solar_Power_Plant_Data.csv"
df_raw = pd.read_csv(csv_path)

In [4]:
# Work on a deep copy so that df_raw keeps the data exactly as loaded:
df = df_raw.copy(deep=True)

In [5]:
# Looking at the first 5 rows:

df.head(n=5)

Unnamed: 0,Date-Hour(NMT),WindSpeed,Sunshine,AirPressure,Radiation,AirTemperature,RelativeAirHumidity,SystemProduction
0,01.01.2017-00:00,0.6,0,1003.8,-7.4,0.1,97,0.0
1,01.01.2017-01:00,1.7,0,1003.5,-7.4,-0.2,98,0.0
2,01.01.2017-02:00,0.6,0,1003.4,-6.7,-1.2,99,0.0
3,01.01.2017-03:00,2.4,0,1003.3,-7.2,-1.3,99,0.0
4,01.01.2017-04:00,4.0,0,1003.1,-6.3,3.6,67,0.0


In [6]:
# Summary of the dataset: column names, non-null counts, dtypes and memory usage:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date-Hour(NMT)       8760 non-null   object 
 1   WindSpeed            8760 non-null   float64
 2   Sunshine             8760 non-null   int64  
 3   AirPressure          8760 non-null   float64
 4   Radiation            8760 non-null   float64
 5   AirTemperature       8760 non-null   float64
 6   RelativeAirHumidity  8760 non-null   int64  
 7   SystemProduction     8760 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 547.6+ KB


In [7]:
# Number of missing values in each column:

missing_per_column = df.isna().sum()
missing_per_column

Date-Hour(NMT)         0
WindSpeed              0
Sunshine               0
AirPressure            0
Radiation              0
AirTemperature         0
RelativeAirHumidity    0
SystemProduction       0
dtype: int64

In [8]:
# Convert the Date-Hour(NMT) column from text to datetime.
# The raw values look like "01.01.2017-00:00", i.e. day.month.year-hour:minute.

timestamp_format = "%d.%m.%Y-%H:%M"
df['Date-Hour(NMT)'] = pd.to_datetime(df['Date-Hour(NMT)'], format=timestamp_format)

# Confirm the new dtype of the column:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date-Hour(NMT)       8760 non-null   datetime64[ns]
 1   WindSpeed            8760 non-null   float64       
 2   Sunshine             8760 non-null   int64         
 3   AirPressure          8760 non-null   float64       
 4   Radiation            8760 non-null   float64       
 5   AirTemperature       8760 non-null   float64       
 6   RelativeAirHumidity  8760 non-null   int64         
 7   SystemProduction     8760 non-null   float64       
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 547.6 KB


In [9]:
# Use the Date-Hour(NMT) column as the index of the DataFrame (modifies df in place):

df.set_index(keys="Date-Hour(NMT)", inplace=True)

In [10]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns:

summary_stats = df.describe()
summary_stats

Unnamed: 0,WindSpeed,Sunshine,AirPressure,Radiation,AirTemperature,RelativeAirHumidity,SystemProduction
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,2.639823,11.180479,1010.361781,97.538493,6.978893,76.719406,684.746071
std,1.628754,21.171295,12.793971,182.336029,7.604266,19.278996,1487.454665
min,0.0,0.0,965.9,-9.3,-12.4,13.0,0.0
25%,1.4,0.0,1002.8,-6.2,0.5,64.0,0.0
50%,2.3,0.0,1011.0,-1.4,6.4,82.0,0.0
75%,3.6,7.0,1018.2,115.6,13.4,93.0,464.24995
max,10.9,60.0,1047.3,899.7,27.1,100.0,7701.0


In [11]:
# Downcast the numeric columns to narrower dtypes.
# Each column is reassigned on the existing DataFrame, one at a time.

narrow_dtypes = {
    'Sunshine': "int16",
    'RelativeAirHumidity': "int16",
    'WindSpeed': "float32",
    'Radiation': "float32",
    'AirTemperature': "float32",
}

for column_name, target_dtype in narrow_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

The memory usage of the dataset was reduced from 547.6 KB to 342.2 KB.

In [12]:
# Summary of the dataset again, to check the dtypes and memory usage:

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8760 entries, 2017-01-01 00:00:00 to 2017-12-31 23:00:00
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   WindSpeed            8760 non-null   float32
 1   Sunshine             8760 non-null   int16  
 2   AirPressure          8760 non-null   float64
 3   Radiation            8760 non-null   float32
 4   AirTemperature       8760 non-null   float32
 5   RelativeAirHumidity  8760 non-null   int16  
 6   SystemProduction     8760 non-null   float64
dtypes: float32(3), float64(2), int16(2)
memory usage: 342.2 KB


Next, we will look at the histograms of all variables.

In [80]:
# Histogram grid: one panel per variable, filled row by row on a 3-column grid.
# The same list provides both the subplot titles and the plotted columns, so a
# title can never be misspelled or drift from the data shown in its panel.
hist_columns = ["WindSpeed", "Sunshine", "RelativeAirHumidity",
                "Radiation", "AirTemperature", "AirPressure", "SystemProduction"]
n_cols = 3
n_rows = -(-len(hist_columns) // n_cols)  # ceiling division: 7 panels -> 3 rows

fig_sub = make_subplots(rows=n_rows, cols=n_cols, shared_yaxes=False, shared_xaxes=False,
                        subplot_titles=hist_columns)

for position, column in enumerate(hist_columns):
    # divmod gives zero-based (row, col); plotly subplot coordinates are one-based.
    grid_row, grid_col = divmod(position, n_cols)
    fig_sub.add_trace(
        go.Histogram(x=df[column], showlegend=False),
        row=grid_row + 1, col=grid_col + 1
    )

# Last expression of the cell: renders the figure.
fig_sub