In [53]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [54]:
# Read the raw wind dataset. The file is separated by ';' rather than the
# usual ',', so the delimiter has to be given explicitly.
data = pd.read_csv('WindData.csv', delimiter=';')
data


Unnamed: 0,Date,Wind Production PZ_1,Wind Production PZ_2,Station1_WND,Station1_DD,Station2_WND,Station2_DD,Station3_WND,Station3_DD,Station4_WND,...,Station8_WND,Station8_DD,Station9_WND,Station9_DD,Station10_WND,Station10_DD,Station11_WND,Station11_DD,Station12_WND,Station12_DD
0,01/01/2016 00:00,1396847852,1227055319,96,191,61,179,7,224,84,...,124,199,77,228,61,195,57,184,102,202
1,01/01/2016 01:00,1304469541,1051476672,98,188,61,177,66,227,83,...,132,197,73,231,6,193,53,185,107,200
2,01/01/2016 02:00,1201416846,80806082,99,188,66,174,63,231,83,...,134,197,69,235,63,193,53,188,104,199
3,01/01/2016 03:00,1198324268,6763768345,10,187,71,172,6,234,84,...,136,197,66,239,67,193,52,191,101,197
4,01/01/2016 04:00,1113926804,5157622764,98,188,75,172,61,232,85,...,13,202,64,237,7,193,48,195,97,197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,31/12/2016 19:00,304962372,843204789,127,250,125,249,131,244,129,...,145,255,142,239,121,251,109,254,11,244
8780,31/12/2016 20:00,3045754898,84866082,127,250,122,248,13,244,127,...,148,253,141,240,12,250,109,253,114,245
8781,31/12/2016 21:00,3091585559,8454864019,128,249,12,247,13,244,126,...,152,252,141,241,119,248,11,252,118,246
8782,31/12/2016 22:00,3092379163,8211822639,127,249,115,248,13,245,124,...,153,251,14,242,118,247,11,252,12,246


0       1396847852
1       1304469541
2       1201416846
3       1198324268
4       1113926804
           ...    
8779     304962372
8780    3045754898
8781    3091585559
8782    3092379163
8783    2935443872
Name: Wind Production PZ_1, Length: 8784, dtype: object

### Key points about the dataset
- Actual production for two regions (MW) = `Wind Production PZ_1` and `Wind Production PZ_2` columns
- Forecasted wind speed (m/s) = `WND` columns
- Forecasted wind direction (degrees) = `DD` columns


In [55]:
# Inspect each column's dtype and non-null count to check that the data types
# are correct; columns reported as `object` were not parsed as numbers.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Date                  8784 non-null   object
 1   Wind Production PZ_1  8784 non-null   object
 2   Wind Production PZ_2  8784 non-null   object
 3   Station1_WND          8784 non-null   object
 4   Station1_DD           8784 non-null   int64 
 5   Station2_WND          8784 non-null   object
 6   Station2_DD           8784 non-null   object
 7   Station3_WND          8784 non-null   object
 8   Station3_DD           8784 non-null   object
 9   Station4_WND          8784 non-null   object
 10  Station4_DD           8784 non-null   object
 11  Station5_WND          8784 non-null   object
 12  Station5_DD           8784 non-null   int64 
 13  Station6_WND          8784 non-null   object
 14  Station6_DD           8784 non-null   int64 
 15  Station7_WND          8784 non-null   

In [56]:
# Convert every column to a proper dtype: datetime for Date, numeric for the rest.
data['Date'] = pd.to_datetime(data['Date'], format="%d/%m/%Y %H:%M")

# The file writes numbers with ',' as the decimal mark (e.g. "1396,847852"),
# which is why pandas loaded these columns as text. Replace the comma with '.'
# before parsing -- deleting it would silently inflate each value by a
# different power of ten depending on how many decimals it happens to have.
numeric_columns = ['Wind Production PZ_1', 'Wind Production PZ_2']
for i in range(1, 13):
    # Each station contributes a wind-speed (WND) and a wind-direction (DD) column.
    numeric_columns += [f'Station{i}_WND', f'Station{i}_DD']

for column in numeric_columns:
    # astype(str) keeps this safe for columns already parsed as numbers (and
    # makes the cell re-runnable); each column is converted from ITSELF, so
    # wind speeds are no longer overwritten with wind directions.
    as_text = data[column].astype(str).str.replace(',', '.', regex=False)
    # Anything that still is not a number becomes NaN and is handled by the
    # null check that follows.
    data[column] = pd.to_numeric(as_text, errors='coerce')


In [62]:
# Count missing values per column *before* dropping anything, so the figures
# rendered by this cell are the ones that justify the drop.
null_counts = data.isnull().sum()

# The number of null values is low, so drop the rows that contain them.
data = data.dropna()

# Jupyter only renders the last expression of a cell; the counts are captured
# in a variable above because evaluating them mid-cell would display nothing.
null_counts


Date                    0
Wind Production PZ_1    0
Wind Production PZ_2    0
Station1_WND            0
Station1_DD             0
Station2_WND            0
Station2_DD             0
Station3_WND            0
Station3_DD             0
Station4_WND            0
Station4_DD             0
Station5_WND            0
Station5_DD             0
Station6_WND            0
Station6_DD             0
Station7_WND            0
Station7_DD             0
Station8_WND            0
Station8_DD             0
Station9_WND            0
Station9_DD             0
Station10_WND           0
Station10_DD            0
Station11_WND           0
Station11_DD            0
Station12_WND           0
Station12_DD            0
dtype: int64

Now that the data has been cleaned, it is time to explore and visualise it.