In [1]:
import pandas as pd

# Read both source CSVs again so that pre-processing starts from the
# initial data rather than from anything modified earlier.
energy_dataset = pd.read_csv('data/energy_dataset.csv')

# The weather file calls its timestamp column `dt_iso`; rename it to `time`
# right after loading so it matches the column name used for the merge.
weather_features = pd.read_csv('data/weather_features.csv').rename(
    columns={'dt_iso': 'time'}
)

In [2]:
# Energy-dataset columns to discard: the drop step treats them as having
# too many missing values to be usable.
missing_columns = [
    # Generation sources
    'generation fossil coal-derived gas', 'generation fossil oil shale',
    'generation fossil peat', 'generation geothermal',
    'generation hydro pumped storage aggregated', 'generation marine',
    'generation wind offshore',
    # Forecast column. NOTE: the 'eday' spelling presumably mirrors the CSV
    # header verbatim (the drop removes all eight columns: 29 -> 21), so do
    # not "correct" it.
    'forecast wind offshore eday ahead',
]

In [3]:
# Discard the columns listed in `missing_columns` (those with a large number
# of missing values), then discard every row that still has a missing value
# in any of the remaining columns. The original frame is left untouched.
energy_dataset_removed = (
    energy_dataset
    .drop(columns=missing_columns)
    .dropna()
)

# Report (rows, columns) before and after to show how much was removed.
print(f'Original data shape: {energy_dataset.shape}')
print(f'Modified data shape: {energy_dataset_removed.shape}')

Original data shape: (35064, 29)
Modified data shape: (35018, 21)


In [4]:
# Attach the weather observations to the cleaned energy rows by timestamp.
# The inner join keeps only timestamps present in both frames.
# NOTE(review): the join is one-to-many; the displayed result repeats each
# energy row once per matching weather row (index runs to 178164 versus
# 35018 energy rows). Confirm this fan-out is what downstream steps expect.
df = energy_dataset_removed.merge(weather_features, how='inner', on='time')

In [5]:
# Preview the merged frame to sanity-check the join; the rendered output is
# truncated to the first and last rows and elides the middle columns.
df

Unnamed: 0,time,generation biomass,generation fossil brown coal/lignite,generation fossil gas,generation fossil hard coal,generation fossil oil,generation hydro pumped storage consumption,generation hydro run-of-river and poundage,generation hydro water reservoir,generation nuclear,...,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,2015-01-01 00:00:00+01:00,447.0,329.0,4844.0,4821.0,162.0,863.0,1051.0,1899.0,7096.0,...,1,62,0.0,0.0,0.0,0,800,clear,sky is clear,01n
1,2015-01-01 00:00:00+01:00,447.0,329.0,4844.0,4821.0,162.0,863.0,1051.0,1899.0,7096.0,...,1,309,0.0,0.0,0.0,0,800,clear,sky is clear,01n
2,2015-01-01 00:00:00+01:00,447.0,329.0,4844.0,4821.0,162.0,863.0,1051.0,1899.0,7096.0,...,0,226,0.0,0.0,0.0,0,800,clear,sky is clear,01
3,2015-01-01 00:00:00+01:00,447.0,329.0,4844.0,4821.0,162.0,863.0,1051.0,1899.0,7096.0,...,7,58,0.0,0.0,0.0,0,800,clear,sky is clear,01n
4,2015-01-01 00:00:00+01:00,447.0,329.0,4844.0,4821.0,162.0,863.0,1051.0,1899.0,7096.0,...,1,21,0.0,0.0,0.0,0,800,clear,sky is clear,01n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178161,2018-12-31 23:00:00+01:00,290.0,0.0,6926.0,2166.0,163.0,108.0,1069.0,1686.0,6075.0,...,2,300,0.0,0.0,0.0,0,800,clear,sky is clear,01n
178162,2018-12-31 23:00:00+01:00,290.0,0.0,6926.0,2166.0,163.0,108.0,1069.0,1686.0,6075.0,...,1,360,0.0,0.0,0.0,0,800,clear,sky is clear,01n
178163,2018-12-31 23:00:00+01:00,290.0,0.0,6926.0,2166.0,163.0,108.0,1069.0,1686.0,6075.0,...,2,100,0.0,0.0,0.0,0,800,clear,sky is clear,01n
178164,2018-12-31 23:00:00+01:00,290.0,0.0,6926.0,2166.0,163.0,108.0,1069.0,1686.0,6075.0,...,5,310,0.0,0.0,0.0,0,800,clear,sky is clear,01n


In [6]:
# Persist the merged frame to disk; `index=False` keeps the row index out
# of the written CSV.
df.to_csv(path_or_buf='data/cleaned_energy_data.csv', index=False)