In [1]:
import pandas as pd

In [2]:
# Load the three raw plant extracts (wind, gas, gas_fr) plus the database file.
# The generator is consumed in order, so the files are read in the order listed.
df_wind, df_gas, df_gas_fr = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/raw/wind_plants.csv",
        "data/raw/gas_plants.csv",
        "data/raw/gas_fr_plants.csv",
    )
)
df_db = pd.read_csv("data/db/database.csv")

In [3]:
# Show column dtypes and non-null counts for the raw wind extract.
df_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        957 non-null    object 
 1   Country     957 non-null    object 
 2   Technology  957 non-null    object 
 3   SiteName    957 non-null    object 
 4   Volume      957 non-null    float64
dtypes: float64(1), object(4)
memory usage: 37.5+ KB


In [4]:
# Show column dtypes and non-null counts for the raw gas extract.
df_gas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Date        962 non-null    object
 1   Country     962 non-null    object
 2   Technology  962 non-null    object
 3   SiteName    962 non-null    object
 4   Volume      962 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 37.7+ KB


In [5]:
# Show column dtypes and non-null counts for the raw gas_fr extract.
df_gas_fr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        481 non-null    object 
 1   Country     481 non-null    object 
 2   Technology  481 non-null    object 
 3   SiteName    481 non-null    object 
 4   Volume      475 non-null    float64
dtypes: float64(1), object(4)
memory usage: 37.7+ KB


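Understand the data - each file has 5 columns:
- 3 categorical dimension columns (Country, Technology, SiteName)
- 1 date dimension column (Date)
- 1 measure/fact column (Volume)

- Date is stored as an object (string), not a datetime
- Volume is a float (Wind, Gas_fr) or an int (Gas)
- Gas and Gas_fr have 962 rows and Wind has 957 rows (Gas_fr is only half populated: 481 non-null values per dimension column and 475 non-null Volume values)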

In [6]:
# Preview the first row of the raw gas extract to see how each column is formatted.
df_gas.head(1)

Unnamed: 0,Date,Country,Technology,SiteName,Volume
0,01/01/2024,GB,Gas,Pembroke-1,6570


In [7]:
# Preview the first row of the raw gas_fr extract to see how each column is formatted.
df_gas_fr.head(1)

Unnamed: 0,Date,Country,Technology,SiteName,Volume
0,01/01/2024,FR,Gas,Blenod-5,6753.0


In [8]:
# Preview the first row of the raw wind extract to see how each column is formatted.
df_wind.head(1)

Unnamed: 0,Date,Country,Technology,SiteName,Volume
0,01/01/2024,GB,Wind,Hornsea-1,260.166079


In [9]:
# Per-column null counts for each raw extract, printed in the order gas, gas_fr, wind.
print(
    *(frame.isna().sum() for frame in (df_gas, df_gas_fr, df_wind)),
    sep="\n",
)


Date          0
Country       0
Technology    0
SiteName      0
Volume        0
dtype: int64
Date          481
Country       481
Technology    481
SiteName      481
Volume        487
dtype: int64
Date          0
Country       0
Technology    0
SiteName      0
Volume        0
dtype: int64


In [10]:
def clean_plant_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one raw plant extract.

    The same steps are applied to every extract so the files cannot drift
    apart in how they are treated:

    1. Strip whitespace from the column names. A trailing space was found on
       the Country column; it is applied to all columns because the header
       quality of future incoming files cannot be predicted.
    2. Parse ``Date`` as a day-first date. Values that cannot be parsed become
       ``NaT`` (``errors="coerce"``) and are therefore removed in step 4.
    3. Cast ``Volume`` to float and replace missing values with 0 (as requested).
    4. Drop rows where any dimension (``Date``, ``Country``, ``Technology``,
       ``SiteName``) is missing.
    5. Drop exact duplicate rows (not requested, but appears sensible).

    Only column *names* are stripped; cell values are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Raw extract with Date, Country, Technology, SiteName and Volume
        columns (names may carry surrounding whitespace).

    Returns
    -------
    pd.DataFrame
        A new, cleaned frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If ``Date`` or ``Volume`` (or any dimension column) is absent after
        the column names have been stripped.
    ValueError
        If ``Volume`` holds values that cannot be cast to float.
    """
    dimension_cols = ["Date", "Country", "Technology", "SiteName"]

    # Work on a copy so the caller's frame is left as loaded.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()
    cleaned["Date"] = pd.to_datetime(cleaned["Date"], dayfirst=True, errors="coerce")
    cleaned["Volume"] = cleaned["Volume"].astype(float).fillna(0)
    return cleaned.dropna(subset=dimension_cols).drop_duplicates()


df_gas = clean_plant_frame(df_gas)
df_gas_fr = clean_plant_frame(df_gas_fr)
df_wind = clean_plant_frame(df_wind)
# NOTE: the trailing space on the Wind technology value is deliberately left in place
# because the database file has the same trailing space.
# TODO: ask whether it should be removed.

In [11]:
# Confirm the cleaning step left no nulls: per-column null counts for gas, gas_fr, wind.
print(
    *(frame.isna().sum() for frame in (df_gas, df_gas_fr, df_wind)),
    sep="\n",
)

Date          0
Country       0
Technology    0
SiteName      0
Volume        0
dtype: int64
Date          0
Country       0
Technology    0
SiteName      0
Volume        0
dtype: int64
Date          0
Country       0
Technology    0
SiteName      0
Volume        0
dtype: int64
