# EDA - Data Preprocessing

In [1]:
# import the necessary libraries 
import duckdb
import pandas as pd

In [9]:
# Load the raw CSV into a DuckDB table

# Open the on-disk database; DuckDB creates 'my_database.duckdb' when the file is missing
conn = duckdb.connect('../Instance/my_database.duckdb')

# IF NOT EXISTS makes this cell safe to re-run: the CSV is only ingested
# the first time, and an existing `water_data` table is left untouched.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS water_data AS 
    SELECT * FROM read_csv_auto('../Data/global_water_consumption.csv')
    """
conn.execute(create_table_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x27cd6cbe8f0>

In [None]:
# Preview the first 5 rows of the raw table as a pandas DataFrame
preview_sql = "SELECT * FROM water_data LIMIT 5"
conn.sql(preview_sql).fetchdf()


Unnamed: 0,Country,Year,Total Water Consumption (Billion Cubic Meters),Per Capita Water Use (Liters per Day),Water Scarcity Level,Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (Annual Precipitation in mm),Groundwater Depletion Rate (%)
0,Indonesia,2022,895.15,489.73,Low,20.78,13.75,34.99,1075.28,3.1
1,Indonesia,2024,502.89,311.95,High,48.51,8.44,32.88,2630.69,1.78
2,Spain,2000,843.39,440.09,Medium,25.16,31.7,34.62,2860.62,4.13
3,Canada,2021,803.34,478.98,High,45.74,6.13,18.99,1725.5,0.61
4,Brazil,2022,416.4,353.91,High,26.58,7.95,31.11,988.44,0.8


The column names are quite long, so they need to be shortened for easier use.

In [11]:
# Alias every verbose source column to a short snake_case name.
# `query` is kept as a standalone string so that later cells can re-run it.
query = """
    SELECT 
        Country AS country,
        Year AS year,
        "Total Water Consumption (Billion Cubic Meters)" AS total_water_consumption_bcm,
        "Per Capita Water Use (Liters per Day)" AS per_capita_water_use_lpd,
        "Water Scarcity Level" AS water_scarcity_level,
        "Agricultural Water Use (%)" AS agricultural_water_use_pct,
        "Industrial Water Use (%)" AS industrial_water_use_pct,
        "Household Water Use (%)" AS household_water_use_pct,
        "Rainfall Impact (Annual Precipitation in mm)" AS rainfall_impact_mm,
        "Groundwater Depletion Rate (%)" AS groundwater_depletion_rate_pct
    FROM water_data
"""

# Run the rename query in DuckDB, then pull the result into pandas
renamed_rel = conn.sql(query=query)
water_df = renamed_rel.fetchdf()
water_df.head()

Unnamed: 0,country,year,total_water_consumption_bcm,per_capita_water_use_lpd,water_scarcity_level,agricultural_water_use_pct,industrial_water_use_pct,household_water_use_pct,rainfall_impact_mm,groundwater_depletion_rate_pct
0,Indonesia,2022,895.15,489.73,Low,20.78,13.75,34.99,1075.28,3.1
1,Indonesia,2024,502.89,311.95,High,48.51,8.44,32.88,2630.69,1.78
2,Spain,2000,843.39,440.09,Medium,25.16,31.7,34.62,2860.62,4.13
3,Canada,2021,803.34,478.98,High,45.74,6.13,18.99,1725.5,0.61
4,Brazil,2022,416.4,353.91,High,26.58,7.95,31.11,988.44,0.8


In [12]:
# Summarise the renamed frame: column names, dtypes, non-null counts and memory usage
water_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country                         5000 non-null   object 
 1   year                            5000 non-null   int64  
 2   total_water_consumption_bcm     5000 non-null   float64
 3   per_capita_water_use_lpd        5000 non-null   float64
 4   water_scarcity_level            5000 non-null   object 
 5   agricultural_water_use_pct      5000 non-null   float64
 6   industrial_water_use_pct        5000 non-null   float64
 7   household_water_use_pct         5000 non-null   float64
 8   rainfall_impact_mm              5000 non-null   float64
 9   groundwater_depletion_rate_pct  5000 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 390.8+ KB


There are no null values in the dataset.

In [14]:
# Persist the renamed dataset: re-run the rename query in DuckDB and
# write its result straight to CSV.
export_relation = conn.sql(query=query)
export_relation.write_csv('../Data/renamed_global_water_consumption.csv')