# Energy and water use anlysis


## Setup

### Import libraries


In [1]:
from pathlib import Path

import cartopy.crs as ccrs
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.lines import Line2D

from functions.data_etl.file_io import read_gdf_from_csv
from functions.data_etl.geocoding import country_alpha3_to_alpha2
from functions.data_etl.imputation import PowerCapacityScenario
from functions.energy_and_water_use.climate_zones import read_koppen_tif
from functions.energy_and_water_use.direct_energy_water_use import (
    assign_pue_wue,
    assign_scenarios,
)
from functions.energy_and_water_use.indirect_water_use import (
    assign_multi_source_grid_zones,
    assign_water_use_to_power_plants,
    combine_dcs_and_pps,
    get_power_grid_stats,
    replace_zones_with_nearest,
    results_average_wue_pue,
    results_summary,
)
from functions.energy_and_water_use.regression_analysis import (
    mixed_effects_model_analysis,
    plot_col_and_log_transform_histograms,
    polynomial_regression_analysis,
    predict_white_space_from_total_space,
)


## Setting input and output paths


In [2]:
# Input paths
IMPUTED_DATA_CENTERS_INPUT_PREFIX = Path("data/outputs/1_data_etl/data_centers_impute_")
INPUTS_DIR = Path("data/inputs/2_energy_and_water_use")

# Output paths
OUTPUT_DIR = Path("data/outputs/2_energy_and_water_use/")
FIGURE_DIR = Path("data/outputs/figures/")

## Scaling specifications

In order to calculate energy and water use, we need the estimate of gross power for each data center. 65% of data centers have critical power provided. For the rest, this needs to be scaled from total area or white area. Hence we investigate the relationship between the two.

First, the variables are checked for normal distribution


In [3]:
# Set power capacity scenarios (minimum, maximum, and average)
power_scenarios = tuple(scenario.value for scenario in PowerCapacityScenario)
imputation_scenarios = (*power_scenarios, "baseline")  # Add baseline scenario without imputation

# Data center specifications under different power capacity scenarios, including imputation baseline
data_centers_imputation_scenarios = {
    scenario: pd.read_csv(f"{IMPUTED_DATA_CENTERS_INPUT_PREFIX}{scenario}.csv") for scenario in imputation_scenarios
}

In [4]:
# Plot histograms and log transformed histograms of critical power, total space, and white space
for column, color in [("critical_power_mw", "red"), ("total_space_m2", "blue"), ("white_space_m2", "lightblue")]:
    plot_col_and_log_transform_histograms(data_centers_imputation_scenarios["baseline"], column, color, FIGURE_DIR)

All variables are highly skewed. When log transformed, the variables appear more normally distributed.

In order to predict critical power, it is easier to have a variable with non missing values for all cases predicted (rather than a mix of missing and not missing between white and total space). Therefore, it may be best to first estimate white space from total space for the 13% which are missing it.


In [None]:
# Scatterplot of log total space vs log white space
plt.scatter(
    np.log1p(data_centers_imputation_scenarios["baseline"]["total_space_m2"]),
    np.log1p(data_centers_imputation_scenarios["baseline"]["white_space_m2"]),
    color="blue",
)
plt.xlabel("log Total space (m²)")
plt.ylabel("log White space (m²)")
plt.title("log Total Space vs log White space")
plt.savefig(f"{FIGURE_DIR}/scatterplot_log_total_space_vs_log_white_space.png")
plt.close()

It appears that there is a generally linear relationship between the two. Hence, we first use linear regression to predict missing white space values when gross power is missing and only total space is present.


In [7]:
# Predicting white space from total space for different power scenarios
for scenario in imputation_scenarios:
    data_centers_imputation_scenarios[scenario] = predict_white_space_from_total_space(
        data_centers_imputation_scenarios["baseline"], data_centers_imputation_scenarios[scenario]
    )

INFO:root:Fit: log white_space_m2 = 0.8787 * log total_space_m2 + 0.1127
INFO:functions.energy_and_water_use.regression_analysis:R2: 0.7444
INFO:root:Fit: log white_space_m2 = 0.8787 * log total_space_m2 + 0.1127
INFO:functions.energy_and_water_use.regression_analysis:R2: 0.7444
INFO:root:Fit: log white_space_m2 = 0.8787 * log total_space_m2 + 0.1127
INFO:functions.energy_and_water_use.regression_analysis:R2: 0.7444
INFO:root:Fit: log white_space_m2 = 0.8787 * log total_space_m2 + 0.1127
INFO:functions.energy_and_water_use.regression_analysis:R2: 0.7444


### Polynomial regression

There are multiple ways to predict critical power from area. First, polynomial regression is tested.

Polynomial regression with degree 2 is used to fill in the 34% of missing gross power. This is done with the input of white space. Other variables are excluded to prevent over fitting.


In [8]:
critical_power_predicted_polynomial = polynomial_regression_analysis(
    data_centers_imputation_scenarios["avg"],
    "critical_power_mw",
    ["white_space_m2"],
    polynomial_degree=2,
)

INFO:functions.energy_and_water_use.regression_analysis:3-Fold CV - Mean MAE: 0.5384
INFO:functions.energy_and_water_use.regression_analysis:5-Fold CV - Mean MAE: 0.5381
INFO:functions.energy_and_water_use.regression_analysis:R² Score: 0.6122
INFO:functions.energy_and_water_use.regression_analysis:RMSE: 0.7166


### Mixed-effect modeling

Mixed-effect modeling is used with the random effect of company. This produces the highest r2 and best overall fit. Therefore, this method is used.

First, we find the percent of the data where the company or country is not in the training set.


In [9]:
# Define the training data set
training_data = data_centers_imputation_scenarios["avg"].dropna(subset=["white_space_m2", "critical_power_mw"])

# Find the data which needs to be predicted
data_centers_to_predict = data_centers_imputation_scenarios["avg"][
    data_centers_imputation_scenarios["avg"]["critical_power_mw"].isna()
]

to_predict = data_centers_to_predict["company"].unique()
training = training_data["company"].unique()
not_in_training = set(to_predict) - set(training)

# Print the percent of companies and countries in the prediction set that are not in the training set
entities = ["company", "ISO_A3"]
entity_names = ["company", "country"]

for entity, entity_name in zip(entities, entity_names, strict=False):
    to_predict = data_centers_to_predict[entity].unique()
    training = training_data[entity].unique()
    not_in_training = set(to_predict) - set(training)

    # Print the percent of data points in the set to predict whose entity is not in the training set
    percent_data_points = data_centers_to_predict[entity].isin(not_in_training).mean() * 100
    print(
        f"{percent_data_points:.2f}% of data points in the prediction set have a {entity_name} not in the training set"
    )


nan% of data points in the prediction set have a company not in the training set
nan% of data points in the prediction set have a country not in the training set


In [10]:
# Fit a mixed-effect model for each power scenario
for scenario in power_scenarios:
    data_centers_imputation_scenarios[scenario] = mixed_effects_model_analysis(
        data_centers_imputation_scenarios[scenario],
        input_cols="white_space_m2",
        output_cols="critical_power_mw",
        categorical_col="company",
        display_results=False,
    )

INFO:functions.energy_and_water_use.regression_analysis:                 Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Q("log_critical_power_mw")
No. Observations: 2071    Method:             REML                      
No. Groups:       605     Scale:              0.4004                    
Min. group size:  1       Log-Likelihood:     -2264.7540                
Max. group size:  198     Converged:          Yes                       
Mean group size:  3.4                                                   
-------------------------------------------------------------------------
                         Coef.   Std.Err.     z     P>|z|  [0.025  0.975]
-------------------------------------------------------------------------
Intercept                -2.384     0.097  -24.701  0.000  -2.573  -2.195
Q("log_white_space_m2")   0.558     0.012   45.761  0.000   0.534   0.582
Group Var                 0.294     0.060                                

INFO:f

In [11]:
# Export to csv to use for full list of data centers without scenarios
data_centers_imputation_scenarios["min"].to_csv(f"{OUTPUT_DIR}/data_centers_no_scenarios.csv", index=False)

### Merging

For scenario modelling it is easiest to have the min and max gross power estimates for the big 5 in one data frame.


In [12]:
# Power scenario column
for scenario in power_scenarios:
    data_centers_imputation_scenarios[scenario]["power_scenario"] = scenario

# Concatenating the dataframes
data_centers = pd.concat(
    [data_centers_imputation_scenarios[scenario] for scenario in power_scenarios], ignore_index=True
)

# Convert to GeoDataFrame
data_centers = gpd.GeoDataFrame(
    data_centers,
    geometry=gpd.points_from_xy(data_centers.longitude, data_centers.latitude),
    crs="EPSG:4326",
)

### Categorizing data center size

In line with Lei & Masanet (2022), we categorize data centers small, medium and large by total space.

- Small: 0-100 m<sup>2</sup>
- Medium: 100-2000 m<sup>2</sup>
- Large: >2000 m<sup>2</sup>

These categories must first also be equivalated to power using the mixed-effect model. By observing the data, this translates to:

- Small: 0-1 MW
- Medium: 1-4 MW
- Large: >4 MW


In [13]:
# Add a new category for size of data center based on power
data_centers["size"] = pd.cut(
    data_centers["critical_power_mw"],
    bins=[0, 1, 4, float("inf")],
    labels=["small", "medium", "large"],
)

In [14]:
# Find the total critical power for operational data centers for each power scenario
for is_operational in [True, False]:
    for scenario in power_scenarios:
        total_critical_power = data_centers[
            (data_centers["operational"] == is_operational) & (data_centers["power_scenario"] == scenario)
        ]["critical_power_mw"].sum()
        status_str = "operational" if is_operational else "planned"
        print(
            f"{scenario.capitalize()} total critical power for {status_str} data centers: "
            f"{total_critical_power / 1000:.2f} GW"
        )

Min total critical power for operational data centers: 61.20 GW
Max total critical power for operational data centers: 64.06 GW
Avg total critical power for operational data centers: 63.08 GW
Min total critical power for planned data centers: 40.34 GW
Max total critical power for planned data centers: 41.78 GW
Avg total critical power for planned data centers: 41.93 GW


## Assigning ASHRAE climate zones


Data on climate zones comes from Beck et al. (2023), where Koppen climate classifications are provided with a 0.5 degree resolution. While PUE and WUE values are given relative to ASHRAE climate zones, no global tif/shapefiles are available for this classification system, which was originally created for building design in the U.S. Instead, ASHRAE climate zones are mapped onto the Koppen classification system using [Kudacity cities by Koppen classification](https://www.kudacity.com/cset/by_climate) and [ASHRAE cities by climate zone](https://www.ashrae.org/file%20library/technical%20resources/standards%20and%20guidelines/standards%20addenda/169_2020_a_20211029.pdf), in addition to the [ASHRAE descriptions](https://help.iesve.com/ve2021/ashrae_climate_zones.htm).

Koppen climate zones from 1991-2020 weather data are used for the analysis.


In [15]:
# Polygonize 1991-2020 climate zones
koppen_1991_2020 = read_koppen_tif(input_raster_path=INPUTS_DIR / "koppen_geiger_tif/1991_2020/koppen_geiger_0p5.tif")

Next, koppen classifications were matched to ASHRAE. Multiple koppen zones fall under one ASHRAE zone, so first we create a dictionary to map one to another.


In [19]:
# Mapping each Köppen classification to the corresponding ASHRAE zone
koppen_to_ashrae = {
    1: "1A",
    2: "1A",
    3: "1A",
    4: "2B",
    5: "3B",
    6: "2A",
    7: "5B",
    8: "3A",
    9: "3C",
    10: "4C",
    11: "2A",
    12: "3A",
    13: "6B",
    14: "3A",
    15: "4A",
    16: "6B",
    17: "4B",
    18: "6B",
    19: "8",
    20: "8",
    21: "6A",
    22: "5A",
    23: "8",
    24: "8",
    25: "5A",
    26: "7",
    27: "8",
    28: "8",
    29: "8",
    30: "8",
}

# Map Köppen classifications to ASHRAE zones
koppen_1991_2020["ashrae_zone"] = koppen_1991_2020["classification"].map(koppen_to_ashrae)

# Dissolve the polygons by ASHRAE zone
koppen_1991_2020 = koppen_1991_2020.dissolve(by="ashrae_zone", as_index=False)

Plotting the mapped ASHRAE zones

In [20]:
# Create a mapping of zones to colors
unique_zones = sorted(koppen_1991_2020["ashrae_zone"].unique())
color_map = plt.cm.get_cmap("tab20", len(unique_zones))
zone_colors = {zone: color_map(i) for i, zone in enumerate(unique_zones)}

# Plot the map
fig, ax = plt.subplots(figsize=(12, 8), subplot_kw={"projection": ccrs.Robinson()})

# Add each zone to the plot with its corresponding color
for zone, color in zone_colors.items():
    koppen_1991_2020[koppen_1991_2020["ashrae_zone"] == zone].plot(
        ax=ax, transform=ccrs.PlateCarree(), color=color, label=f"ASHRAE_zone {zone}"
    )

# Add a legend outside the plot
legend_handles = [
    Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        markerfacecolor=color_map(i),
        markersize=10,
        label=f"Zone {zone}",
    )
    for i, zone in enumerate(unique_zones)
]
ax.legend(
    handles=legend_handles,
    title="ASHRAE Climate Zones",
    loc="center left",
    bbox_to_anchor=(1, 0.5),
)

ax.set_axis_off()

# Save the plot
plt.savefig(f"{FIGURE_DIR}/ashrae_climate_zones.png", bbox_inches="tight")

plt.close()

  color_map = plt.cm.get_cmap("tab20", len(unique_zones))


In [21]:
# Perform spatial join between data centers and climate zones without creating 'index_right' column
data_centers = gpd.sjoin(
    data_centers,
    koppen_1991_2020[["ashrae_zone", "geometry"]],
    how="left",
    predicate="within",
)

## Direct energy and water use


### Matching PUE and WUE values to data centers

The WUE and PUE values are taken from Lei & Masanet (2022), which provide best and worst WUE and PUE values under a range of climates and cooling technologies cases. A 50th quantile, medium performance scenario was added taking the average of best and worst.


In [16]:
# Load in PUE & WUE data
PUE_WUE_scenarios = pd.read_csv(f"{INPUTS_DIR}/PUE_WUE_scenarios.csv")

In [17]:
# Drop the reported PUE and energy consumption columns as they are mostly missing and the model will overwrite them
data_centers = data_centers.drop(columns=["Annual electricity consumption (GWh)", "PUE"])

The WUE values from Lei & Masanet (2022) pertain to water consumption of data centers. We want water withdrawal for our analysis in order to incorporate into water scarcity indicators. Hence, we multiply all WUE values by 1.3, a relative ratio between withdrawal and consumption based on Li et al. (2025).


In [22]:
# Assign PUE and WUE values to each data center
data_centers = assign_pue_wue(
    data_centers,
    PUE_WUE_scenarios,
    size_to_case_mapping={  # Mapping of data center sizes to technology cases
        "large": [1, 2],
        "medium": list(range(3, 8)),
        "small": list(range(8, 11)),
    },
    tech_perf_level_to_quantile_mapping={  # Performance levels and their corresponding quantiles
        "best": 0,
        "medium": 50,
        "worst": 100,
    },
    conversion_factor_consumption_to_withdrawal=1.3,
)

### Scenarios

5 different scenarios of cooling technology used are defined. The most common scenario, representing the most common technology currently in place, uses Waterside economizer + (water-cooled chiller) in large data centers and Airside economizer + (water-cooled chiller) in medium data centers.

Here are the scenarios:

| Scenario               | Case Large-Scale | Technology Large-Scale                                          | Case Medium-Scale | Technology Medium-Scale                       | Case Small-Scale | Technology Small-Scale  |
| ---------------------- | ---------------- | --------------------------------------------------------------- | ----------------- | --------------------------------------------- | ---------------- | ----------------------- |
| Most common | 2                | Waterside economizer + (water-cooled chiller)                   | 3                 | Airside economizer + (water-cooled chiller)   | 10               | Direct expansion system |
| Optimized energy use   | 2                | Waterside economizer + (water-cooled chiller)                   | 4                 | Waterside economizer + (water-cooled chiller) | 8                | Water-cooled chiller    |
| Optimized water use    | 1                | Airside economizer + adiabatic cooling + (water-cooled chiller) | 6                 | Airside economizer + (air-cooled chiller)     | 9                | Air-cooled chiller      |
| Intensive energy use   | 1                | Airside economizer + adiabatic cooling + (water-cooled chiller) | 7                 | Air-cooled chiller                            | 10               | Direct expansion system |
| Intensive water use    | 2                | Waterside economizer + (water-cooled chiller)                   | 5                 | Water-cooled chiller                          | 8                | Water-cooled chiller    |


In [23]:
# Assign scenarios
data_centers = assign_scenarios(
    data_centers,
    scenario_mappings={
        "large": {
            1: ["optimized_water_use", "intensive_energy_use"],
            2: ["most_common", "optimized_energy_use", "intensive_water_use"],
        },
        "medium": {
            3: ["most_common"],
            4: ["optimized_energy_use"],
            5: ["intensive_water_use"],
            6: ["optimized_water_use"],
            7: ["intensive_energy_use"],
        },
        "small": {
            9: ["optimized_water_use"],
            8: ["optimized_energy_use", "intensive_water_use"],
            10: ["most_common", "intensive_energy_use"],
        },
    },
)

### Calculating annual direct energy and water consumption

Gross power represents the data center power capacity, or the IT load. However, this does not account alone for additional energy used (e.g. lighting, operations, or inefficiencies). Therefore, this can be multiplied by the power use effectiveness (PUE) to obtain annual energy consumption, also accounting for the hours in a year, as seen in the equation below.

$$
DC\_E_{total} (MWh) = Power (MW) \times PUE \left(\frac{MWh}{MWh}\right) \times 8760 \left(\frac{hours}{year}\right)
$$

For the annual water consumption, the water use efficiency (WUE) represents the water used in litres per kWh of electricity consumed by IT equipment (or gross power). This results in the following equation:

$$
DC\_W_{direct} (m^3) = Power (MW) \times WUE \left(\frac{m^3}{MWh}\right) \times 8760 \left(\frac{hours}{year}\right)
$$


In [24]:
# Calculate annual electricity use (MWh) and direct water use (m3)
data_centers["annual_electricity_use_MWh"] = data_centers["PUE"] * data_centers["critical_power_mw"] * 8760  # Hours/yr
data_centers["annual_direct_water_use_m3"] = data_centers["WUE_withdrawal"] * data_centers["critical_power_mw"] * 8760

In [25]:
# Export the results to a csv file
data_centers.to_csv(f"{OUTPUT_DIR}/data_centers_direct_impacts.csv", index=False)

In [26]:
data_centers = pd.read_csv(f"{OUTPUT_DIR}/data_centers_direct_impacts.csv")

  data_centers = pd.read_csv(f"{OUTPUT_DIR}/data_centers_direct_impacts.csv")


## Indirect water use


### Power plants


Data is from the WRI global power plant database version 1.30 (Byers et al., 2018; download [here](https://datasets.wri.org/datasets/global-power-plant-database?))


In [27]:
# Read in power plant data
power_plants_gdf = read_gdf_from_csv(INPUTS_DIR / "globalpowerplantdatabasev130/global_power_plant_database.csv")
data_centers_no_scenarios = read_gdf_from_csv(OUTPUT_DIR / "data_centers_no_scenarios.csv")


### Power grids

Assigning power grid zones to data centers and power plants. We primarily used Electricity Maps electricity grid boundaries. For areas without an assigned Electricity Maps grid, we used Ecoinvent grids (Ecoinvent Electricity Networks, 2020). For the remaining few locations, we assigned data centers to their country grids.

geogjson link: <https://github.com/electricitymaps/electricitymaps-contrib/blob/master/web/geo/world.geojson>


In [28]:
# Load electricity maps geojson and ecoinvent grids shapefile
electricity_maps_gdf = gpd.read_file(INPUTS_DIR / "electricity_maps.geojson")
ecoinvent_electricity_grids_gdf = gpd.read_file(INPUTS_DIR / "ecoinvent_electricity_networks/electricity.shp")

In [29]:
# Assign electricity maps zones to power plants and data centers
power_plants_with_zone, data_centers_with_zone = (
    assign_multi_source_grid_zones(gdf, electricity_maps_gdf, ecoinvent_electricity_grids_gdf)
    for gdf in (power_plants_gdf, data_centers_no_scenarios)
)


In [30]:
# For power plants and data centers still without a zone, assign it to the ISO2 code of the country
power_plants_with_zone["power_grid_zone"] = power_plants_with_zone["power_grid_zone"].fillna(
    power_plants_with_zone["country"].apply(country_alpha3_to_alpha2)  # Power plant data country column is ISO3
)

data_centers_with_zone["power_grid_zone"] = data_centers_with_zone["power_grid_zone"].fillna(
    data_centers_with_zone["ISO_A3"].apply(country_alpha3_to_alpha2)
)

# Assign remaining data centers without a grid to the nearest grid
data_centers_with_zone = replace_zones_with_nearest(data_centers_with_zone, power_plants_with_zone)

INFO:functions.energy_and_water_use.indirect_water_use:Missing zones: ['CW', 'IM', 'NC', 'MC', 'MT', 'LI', 'GG', 'JE', 'PF']
INFO:functions.energy_and_water_use.indirect_water_use:Number of data centers in the missing zones: 12


### Assigning water intensities per grid

Global median water use intensities in (litres/MWh) from [Jin et al., (2019)](https://doi.org/10.1016/j.rser.2019.109391) and ecoinvent were collected.


In [31]:
# Read water intensity data
wi_column_map = {
    "Fuel type (dataset)": "primary_fuel",
    "median blue water withdrawal of operation (m3/MWh)": "water_intensity_m3/MWh",
}

water_intensity = pd.read_csv(
    INPUTS_DIR / "glob_median_water_intensity_e_prod.csv",
    usecols=list(wi_column_map.keys()),
).rename(columns=wi_column_map)

In [32]:
# Assign water intensities to power plants
power_plants_with_zone = power_plants_with_zone.merge(water_intensity, on="primary_fuel", how="left")

# Calculate the water intensity per power grid
power_grid_summary = get_power_grid_stats(power_plants_with_zone, data_centers_with_zone)

In [33]:
# Merge the full data centers dataframe with the data centers with zones
data_centers_with_zone = data_centers.merge(
    data_centers_with_zone[["company", "name", "address", "power_grid_zone"]],
    on=["company", "name", "address"],
    how="left",
)

# Assign water intensities to data centers based on the power grids they are connected to
data_centers_with_zone = data_centers_with_zone.merge(
    power_grid_summary[["power_grid_zone", "water_intensity_m3/MWh"]],
    on="power_grid_zone",
    how="left",
).rename(columns={"water_intensity_m3/MWh": "grid_water_intensity_m3/MWh"})

In [34]:
# Calculate indirect and total water use for each data center
data_centers_with_zone["indirect_water_use_m3"] = (
    data_centers_with_zone["annual_electricity_use_MWh"] * data_centers_with_zone["grid_water_intensity_m3/MWh"]
)
data_centers_with_zone["total_water_use_m3"] = (
    data_centers_with_zone["annual_direct_water_use_m3"] + data_centers_with_zone["indirect_water_use_m3"]
)

# Export data centers with water use for all scenarios
data_centers_with_zone.to_csv(f"{OUTPUT_DIR}/data_centers_total_water_use.csv", index=False)

In [38]:
# Create a summary of the results with the direct, indirect, and total water use for each scenario
water_electricity_use_results = results_summary(data_centers_with_zone)

# Export to csv
water_electricity_use_results.to_csv(f"{OUTPUT_DIR}/data_centers_water_electricity_use_summary.csv")

In [37]:
# Find the average data center WUE and PUE based on characteristics such as size, climate zone, and power grid zone
average_wue_pue = results_average_wue_pue(data_centers_with_zone)

# Export to csv
average_wue_pue.to_csv(f"{OUTPUT_DIR}/data_centers_average_PUE_WUE_summary.csv")

### Baseline scenario for further analysis

The baseline scenario is used for water scarcity modeling, including the baseling cooling mix, average power capacity scenario, and medium technological performance.


In [None]:
# To reduce computational load, set a baseline scenario
# This uses power_scenario of 'avg', tech_performance of 'medium', and cooling_tech_scenario of 'baseline'
data_centers_baseline = data_centers_with_zone[
    (data_centers_with_zone["power_scenario"] == "avg")
    & (data_centers_with_zone["tech_performance"] == "medium")
    & (data_centers_with_zone["cooling_tech_scenario"] == "baseline")
]

# Assign water use to power plants
power_plants_baseline = assign_water_use_to_power_plants(
    data_centers_baseline,
    power_grid_summary,
    power_plants_with_zone,
    consider_op_status=False,
)

# Assign water use to power plants only based on operational data centers
power_plants_operational_planned = assign_water_use_to_power_plants(
    data_centers_with_zone,
    power_grid_summary,
    power_plants_with_zone,
    consider_op_status=True,
)

In [None]:
# Export to csv
power_plants_baseline.to_csv(f"{OUTPUT_DIR}/power_plants_water_use_baseline.csv", index=False)
data_centers_baseline.to_csv(f"{OUTPUT_DIR}/data_centers_water_use_baseline.csv", index=False)

In [None]:
# Create a combined dataframe of data centers and power plants
combined_dcs_pps = combine_dcs_and_pps(
    data_centers_df=data_centers_baseline,
    power_plants_df=power_plants_baseline,
    status="all",
)

# Only consider operational data centers
combined_dcs_pps_operational = combine_dcs_and_pps(
    data_centers_df=data_centers_baseline,
    power_plants_df=power_plants_operational_planned,
    status="operational",
)

# Only consider planned data centers
combined_dcs_pps_planned = combine_dcs_and_pps(
    data_centers_df=data_centers_baseline,
    power_plants_df=power_plants_operational_planned,
    status="planned",
)

In [None]:
# Export to csv
combined_dcs_pps.to_csv(f"{OUTPUT_DIR}/water_use_dcs_pps_baseline.csv", index=False)
combined_dcs_pps_operational.to_csv(f"{OUTPUT_DIR}/water_use_dcs_pps_operational_baseline.csv", index=False)
combined_dcs_pps_planned.to_csv(f"{OUTPUT_DIR}/water_use_dcs_pps_planned_baseline.csv", index=False)

In [None]:
# Calculate on average what percent of water use is direct vs indirect
data_centers_baseline["annual_direct_water_use_m3"].sum() / data_centers_baseline["total_water_use_m3"].sum()

30% of water withdrawal occurs for on-site purposes. The rest for electricity generation.
