In [None]:
import re
import math

import altair as alt ##
import numpy as np
import pandas as pd
import plotly.express as px



## Import data from CSVs

In [None]:
# Stage the uploaded workbook and the codecarbon log from DBFS onto the driver's
# local disk; the pandas readers below open them by plain local path.
dbutils.fs.cp("dbfs:/FileStore/shared_uploads/barryjames028@gmail.com/AWS_EC2_Carbon_Footprint_Dataset.xlsx", "file:/databricks/driver/AWS_EC2_Carbon_Footprint_Dataset.xlsx")
dbutils.fs.cp("dbfs:/FileStore/shared_uploads/barryjames028@gmail.com/codecarbon_carbontracking.csv", "file:/databricks/driver/codecarbon_carbontracking.csv")

# Per-instance hardware / power table, keyed by EC2 instance type.
df_ec2 = pd.read_excel(
    "/databricks/driver/AWS_EC2_Carbon_Footprint_Dataset.xlsx",
    sheet_name="EC2 Instances Dataset",
).set_index("Instance type")

In [None]:
# Per-region table from the same workbook, keyed by AWS region name.
df_intensity = pd.read_excel(
    "/databricks/driver/AWS_EC2_Carbon_Footprint_Dataset.xlsx",
    sheet_name="AWS Regions Mix Intensity",
).set_index("Region")

In [None]:
# Stage the Datadog export on the driver, then load it; rows with any missing
# field are discarded at load time.
dbutils.fs.cp("dbfs:/FileStore/shared_uploads/barryjames028@gmail.com/datadog_cpu_memory_usage.csv", "file:/databricks/driver/datadog_cpu_memory_usage.csv")
df_dd = (
    pd.read_csv("/databricks/driver/datadog_cpu_memory_usage.csv")
    .dropna()
)

We built this query in Datadog from metrics reported by [the Datadog agent for Kubernetes](https://docs.datadoghq.com/agent/kubernetes/).

![image.png](attachment:9564d2a1-ea03-43c0-8967-6c60f102bc22.png)

The `from` part of the query only filters for pods that belong to the age pipeline in the production environment. Since `kubernetes.memory.rss` is given in bytes and we need it in GB, we divide by 1073741824 (2^30, the number of bytes in a binary gigabyte). `kubernetes.cpu.usage.total` is given in nanocores, so we divide by 1e9 to get cores. To get the necessary information about the hardware we sum both metrics by AWS EC2 instance type, and to know which part of the pipeline the executing job belongs to we use the pod name.

![image.png](attachment:a37dce2e-8667-4657-af4d-ad08a7e967e2.png)

In [None]:
def to_query_value(query):
    """Classify a Datadog query string by the metric it reads.

    Returns "memory" if the string contains ``memory.rss``, "cpu" if it
    contains ``cpu.usage`` (the memory check is made first), and ``np.nan``
    if it contains neither.
    """
    if "memory.rss" in query:
        return "memory"
    elif "cpu.usage" in query:
        return "cpu"
    else:
        return np.nan


def parse_query(df: pd.DataFrame):
    """
    Extract instance types and pipeline stage names from the query string

    Expects the columns ``query``, ``group`` and ``time``. Each ``group`` value
    is split on ``;`` and ``:``; the second token is used as the instance type
    and the fourth as the pod name.

    Returns a new frame in which ``query`` and ``group`` are replaced by
    ``query_value``, ``instance-type``, ``pod_name`` and ``stage_name``, and
    ``time`` is converted with ``pd.to_datetime``. The input frame is left
    unmodified.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    parsed = df.copy()
    parsed["query_value"] = parsed["query"].apply(to_query_value)
    parsed = parsed.drop(columns="query")

    # Split each group tag once and pick the two fields out of the token list.
    group_tokens = parsed["group"].apply(lambda g: re.split(';|:', g))
    parsed["instance-type"] = group_tokens.apply(lambda tokens: tokens[1])
    parsed["pod_name"] = group_tokens.apply(lambda tokens: tokens[3])

    # Stage = first dash-separated token of the pod name once the common
    # "ate-age-v2-" prefix is removed (literal match, not a regex).
    stage = (parsed["pod_name"]
             .str.replace("ate-age-v2-", "", regex=False)
             .str.split("-")
             .str[0])
    # Every token containing "eventspreprocessed" (e.g. with a colour suffix)
    # is collapsed into that single stage name.
    parsed["stage_name"] = stage.apply(
        lambda s: "eventspreprocessed" if "eventspreprocessed" in s else s)

    # Convert the frame that was passed in, not the notebook-level df_dd.
    parsed["time"] = pd.to_datetime(parsed["time"])

    return parsed.drop(columns=['group'])


In [None]:
# Replace the raw Datadog export with its parsed form. Not re-runnable on its own:
# parse_query reads the "query" and "group" columns, which its result no longer has.
df_dd = parse_query(df_dd)

In [None]:
# TODO: the plotly histograms are not very readable; try seaborn's histplot
# with a different number of bins (e.g. bins=50).

# One panel per metric. The x-axes are decoupled so each panel gets its own
# range and its own unit label.
fig = px.histogram(df_dd, x="value", facet_col="query_value", nbins=200)
fig = fig.update_xaxes(matches=None)

fig.update_xaxes(title_text="GB", row=1, col=1)
fig.update_xaxes(title_text="vCPUs", row=1, col=2)

In [None]:
def value_to_util_frac(df, value, query_value, instance_type):
    """Express an absolute reading as a fraction of the instance's capacity.

    ``df`` is indexed by instance type. "memory" readings are divided by the
    ``Instance Memory (in GB)`` column and "cpu" readings by ``Instance vCPU``.
    Any other ``query_value`` raises ``NotImplementedError``.
    """
    if query_value not in ("memory", "cpu"):
        raise NotImplementedError()

    # CPU alternative considered by the author: "Platform Total Number of vCPU"
    capacity_col = "Instance Memory (in GB)" if query_value == "memory" else "Instance vCPU"
    return value / df[capacity_col][instance_type]


# Normalise every sample by the capacity of the instance type it ran on, then
# discard the absolute reading.
df_dd["util_frac"] = df_dd.apply(
    lambda sample: value_to_util_frac(
        df_ec2, sample["value"], sample["query_value"], sample["instance-type"]),
    axis=1,
)
df_dd = df_dd.drop(columns="value")

In [None]:
# TODO: try seaborn here instead - sns.histplot(..., kde=True) draws a smooth
# density curve without having to choose bins, optionally with sns.rugplot
# overlaid.

# Distribution of utilization fractions, one panel per metric with independent x-axes.
px.histogram(
    df_dd,
    x="util_frac",
    facet_col="query_value",
    nbins=150,
).update_xaxes(matches=None)

In [None]:
def preprocess_dd_data(df: pd.DataFrame):
    """Report on, then drop, the samples whose pod name is the literal "N/A".

    Prints the per-``query_value`` count of such rows and the share of the
    summed ``util_frac`` they account for, and returns the remaining rows.
    """
    is_unnamed = df["pod_name"] == "N/A"
    unnamed_rows = df[is_unnamed]

    na_pods_count = unnamed_rows["query_value"].value_counts()
    print(f"Encountered {na_pods_count.values} pods with N/A name.")

    # NOTE(review): the sums below cover every query_value present in df, not
    # only memory rows, although the message says "memory usage".
    na_pods_util_frac = unnamed_rows["util_frac"].sum() / df["util_frac"].sum()
    print(f"Fraction of memory usage, represented by N/A pods: {na_pods_util_frac}.")

    return df[~is_unnamed]

# Drop the "N/A" pods from the working frame (the helper prints how many there were).
df_dd = preprocess_dd_data(df_dd)

Encountered [157] pods with N/A name.
Fraction of memory usage, represented by N/A pods: 3.4032300630672487e-06.


In [None]:
# Every column except the measured value; reused further down to build the pivot keys.
grouping_cols = df_dd.columns.to_list()
grouping_cols.remove("util_frac")

In [None]:
# Lookup Power from utlization
instances_in_use = df_dd["instance-type"].unique()
df_ec2 = df_ec2.loc[instances_in_use]
df_ec2

Unnamed: 0_level_0,Release Date,Instance vCPU,Platform Total Number of vCPU,Platform CPU Name,Instance Memory (in GB),Platform Memory (in GB),Storage Info (Type and Size in GB),Storage Type,Platform Storage Drive Quantity,Platform GPU Quantity,...,Instance @ 10%,Instance @ 50%,Instance @ 100%,Platform Additional Memory Scope 3 Emissions,Platform Additional Storage Scope 3 Emissions,Platform Additional GPU Scope 3 Emissions,Platform Additional CPU Scope 3 Emissions,Total Platform Scope 3 Emissions (kgCO₂eq),Instance Hourly Manufacturing Emissions (gCO₂eq),Hardware Information on AWS Documentation & Comments
Instance type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m5.4xlarge,November 2017,16.0,96,Xeon Platinum 8175M,64.0,384.0,EBS-Only,EBS,0.0,,...,55.836667,112.966667,159.630833,510.3792,0,0,100,1610.3792,7.766103,Up to 3.1 GHz Intel Xeon Platinum 8175M
r5.4xlarge,July 2018,16.0,96,Xeon Platinum 8175M,128.0,768.0,EBS-Only,EBS,0.0,,...,71.235,152.65,223.599167,1042.9488,0,0,100,2142.9488,10.334437,Up to 3.1 GHz Intel Xeon Platinum 8000 series ...
m4.4xlarge,June 2015,16.0,72,Xeon E5-2686 v4,64.0,256.0,EBS-Only,EBS,0.0,,...,54.28,84.131111,113.762778,332.856,0,0,100,1432.856,9.213323,2.3 GHz Intel Xeon E5-2686 v4 (Broadwell) or 2...
c5d.4xlarge,May 2018,16.0,72,Xeon Platinum 8124M,32.0,192.0,1 x 400 NVMe SSD,SSD,2.0,,...,62.541667,106.231111,143.722778,244.0944,200,0,100,1544.0944,9.928591,Up to 3.5 GHz 2nd generation Intel Xeon Scalab...
r5ad.4xlarge,March 2019,16.0,96,EPYC 7571,128.0,768.0,2 x 300 NVMe SSD,SSD,4.0,,...,72.950697,114.684244,158.036334,1042.9488,400,0,100,2542.9488,12.263449,2.5 GHz AMD EPYC 7000 series processors
m5.8xlarge,June 2019,32.0,96,Xeon Platinum 8175M,128.0,384.0,EBS-Only,EBS,0.0,,...,111.673333,225.933333,319.261667,510.3792,0,0,100,1610.3792,15.532207,Up to 3.1 GHz Intel Xeon Platinum 8175M
m5d.4xlarge,June 2018,16.0,96,Xeon Platinum 8175M,64.0,384.0,2 x 300 NVMe SSD,SSD,4.0,,...,55.836667,112.966667,159.630833,510.3792,400,0,100,2010.3792,9.695116,Up to 3.1 GHz Intel Xeon Platinum 8175M
r5a.4xlarge,November 2018,16.0,96,EPYC 7571,128.0,768.0,EBS-Only,EBS,0.0,,...,72.950697,114.684244,158.036334,1042.9488,0,0,100,2142.9488,10.334437,2.5 GHz AMD EPYC 7000 series processors
g4dn.4xlarge,March 2019,16.0,96,Xeon Platinum 8259CL,64.0,384.0,225.0,SSD,2.0,8.0,...,75.736565,147.295123,202.020234,510.3792,200,1200,100,3010.3792,14.517647,NVIDIA T4 GPU and custom Intel Cascade Lake CPUs
c5.4xlarge,November 2016,16.0,72,Xeon Platinum 8124M,32.0,192.0,EBS-Only,EBS,0.0,,...,62.541667,106.231111,143.722778,244.0944,0,0,100,1344.0944,8.642582,Up to 3.5GHz 2nd generation Intel Xeon Scalabl...


### Interpolation

In [None]:
import scipy.interpolate

# Column-name prefixes of the power lookup columns in the EC2 table
# ("<prefix> @ Idle", "<prefix> @ 10%", "<prefix> @ 50%", "<prefix> @ 100%").
cpu_loolup_col_prefix = "PkgWatt"    ## power consumed by CPUs
ram_loolup_col_prefix = "RAMWatt"
gpu_loolup_col_prefix = "GPUWatt"
# Column read by instace_type_util_to_delta_power.
power_delta_col = "Delta Full Machine"


def util_to_power(df, util_value, util_type, instance_type):
    """Interpolate power from a utilization fraction for one instance type.

    ``df`` is indexed by instance type and holds the four lookup columns for
    the prefix selected by ``util_type`` ("cpu", "memory" or "gpu"). Those
    four values are treated as samples at utilization 0, 0.1, 0.5 and 1 and
    interpolated with ``scipy.interpolate.interp1d``. ``util_value`` may be a
    scalar or an array; the interpolated result is returned in the same shape.

    Raises ``NotImplementedError`` for any other ``util_type``.
    """
    prefix_by_type = {
        "cpu": cpu_loolup_col_prefix,
        "memory": ram_loolup_col_prefix,
        "gpu": gpu_loolup_col_prefix,
    }
    if util_type not in prefix_by_type:
        raise NotImplementedError(f"handling for {util_type} not implemented")
    col_prefix = prefix_by_type[util_type]

    util_points = [0, 0.1, 0.5, 1]   ## utilization ratio
    power_cols = [f"{col_prefix} @ Idle"]
    for frac in util_points[1:]:
        power_cols.append(f"{col_prefix} @ {str(int(frac*100))}%")

    power_points = df.loc[instance_type].loc[power_cols]
    interpolate = scipy.interpolate.interp1d(util_points, power_points)
    return interpolate(util_value)    ## interpolate utilization ratio to the power consumption


def instace_type_util_to_delta_power(df, any_sample_values, instance_type):
    """Map a boolean activity series to the instance type's delta power.

    ``True`` entries become the ``power_delta_col`` value of ``instance_type``
    in ``df``; ``False`` entries become NaN.
    """
    delta_power = df.loc[instance_type].loc[power_delta_col]
    activity_to_power = {True: delta_power, False: np.nan}
    return any_sample_values.map(activity_to_power)


def instace_type_util_to_gpu_power(any_sample_values, instance_type, measured_avg_gpu_power_dict):
    """Map a boolean activity series to a measured average GPU power.

    For instance types listed in the notebook-level ``gpu_instances`` frame,
    ``True`` entries become ``measured_avg_gpu_power_dict[instance_type]``.
    Every other entry, and every entry for other instance types, becomes NaN.
    """
    if instance_type in gpu_instances.index:
        power_when_active = measured_avg_gpu_power_dict[instance_type]
    else:
        power_when_active = np.nan
    return any_sample_values.map({True: power_when_active, False: np.nan})

In [None]:
# Select the columns whose names contain the CPU or RAM lookup prefix.
cols = df_ec2.columns
df_power_plot = df_ec2[cols[cols.str.contains(cpu_loolup_col_prefix) | cols.str.contains(ram_loolup_col_prefix)]]

def colname_to_frac(c):
    """Turn a lookup column name such as "PkgWatt @ 50%" into a fraction (0.5).

    The text after "@ " is read as a percentage; "Idle" counts as 0.
    """
    level_text = c.split("@ ")[1].strip("%")
    if "Idle" in level_text:
        level_text = "0"
    return float(level_text) / 100.0

# Relabel each lookup column as a (utilization fraction, factor) pair, where
# factor is "cpu" for the CPU prefix and "ram" otherwise.
df_power_plot.columns = pd.MultiIndex.from_tuples(
    [
        (colname_to_frac(c),"cpu" if cpu_loolup_col_prefix in c else "ram") for c in df_power_plot.columns
    ],
    names=('util', 'factor'))

# Reshape to long form: one row per (instance type, utilization, factor) with
# the power value, using plot-friendly column names.
df_plot_power = pd.melt(df_power_plot.stack("util").reset_index(level=1),
                        id_vars=["util"],
                        ignore_index=False).reset_index().rename(columns={
    "value": "power [W]",
    "util": "utilization fraction"
})
# TODO: add marker symbols / rework this visualization (possibly in matplotlib).

# One panel per factor, one line per instance type.
px.line(df_plot_power,
        x="utilization fraction",
        y="power [W]",
        facet_col="factor",
        color="Instance type",
        title="Util vs Power per factor for instances in use")

### Reshape dataframe from long to wide format, dimension: time * {pod_name, ...}

In [None]:
def get_subset_for_display(df, subsample_every=(10, 18)):
    """Return a small, evenly spaced sample of ``df`` for display.

    Args:
        df: Frame to sample.
        subsample_every: Two numbers ``(rows, cols)`` giving roughly how many
            rows and columns to keep. The strides are
            ``floor(n_rows / rows)`` and ``floor(n_cols / cols)``.

    Returns:
        Every stride-th row and column of ``df``, minus the rows and columns
        of that sample that contain no truthy value (all NaN and/or zero).
    """
    # Clamp the strides to 1: for frames smaller than the requested sample
    # size the floor is 0, and a slice step of 0 raises ValueError.
    row_step = max(1, int(math.floor(df.shape[0] / subsample_every[0])))
    col_step = max(1, int(math.floor(df.shape[1] / subsample_every[1])))

    df_ss = df.iloc[0::row_step, 0::col_step]
    df_disp = df_ss.loc[df_ss.any(axis=1), :]
    df_disp = df_disp.loc[:, df_disp.any(axis=0)]

    return df_disp

In [None]:
# Long -> wide: one row per timestamp, one column per combination of the
# remaining grouping columns.
pivot_cols = list(grouping_cols)
pivot_cols.remove("time")
df_util = df_dd.pivot(index="time", columns=pivot_cols, values=["util_frac"])
get_subset_for_display(df_util)

Unnamed: 0_level_0,util_frac,util_frac,util_frac,util_frac,util_frac
query_value,memory,memory,memory,cpu,cpu
instance-type,m5.4xlarge,m5d.4xlarge,m5.8xlarge,m5.4xlarge,m5.4xlarge
pod_name,eventspreprocessedred-1-10no-1641863769117-exec-11,eventspreprocessedyellow-1-10se-1641865659075-exec-27,eventspreprocessedyellow-1-10no-1641865659242-exec-10,ate-age-v2-usertensors-1dfa5bd583-xww7p,ate-age-v2-usertensors-0a03fe2dab-slxm6
stage_name,eventspreprocessed,eventspreprocessed,eventspreprocessed,usertensors,usertensors
time,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
2022-01-11 01:38:00+00:00,0.397831,,,,
2022-01-11 02:14:00+00:00,,0.383533,0.101143,,
2022-01-11 02:50:00+00:00,,,,0.31443,
2022-01-11 03:26:00+00:00,,,,0.311382,0.309154
2022-01-11 04:02:00+00:00,,,,0.348518,


In [None]:
# Histogram of how many non-missing samples each column (pod series) has.
px.histogram(df_util.notna().sum(axis=0), nbins=200) # num samples per pod

### Upsample to constant rate

In [None]:
# Tally the gaps between consecutive timestamps and use the smallest observed
# gap as the target sampling period.
sample_intervals = df_util.index.to_series().diff().value_counts()
resample_interval = sample_intervals.index.min()
sample_intervals, resample_interval

Out[18]: (0 days 00:02:00    185
 0 days 00:04:00      1
 Name: time, dtype: int64,
 Timedelta('0 days 00:02:00'))

In [None]:
# Put the series on a regular grid at the chosen period, forward-filling the
# rows that the regridding introduces.
df_util_rs = df_util.asfreq(resample_interval, method="ffill")

In [None]:
# Check utils per instance-type:
util_summary = df_util_rs.mean(axis=0).groupby(["query_value", "instance-type"])
pd.concat([util_summary.mean(),
           util_summary.size()], axis=1).rename(columns={
    0:"mean_util_frac",
    1:"pod_count"
}).sort_values(["query_value", "pod_count"], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_util_frac,pod_count
query_value,instance-type,Unnamed: 2_level_1,Unnamed: 3_level_1
memory,r5.4xlarge,0.078708,110
memory,m5.8xlarge,0.030593,106
memory,m5.4xlarge,0.07793,79
memory,r5a.4xlarge,0.091368,74
memory,m4.4xlarge,0.081825,60
memory,c5d.4xlarge,0.157299,42
memory,r5ad.4xlarge,0.068562,34
memory,c5.4xlarge,0.193832,24
memory,m5d.4xlarge,0.139954,21
memory,g4dn.4xlarge,0.329598,3


In [None]:
## TODO: plot pod count (number of containers) on the x-axis against utilization on the y-axis for all instance types - one plot for memory and another for CPU

### Calculate power delta and GPU power (from avg GPU util)

In [None]:
# GPU instance types are the rows whose GPU lookup columns sum to more than zero.
# gpu_instances is read as a notebook-level name by instace_type_util_to_gpu_power
# and calc_total_power.
gpu_util_cols = ['GPUWatt @ Idle', 'GPUWatt @ 10%','GPUWatt @ 50%', 'GPUWatt @ 100%']
gpu_instances = df_ec2[df_ec2[gpu_util_cols].sum(axis=1) > 0]
gpu_instances[gpu_util_cols].T

Instance type,g4dn.4xlarge
GPUWatt @ Idle,8.129003
GPUWatt @ 10%,22.278232
GPUWatt @ 50%,52.658457
GPUWatt @ 100%,71.298151


In [None]:
## TODO: sns.histplot(data=df, x=<instance type>, y=<utilization>)

In [None]:
def get_mean_gpu_power():
    """Return the mean of the ``gpu_power`` column of the staged codecarbon log."""
    codecarbon_log = pd.read_csv("/databricks/driver/codecarbon_carbontracking.csv")
    return codecarbon_log["gpu_power"].mean()

In [None]:
def calc_total_power(df_util_rs, df_ec2, mean_gpu_power):
    """Convert resampled utilization into power per pod, split by power factor.

    Three frames are built and concatenated column-wise:

    * utilization columns converted to power with ``util_to_power``; the
      ``query_value`` level ("cpu" / "memory") becomes ``power_factor``;
    * "delta": the instance type's ``power_delta_col`` value for every
      timestamp at which the pod has any truthy utilization value;
    * "gpu": ``mean_gpu_power`` at those same timestamps, for instance types
      listed in the notebook-level ``gpu_instances`` frame.

    Args:
        df_util_rs: Utilization fractions indexed by time, with column levels
            that include ``query_value`` and ``instance-type``.
        df_ec2: EC2 lookup table indexed by instance type.
        mean_gpu_power: Power value assigned to active samples on GPU
            instance types.

    Returns:
        A frame indexed by time whose columns carry a ``power_factor`` level.
    """

    # Positions of the two levels inside each column's name tuple.
    instance_type_level_idx = np.where(np.array(df_util_rs.columns.names) == "instance-type")[0][0]
    query_value_level_idx = np.where(np.array(df_util_rs.columns.names) == "query_value")[0][0]

    def calc_power_util(df_util_rs):
        """Interpolate every utilization column into power."""
        df_power_util = (df_util_rs
                         .apply(lambda ts: util_to_power(
                             df_ec2,
                             ts.values,
                             ts.name[query_value_level_idx],
                             ts.name[instance_type_level_idx]), axis=0)
                         .droplevel(0, axis=1))  # drop the outermost column level
        # Round-trip through the row index to rename the query_value level
        # to power_factor.
        df_power_util = df_power_util.stack('query_value')
        df_power_util.index = (pd.MultiIndex
                               .from_tuples([(time, query_value) for time, query_value in df_power_util.index],
                                            names=["time", "power_factor"]))
        df_power_util = df_power_util.unstack('power_factor')
        return df_power_util

    def calc_delta_power(df_util_any_query, instance_type_level_idx_any_query):
        """Assign the per-instance-type delta power to every active sample."""
        df_delta_power = (df_util_any_query
                          .apply(lambda ts: instace_type_util_to_delta_power(
                              df_ec2,
                              ts,
                              ts.name[instance_type_level_idx_any_query]), axis=0)
                          .droplevel(0, axis=1)
                          .assign(power_factor='delta')
                          .set_index('power_factor', append=True)
                          .unstack('power_factor')) # append multiindex level
        return df_delta_power

    def calc_gpu_power(df_util_any_query, instance_type_level_idx_any_query, mean_gpu_power):
        """Assign the mean GPU power to every active sample on GPU instance types."""
        # The parameter is now spelled the way the body reads it; it was
        # misspelled before and the body fell back on the enclosing scope.
        df_gpu_power = (df_util_any_query
                        .apply(lambda ts: instace_type_util_to_gpu_power(
                            ts,
                            ts.name[instance_type_level_idx_any_query],
                            {i: mean_gpu_power for i in gpu_instances.index}), axis=0))
        df_gpu_power = (df_gpu_power
                        .droplevel(0, axis=1)
                        .assign(power_factor='gpu')
                        .set_index('power_factor', append=True)
                        .unstack('power_factor')) # append multiindex level
        return df_gpu_power

    # True where a pod has any truthy utilization value (across the
    # query_value level) at a given timestamp.
    df_util_any_query = df_util_rs.stack(query_value_level_idx).groupby("time").any()
    instance_type_level_idx_any_query = np.where(np.array(df_util_any_query.columns.names) == "instance-type")[0][0]

    return pd.concat([calc_power_util(df_util_rs),
                      calc_delta_power(df_util_any_query, instance_type_level_idx_any_query),
                      calc_gpu_power(df_util_any_query, instance_type_level_idx_any_query, mean_gpu_power)],
                     axis=1)


In [None]:
# Build the per-pod, per-factor power frame and show a small sample of it.
mean_gpu_power = get_mean_gpu_power()
df_power = calc_total_power(df_util_rs, df_ec2, mean_gpu_power)
get_subset_for_display(df_power)

instance-type,c5.4xlarge,c5d.4xlarge,m5.4xlarge,m5.8xlarge,r5.4xlarge,r5ad.4xlarge,m4.4xlarge,m5.8xlarge,r5.4xlarge,r5a.4xlarge
pod_name,ate-age-v2-usertensors-0379d73b1b-h5km9,eventspreprocessedred-1-10no-1641863769117-exec-22,ate-age-v2-usertensors-f28cff8fd8-74qjd,ate-age-v2-usertensors-673bea4442-kqwst,eventspreprocessedyellow-1-10no-1641865659242-exec-14,eventspreprocessedred-1-10no-1641863769117-exec-5,ate-age-v2-usertensors-fb63f827d1-zv8bf,ate-age-v2-usertensors-4cea9db07c-6z5fh,ate-age-v2-usertensors-7cc8ce6771-lhmpm,eventspreprocessedred-1-10se-1641864700389-exec-17
stage_name,usertensors,eventspreprocessed,usertensors,usertensors,eventspreprocessed,eventspreprocessed,usertensors,usertensors,usertensors,eventspreprocessed
power_factor,cpu,cpu,cpu,cpu,cpu,cpu,delta,delta,delta,delta
time,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
2022-01-11 01:36:00+00:00,,39.15748,,,,31.00797,,,,13.333333
2022-01-11 02:12:00+00:00,,,,,40.916839,,,,,
2022-01-11 02:48:00+00:00,48.799591,,42.238738,65.769989,,,12.888889,32.0,16.0,
2022-01-11 03:24:00+00:00,52.672186,,42.860345,59.741177,,,12.888889,32.0,16.0,
2022-01-11 04:00:00+00:00,,,48.02443,60.256094,,,,32.0,16.0,


In [None]:
# Sum across all columns (pods and power factors) to get one total per timestamp.
px.line(df_power.sum(axis=1)).update_layout(
    title="Total power",
    yaxis_title="Power [W]",
    legend_title="Power")

## use matplotlib
## set all figure size to  same , e.g. (8,4)

In [None]:
# Power over time, one line per power factor: move power_factor into the row
# index, sum the remaining columns, then turn it back into a column for `color`.
px.line((df_power
         .stack("power_factor")
         .sum(axis=1)
         .reset_index(1)
         .rename(columns={0:"power"})),
        y="power",
        color="power_factor")

In [None]:
# Power over time, one line per pipeline stage.
# After reset_index(1) the stage labels live in the "stage_name" column, so
# that is the column `color` has to name; `labels` supplies the readable
# legend title instead.
px.line((df_power
         .stack("stage_name")
         .sum(axis=1)
         .reset_index(1)
         .rename(columns={0:"power"})),
        y="power",
        color="stage_name",
        labels={"stage_name": "Deep Learning Pipeline Stages"},
        title="Power usage by a Pipeline Stage")


### matplotlib using symbols

In [None]:
# Energy per sample = power x sample length in hours, divided by 1000
# (W -> kW, given the "Power [W]" axis label used above).
df_dE = df_power * resample_interval / np.timedelta64(1, 'h') / 1000 # delta Energy in kWh per sample

In [None]:
# Total energy per (stage, power factor): sum over time, then over the columns
# sharing a stage and factor, with factors spread out as columns.
df_E_stage = df_dE.sum(axis=0).groupby(["stage_name", "power_factor"]).sum().unstack("power_factor")
# Order columns, then rows, by descending total.
df_E_stage = df_E_stage[df_E_stage.sum(axis=0).sort_values(ascending=False).index]
df_E_stage = df_E_stage.loc[df_E_stage.sum(axis=1).sort_values(ascending=False).index]
# Exact zeros become NaN (the chart built from this uses a log colour scale).
df_E_stage[df_E_stage == 0.0] = np.nan
# Long form for Altair: one row per (stage, factor).
energy_plot_data = pd.melt(df_E_stage.reset_index(),
                    id_vars=['stage_name'],
                    value_vars=['cpu', 'memory', 'delta', 'gpu'])

In [None]:
# Heatmap of energy per pipeline stage (rows) and power factor (columns),
# coloured on a log scale.
alt.Chart(energy_plot_data).mark_rect().encode(
    x=alt.X('power_factor', sort=['cpu', 'memory', 'delta', 'gpu'], axis=alt.Axis(title='Power factor')),
    y=alt.Y('stage_name', axis=alt.Axis(title='Pipeline stage')),
    color=alt.Color('value',scale=alt.Scale(type='log',scheme='greenblue'), title="kWh")
).properties(
    width=700,
    height=300
)

In [None]:
## TODO: seaborn heatmap of the correlation between power factor and pipeline stage

### Convert to CO2eq

In [None]:
# Region whose row of df_intensity supplies the PUE and CO2e intensity below.
my_region = "eu-west-1"

def df_dE_to_co2e(df_dE, region, df_intensity):
    """Scale per-sample energy to per-sample CO2e for one region.

    Multiplies ``df_dE`` by the ``PUE`` and ``CO2e (metric gram/kWh)`` values
    of ``region``'s row in ``df_intensity``, and renames the ``power_factor``
    column level to ``co2e_factor``.
    """
    region_row = df_intensity.loc[region]
    pue = region_row["PUE"]
    co2e_intensity = region_row["CO2e (metric gram/kWh)"]

    # CO2e in gram per sample
    scaled = df_dE * pue * co2e_intensity
    return scaled.rename_axis(columns={"power_factor": "co2e_factor"})

# Emissions from electricity use in my_region, in grams CO2e per sample.
df_dco2e = df_dE_to_co2e(df_dE, my_region, df_intensity)

In [None]:
# Grand total over all timestamps and columns (manufacturing emissions not added yet).
df_dco2e.sum().sum() # total scope 2 emissions in gCO2eq

Out[32]: 9251.715201292618

In [None]:
def concat_scope3_emissions(df_dco2e, df_ec2, sample_interval=None): # move df_dE_to_co2e into here to avoid state changes
    """Append per-sample manufacturing emissions as a new ``co2e_factor``.

    For every timestamp at which a pod has any truthy emission value, the
    instance type's ``Instance Hourly Manufacturing Emissions (gCO₂eq)`` is
    pro-rated to the sample length and added under the factor
    "manufacturing".

    Args:
        df_dco2e: Per-sample emissions indexed by time, with column levels
            that include ``instance-type`` and ``co2e_factor``.
        df_ec2: EC2 lookup table indexed by instance type.
        sample_interval: Length of one sample as a timedelta. Defaults to the
            notebook-level ``resample_interval``.

    Returns:
        ``df_dco2e`` with the manufacturing columns appended column-wise, so
        each timestamp stays a single row.
    """
    if sample_interval is None:
        sample_interval = resample_interval
    # Fraction of an hour covered by one sample; computed once for all columns.
    hours_per_sample = sample_interval / np.timedelta64(1, 'h')
    hourly_col = "Instance Hourly Manufacturing Emissions (gCO₂eq)"

    def instace_type_kwh_to_co2e(any_sample_values, instance_type):
        hourly_emissions = df_ec2.loc[instance_type].loc[hourly_col]
        return any_sample_values.map({True: hourly_emissions * hours_per_sample, False: np.nan})

    # True where a pod has any truthy emission value (across factors) at a timestamp.
    df_dco2e_any_factor = df_dco2e.stack("co2e_factor").groupby("time").any()
    instance_type_level_idx_any_factor = np.where(np.array(df_dco2e_any_factor.columns.names) == "instance-type")[0][0]

    df_manufacturing = (df_dco2e_any_factor
                        .apply(lambda ts: instace_type_kwh_to_co2e(
                            ts, ts.name[instance_type_level_idx_any_factor]), axis=0)
                        .assign(co2e_factor='manufacturing')
                        .set_index('co2e_factor', append=True)
                        .unstack('co2e_factor')) # append multiindex level

    # axis=1: add the new factor as columns (as calc_total_power does) rather
    # than stacking a second set of rows under the same timestamps.
    return pd.concat([df_dco2e, df_manufacturing], axis=1)

# Add the manufacturing emissions. This overwrites df_dco2e, so re-running the
# cell on its own would add the manufacturing factor a second time.
df_dco2e = concat_scope3_emissions(df_dco2e, df_ec2)

In [None]:
# Grand total again, now including the manufacturing factor.
df_dco2e.sum().sum() # total emissions in gCO2eq

Out[34]: 12385.687252709093

In [None]:
# Emissions over time, one line per CO2e factor (same reshaping as the
# power-by-factor plot above).
px.line((df_dco2e
         .stack("co2e_factor")
         .sum(axis=1)
         .reset_index(1)
         .rename(columns={0:"gCO2eq"})),
        y="gCO2eq",
        color="co2e_factor")

In [None]:
# Check saving for greener region
concat_scope3_emissions(
    df_dE_to_co2e(
        df_dE, "eu-north-1", df_intensity), df_ec2).sum().sum() / df_dco2e.sum().sum()

Out[36]: 0.2719423331706656

### Fraction of scope 2 out of total

In [None]:
# Total emissions per CO2e factor, shown largest first.
co2e_per_factor = df_dco2e.sum(axis=0).groupby("co2e_factor").sum()
px.bar(co2e_per_factor.sort_values(ascending=False)).update_layout(
    title="Emissions by factor",
    yaxis_title="CO2eq [g]")

In [None]:
# Share of the total that comes from the manufacturing factor.
scope_3_co2e_factors = ["manufacturing"]
# Every other factor; not referenced again in the cells shown here.
scope_2_co2e_factors = co2e_per_factor.index.drop(scope_3_co2e_factors)
print("Manufacturing fraction of all emissions:")
co2e_per_factor[scope_3_co2e_factors] / co2e_per_factor.sum()

Manufacturing fraction of all emissions:
Out[38]: co2e_factor
manufacturing    0.253032
dtype: float64

### Plot by CO2eq factor

In [None]:
# Emissions per (timestamp, stage), summed over all remaining column levels.
df_dco2e.stack(["stage_name"]).groupby(["time","stage_name"]).sum().sum(axis=1)

Out[39]: time                       stage_name              
2022-01-11 01:00:00+00:00  eventspreprocessed           4.267434
                           reachmetrics                 2.407232
2022-01-11 01:02:00+00:00  eventspreprocessed          32.560676
                           reachmetrics                 2.407232
2022-01-11 01:04:00+00:00  eventspreprocessed          45.729159
                                                         ...    
2022-01-11 07:12:00+00:00  metricsreporter              2.288919
                           predictionspostprocessed     2.611730
                           userpredicti                 2.335077
2022-01-11 07:14:00+00:00  metricsreporter              2.197873
                           predictionspostprocessed     2.611730
Length: 423, dtype: float64

In [None]:
# Same per-(timestamp, stage) totals as the previous cell, spread into one
# column per stage and melted back to long form (time kept as the index) so
# each stage gets its own line.
px.line(
    pd.melt((df_dco2e
             .stack(["stage_name"])
             .groupby(["time","stage_name"])
             .sum()
             .sum(axis=1)
             .unstack(["stage_name"])),
        ignore_index=False, value_name="co2e"),
    color="stage_name"
)

In [None]:
# Total emissions per (stage, CO2e factor), with factors spread out as columns.
df_co2e_stage = df_dco2e.sum(axis=0).groupby(["stage_name", "co2e_factor"]).sum().unstack("co2e_factor")
# Order columns, then rows, by descending total.
df_co2e_stage = df_co2e_stage[df_co2e_stage.sum(axis=0).sort_values(ascending=False).index]
df_co2e_stage = df_co2e_stage.loc[df_co2e_stage.sum(axis=1).sort_values(ascending=False).index]
# Exact zeros become NaN (the chart built from this uses a log colour scale).
df_co2e_stage[df_co2e_stage == 0.0] = np.nan
# Long form for Altair. "manufacturing" is not listed in value_vars, so it is
# left out of plot_data.
plot_data = pd.melt(df_co2e_stage.reset_index(),
                    id_vars=['stage_name'],
                    value_vars=['cpu', 'memory', 'delta', 'gpu'])

In [None]:
# Heatmap of emissions per pipeline stage (rows, in the fixed order given
# below) and CO2e factor (columns), coloured on a log scale.
alt.Chart(plot_data).mark_rect().encode(
    x=alt.X('co2e_factor',
            sort=['cpu', 'memory', 'delta', 'gpu'],
            axis=alt.Axis(title='CO2e factor')),
    y=alt.Y('stage_name',
            sort=['metricsreporter', 'reachmetrics', 'predictionspostprocessed',
                  'userpredicti', 'modeltrainin', 'eventspreprocessed', 'usertensors'],
            axis=alt.Axis(title='Pipeline stage')),
    color=alt.Color('value',scale=alt.Scale(type='log',scheme='greenblue'), title="CO2eq [g]")
).properties(
    width=700,
    height=300
)

In [None]:
# Total emissions per stage, summed over all factor columns of df_co2e_stage.
px.bar(df_co2e_stage.sum(axis=1)).update_layout(
    title="Emissions by stage",
    yaxis_title="CO2eq [g]")

### Verify against an earlier codecarbon run

In [None]:
# Compare against the old codecarbon estimate
# NOTE(review): 1658.92 looks like a yearly figure in kg converted to grams
# per day (x1000, /365) - confirm where this constant comes from.
1000 * 1658.9217151355867 / 365 # in gCO2eq, neither PUE nor scope 3 applied

Out[44]: 4544.99100037147

In [None]:
# Compare with an adapted total based on CPU and GPU measurements for the same stages.
# Note: this overwrites df_dco2e with a version that has no PUE and no
# manufacturing factor, so cells above that read df_dco2e give different
# results if re-run after this one.
codecarbon_co2e_intensity = 617.0 # g/kWh - about 2x the value from df_intensity for eu-west-1
df_dco2e = df_dE * codecarbon_co2e_intensity # CO2e in gram per sample
df_dco2e = df_dco2e.rename_axis(columns={"power_factor": "co2e_factor"})
get_subset_for_display(df_dco2e)

instance-type,c5.4xlarge,c5d.4xlarge,m5.4xlarge,m5.8xlarge,r5.4xlarge,r5ad.4xlarge,m4.4xlarge,m5.8xlarge,r5.4xlarge,r5a.4xlarge
pod_name,ate-age-v2-usertensors-0379d73b1b-h5km9,eventspreprocessedred-1-10no-1641863769117-exec-22,ate-age-v2-usertensors-f28cff8fd8-74qjd,ate-age-v2-usertensors-673bea4442-kqwst,eventspreprocessedyellow-1-10no-1641865659242-exec-14,eventspreprocessedred-1-10no-1641863769117-exec-5,ate-age-v2-usertensors-fb63f827d1-zv8bf,ate-age-v2-usertensors-4cea9db07c-6z5fh,ate-age-v2-usertensors-7cc8ce6771-lhmpm,eventspreprocessedred-1-10se-1641864700389-exec-17
stage_name,usertensors,eventspreprocessed,usertensors,usertensors,eventspreprocessed,eventspreprocessed,usertensors,usertensors,usertensors,eventspreprocessed
co2e_factor,cpu,cpu,cpu,cpu,cpu,cpu,delta,delta,delta,delta
time,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
2022-01-11 01:36:00+00:00,,0.805339,,,,0.637731,,,,0.274222
2022-01-11 02:12:00+00:00,,,,,0.841523,,,,,
2022-01-11 02:48:00+00:00,1.003645,,0.86871,1.352669,,,0.265081,0.658133,0.329067,
2022-01-11 03:24:00+00:00,1.083291,,0.881494,1.228677,,,0.265081,0.658133,0.329067,
2022-01-11 04:00:00+00:00,,,0.987702,1.239267,,,,0.658133,0.329067,


In [None]:
# CPU + GPU emissions of the three stages used for the codecarbon comparison.
(df_dco2e
 .sum(axis=0)
 .groupby(["stage_name", "co2e_factor"])
 .sum()
 .unstack("co2e_factor")[["cpu", "gpu"]]
 .loc[["modeltrainin", "userpredicti", "usertensors"]]
 .sum()
 .sum())

Out[46]: 5965.0286443314

In [None]:
# Hard-coded comparison figures in kg CO2eq. The "DL pipeline" entry (12.386)
# matches the total computed above (12385.69 g).
# NOTE(review): the source of the other four values is not recorded here - add a reference.
model_comparison_df = pd.DataFrame.from_dict({
    "model":["Transformer(base)", "Transformer(big)", "ELMo", "BERT(base)", "DL pipeline"],
    "emission": [11.7934, 87.0897, 118.841, 652.26583, 12.386]})
px.bar(y=model_comparison_df['emission'],
       x=model_comparison_df['model']).update_layout(
    title="Carbon footprint of different models",
    yaxis_title="CO2eq [kg]",
    xaxis_title=None)
