# BLOOM Carbon Footprint : An Analysis of the logs

# Introduction

This notebook aims to analyse the SLURM logs of the experiments run on Jean Zay during the BigScience Project.
They were obtained using the `jobs_info.py` script.

## Data
The logs can be found in the Project-end folder.

## TODO
- Add source for grams of Co2 vs kWh, see TODO in code
- Add explanation about CPU power and why it is << to GPU power and we choose to ignore it
- Refactor other trainings, eval, tokenizers
- Do we need to keep # DATA PROCESSING, DOWNLOADING, DEDUPLICATION ?

# Modules loading and configuration

In [1]:
import os
import matplotlib
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 1000)  # or 1000
pd.set_option('display.max_rows', 1000)  # or 1000
pd.set_option('display.max_colwidth', None) 
pd.options.plotting.backend = "plotly"

# Data loading and pre-processing

In [2]:
# Read files into a single DataFrame
directory = "Project-end/"
file_paths = [directory+x for x in os.listdir(directory) if x.endswith("logs.txt")]
df_list = [pd.read_csv(filename, sep="|", header=None).assign(file=filename.split("/")[1]) for filename in file_paths]
df = pd.concat(df_list)
df.columns= ['job_id', 'num_V100_32GB', 'num_V100_16GB', 'num_A100_40GB', 'num_A100_80GB', \
             'alloc_cpu', 'alloc_mem', 'alloc_energy', 'partition', 'group', 'elapsed',\
              'qos','jobname', 'start', 'end', 'workdir', 'account' ,'file']

# Delete useless fields
df=df.drop(['alloc_energy','workdir'], axis=1)

# Cast datetimes as such
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'],  errors='coerce')

# Make our own duration as elapsed field is sometimes buggy
df['duration'] = df['end'] - df['start']

# Show example of elapsed time being bogus
# df['elapsed'] = pd.to_timedelta(df['elapsed'])
# df['duration_diff']= df['duration'] - df['elapsed']
# df=df[df.columns[[0,1,2,3,4,5,6,7,8,12,13,9,16,17,10,11,14,15]]] 
# df[df['duration_diff']>pd.to_timedelta(0)].head(3)

# Compute GPU hours per hardware type
durations_in_hours = (df['duration'] / np.timedelta64(1, 's') / 3600)
df['hours_V100_32GB'] = df['num_V100_32GB'] * durations_in_hours
df['hours_V100_16GB'] = df['num_V100_16GB'] * durations_in_hours
df['hours_A100_40GB'] = df['num_A100_40GB'] * durations_in_hours
df['hours_A100_80GB'] = df['num_A100_80GB'] * durations_in_hours


# Data augmentation with external sources

In [3]:
# Compute GPU power for each job, assuming running at 100% TDP, using hardware info from JZ documentation --> http://www.idris.fr/eng/jean-zay/cpu/jean-zay-cpu-hw-eng.html
df['gpu_power_watts_hours'] = df['hours_V100_32GB'] * 300 # https://resources.nvidia.com/en-us-virtualization-and-gpus/v100-datasheet
df['gpu_power_watts_hours'] += df['hours_V100_16GB'] * 300 # https://resources.nvidia.com/en-us-virtualization-and-gpus/v100-datasheet
df['gpu_power_watts_hours'] += df['hours_A100_40GB'] * 250 # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/A100-PCIE-Prduct-Brief.pdf
df['gpu_power_watts_hours'] += df['hours_A100_80GB'] * 400 # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-nvidia-us-2188504-web.pdf
carbon_g_per_W = 0.057 # TODO add source of this data point

df[df['gpu_power_watts_hours']>0].head(3)

Unnamed: 0,job_id,num_V100_32GB,num_V100_16GB,num_A100_40GB,num_A100_80GB,alloc_cpu,alloc_mem,partition,group,elapsed,qos,jobname,start,end,account,file,duration,hours_V100_32GB,hours_V100_16GB,hours_A100_40GB,hours_A100_80GB,gpu_power_watts_hours
18,1127041,64,0,0,0,1280.0,2.50T,gpu_p13,six,00:27:59,qos_gpu-t3,mbert_alpha,2022-01-13 05:54:05,2022-01-13 06:22:04,six@gpu,2022_08_24_roman_castagne_logs.txt,0 days 00:27:59,29.848889,0.0,0.0,0.0,8954.666667
21,1308991,64,0,0,0,1280.0,2.50T,gpu_p13,six,00:03:50,qos_gpu-t3,mbert_alpha,2022-01-17 16:31:47,2022-01-17 16:35:37,six@gpu,2022_08_24_roman_castagne_logs.txt,0 days 00:03:50,4.088889,0.0,0.0,0.0,1226.666667
22,1314956,4,0,0,0,80.0,160G,gpu_p13,six,10:00:29,qos_gpu-t3,mbert_alpha,2022-01-17 18:29:49,2022-01-18 04:30:18,six@gpu,2022_08_24_roman_castagne_logs.txt,0 days 10:00:29,40.032222,0.0,0.0,0.0,12009.666667


# Data sanity checks

In [None]:
# Compare detailled logs vs. snapshot of quotas
print(df.hours_A100_80GB.sum()) # Capturing 99.3% of quotas, looking fine
print(df.hours_V100_32GB.sum() + df.hours_V100_16GB.sum()) # Capturing 115% of quotas, maybe quotas does not count all partitions?
df_v100 = df[(df['hours_V100_32GB']>0) | (df['hours_V100_16GB']>0)]
df_v100.partition.value_counts()

In [None]:
# Show number of runs per input file
df['file'].value_counts()

# All Runs analysis

In [4]:
# Plot the distribution of runs length
durations_in_hours = (df['duration'] / np.timedelta64(1, 's') / 3600)
durations_in_hours_for_hist = durations_in_hours [(durations_in_hours < 1000)] # remove few run of 200days...
durations_in_hours_for_hist.hist(log_y=True)

In [5]:
# Carbon footprint estimation for GPUs only
carbon_g_per_W = 0.057 # TODO add source of this data point
total_gpus_hours = df.hours_A100_80GB.sum()  + df.hours_V100_32GB.sum() +df.hours_V100_16GB.sum()
print(f"All experiments lasted a combined {df.duration.sum()},\
 using {(total_gpus_hours/1e6).round(2)} millions hours of GPUs,\
 representing an electrical consumption of {(df.gpu_power_watts_hours.sum() / 1000).round(1)} kW \
 or {((df.gpu_power_watts_hours.sum() * carbon_g_per_W) /1e6).round(2)} tons of Co2e emitted.")

All experiments lasted a combined 5816 days 13:22:40, using 3.61 millions hours of GPUs, representing an electrical consumption of 1206025.4 kW  or 68.74 tons of Co2e emitted.


# Main Bloom Training

In [6]:
# Select only final training runs
bloomdf = df[df['jobname'] == 'tr11-176B-ml']
bloomdf.head(3)

Unnamed: 0,job_id,num_V100_32GB,num_V100_16GB,num_A100_40GB,num_A100_80GB,alloc_cpu,alloc_mem,partition,group,elapsed,qos,jobname,start,end,account,file,duration,hours_V100_32GB,hours_V100_16GB,hours_A100_40GB,hours_A100_80GB,gpu_power_watts_hours
2184,406772,0,0,0,384,6144.0,22500G,gpu_p5,genhug01,00:00:01,qos_gpu-gc,tr11-176B-ml,2022-06-12 00:31:16,2022-06-12 00:31:17,,new_hugo_laurencon_jz_logs.txt,0 days 00:00:01,0.0,0.0,0.0,0.106667,42.666667
41501,417634,0,0,0,384,6144.0,22500G,gpu_p5,six,00:01:49,qos_gpu-gc,tr11-176B-ml,2022-03-11 18:09:15,2022-03-11 18:11:04,,2022_08_02_thomas_wang_jz_logs.txt,0 days 00:01:49,0.0,0.0,0.0,11.626667,4650.666667
41502,417716,0,0,0,384,6144.0,22500G,gpu_p5,six,00:01:31,qos_gpu-gc,tr11-176B-ml,2022-03-11 18:18:33,2022-03-11 18:20:04,,2022_08_02_thomas_wang_jz_logs.txt,0 days 00:01:31,0.0,0.0,0.0,9.706667,3882.666667


In [7]:
# Distribution of runs length
durations_in_hours = (bloomdf['duration'] / np.timedelta64(1, 's') / 3600)
durations_in_hours.hist(nbins=100, log_y=True)

In [8]:
# Decomposition of runs per number of GPUs used
bloomdf['num_A100_80GB'].value_counts()
#bloomdf['num_A100_80GB'].hist(nbins=int(bloomdf['num_A100_80GB'].max()))

384    316
192     27
0       20
144     13
288      8
1        5
32       4
8        2
16       2
216      1
Name: num_A100_80GB, dtype: int64

In [9]:
# Carbon footprint estimation for GPUs only
carbon_g_per_W = 0.057
print(f"Final training lasted {bloomdf.duration.sum()},\
 using {(bloomdf.hours_A100_80GB.sum()/1e6).round(2)} millions hours of GPUs,\
 representing an electrical consumption of {(bloomdf.gpu_power_watts_hours.sum() / 1000).round(1)} kW \
 or {((bloomdf.gpu_power_watts_hours.sum() * carbon_g_per_W) /1e6).round(2)} tons of Co2e emitted.")

Final training lasted 118 days 05:40:42, using 1.08 millions hours of GPUs, representing an electrical consumption of 433195.8 kW  or 24.69 tons of Co2e emitted.


# Other Trainings

In [None]:
traindf = df[df['jobname'].str.startswith('tr')]
traindf = traindf[traindf['jobname'] != 'tr11-176B-ml']
traindf = traindf[~traindf['jobname'].str.contains('sync')]
traindf = traindf[~traindf['jobname'].str.contains('slurm')]
traindf = traindf[~traindf['jobname'].str.contains('move')]
traindf = traindf[~traindf['jobname'].str.contains('eval')]
traindf = traindf[~traindf['jobname'].str.contains('tokenizer')]
traindf.shape

In [None]:
traindf['jobname'].value_counts()

In [None]:
traindf.duration.sum()

# Eval

In [None]:
evaldf = df[df['jobname'].str.contains('eval')]
evaldf.shape

In [None]:
evaldf['jobname'].value_counts()

# Tokenizer

In [None]:
tokendf = df[df['jobname'].str.contains('tokenizer')]
tokendf.shape

In [None]:
tokendf['jobname'].value_counts()

# DATA PROCESSING, DOWNLOADING, DEDUPLICATION

In [None]:
datadf = df[df['jobname'].str.contains('data')]
datadf = datadf[~datadf['jobname'].str.contains('token')]
datadf.shape


In [None]:
datadf['jobname'].value_counts()