# BLOOM Carbon Footprint : An Analysis of the logs

# Introduction

This notebook aims to analyse the SLURM logs of the experiments run on Jean Zay during the BigScience Project.
They were obtained using the `jobs_info.py` script.

## Data
The logs can be found in the Project-end folder.

## TODO
- Add GPU hours for each experiments, verify that it match quotas
- Extract general info on other runs

# Modules loading

In [2]:
import pandas as pd
import matplotlib
import os
import numpy as np

# Data loading, pre-processing and checks

In [77]:
# Read files into a single DataFrame
directory = "Project-end/"
file_paths = [directory+x for x in os.listdir(directory) if x.endswith("logs.txt")]
df_list = [pd.read_csv(filename, sep="|", header=None).assign(file=filename.split("/")[1]) for filename in file_paths]
df = pd.concat(df_list)
df.columns= ['job_id', 'num_V100_32GB', 'num_V100_16GB', 'num_A100_40GB', 'num_A100_80GB', \
             'alloc_cpu', 'alloc_mem', 'alloc_energy', 'partition', 'group', 'elapsed',\
              'qos','jobname', 'start', 'end', 'workdir', 'account' ,'file']

# Delete useless fields
df=df.drop(['alloc_energy','workdir'], axis=1)

# Set display
pd.set_option('display.max_columns', 1000)  # or 1000
pd.set_option('display.max_rows', 1000)  # or 1000
pd.set_option('display.max_colwidth', -1) 

df.head()

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,job_id,num_V100_32GB,num_V100_16GB,num_A100_40GB,num_A100_80GB,alloc_cpu,alloc_mem,partition,group,elapsed,qos,jobname,start,end,account,file
0,771344,0,0,0,0,50.0,100G,cpu_p1,genini01,18:37:03,qos_cpu-t4,preprocess_oscar,2022-01-04T11:48:15,2022-01-05T06:25:18,six@cpu,2022_08_24_roman_castagne_logs.txt
1,772361,0,0,0,0,,,cpu_p1,genini01,00:00:00,qos_cpu-t4,preprocess_oscar,2022-01-04T14:22:30,2022-01-04T14:22:30,six@cpu,2022_08_24_roman_castagne_logs.txt
2,772368,0,0,0,0,,,cpu_p1,genini01,00:00:00,qos_cpu-t4,preprocess_oscar,2022-01-04T14:23:51,2022-01-04T14:23:51,six@cpu,2022_08_24_roman_castagne_logs.txt
3,772373,0,0,0,0,50.0,100G,cpu_p1,genini01,22:55:58,qos_cpu-t4,preprocess_oscar,2022-01-04T14:41:26,2022-01-05T13:37:24,six@cpu,2022_08_24_roman_castagne_logs.txt
4,772394,0,0,0,0,50.0,100G,cpu_p1,genini01,06:26:29,qos_cpu-t4,preprocess_oscar,2022-01-04T14:41:26,2022-01-04T21:07:55,six@cpu,2022_08_24_roman_castagne_logs.txt


In [78]:
# Cast datetimes as such
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'],  errors='coerce')

# We are going to make our own duration as elapsed field is sometimes buggy
df['elapsed'] = pd.to_timedelta(df['elapsed'])
df['duration'] = df['end'] - df['start']
df['duration_diff']= df['duration'] - df['elapsed']

# Hacky columns re-arrange
df=df[df.columns[[0,1,2,3,4,5,6,7,8,12,13,9,16,17,10,11,14,15]]] 

# Show example of elapsed time being bogus
df[df['duration_diff']>pd.to_timedelta(0)].head(10)

Unnamed: 0,job_id,num_V100_32GB,num_V100_16GB,num_A100_40GB,num_A100_80GB,alloc_cpu,alloc_mem,partition,group,start,end,elapsed,duration,duration_diff,qos,jobname,account,file
7,783484,0,0,0,0,2.0,4G,cpu_p1,genini01,2022-01-05 11:31:44,2022-01-06 13:30:08,-5 days +18:01:36,1 days 01:58:24,5 days 07:56:48,qos_cpu-t4,preprocess_oscar,six@cpu,2022_08_24_roman_castagne_logs.txt
8,783489,0,0,0,0,2.0,4G,cpu_p1,genini01,2022-01-05 11:31:44,2022-01-06 13:30:08,-5 days +18:01:36,1 days 01:58:24,5 days 07:56:48,qos_cpu-t4,preprocess_oscar,six@cpu,2022_08_24_roman_castagne_logs.txt
9,783491,0,0,0,0,2.0,4G,cpu_p1,genini01,2022-01-05 11:31:44,2022-01-06 13:30:08,-5 days +18:01:36,1 days 01:58:24,5 days 07:56:48,qos_cpu-t4,preprocess_oscar,six@cpu,2022_08_24_roman_castagne_logs.txt
64,1423667,0,16,0,0,320.0,640G,gpu_p13,genhug01,2021-05-10 03:07:49,2021-05-12 00:25:31,-6 days +22:42:18,1 days 21:17:42,6 days 22:35:24,qos_gpu-t4,v16x16,,2022_08_24_thomwolf_jz_logs.txt
184,1670520,0,1,0,0,40.0,80G,gpu_p13,genhug01,2021-05-16 20:46:36,2021-05-17 21:11:22,-5 days +19:35:14,1 days 00:24:46,5 days 04:49:32,qos_gpu-t4,extract,,2022_08_24_thomwolf_jz_logs.txt
185,1670521,0,1,0,0,40.0,80G,gpu_p13,genhug01,2021-05-16 20:46:36,2021-05-17 21:23:55,-5 days +19:22:41,1 days 00:37:19,5 days 05:14:38,qos_gpu-t4,extract,,2022_08_24_thomwolf_jz_logs.txt
186,1670522,0,1,0,0,40.0,80G,gpu_p13,genhug01,2021-05-16 20:47:09,2021-05-17 21:15:54,-5 days +19:31:15,1 days 00:28:45,5 days 04:57:30,qos_gpu-t4,extract,,2022_08_24_thomwolf_jz_logs.txt
187,1670523,0,1,0,0,40.0,80G,gpu_p13,genhug01,2021-05-16 20:47:09,2021-05-17 21:41:03,-5 days +19:06:06,1 days 00:53:54,5 days 05:47:48,qos_gpu-t4,extract,,2022_08_24_thomwolf_jz_logs.txt
188,1670524,0,1,0,0,40.0,80G,gpu_p13,genhug01,2021-05-16 20:47:09,2021-05-17 21:19:53,-5 days +19:27:16,1 days 00:32:44,5 days 05:05:28,qos_gpu-t4,extract,,2022_08_24_thomwolf_jz_logs.txt
189,1670525,1,0,0,0,40.0,80G,gpu_p13,genhug01,2021-05-16 20:47:09,2021-05-17 22:20:29,-5 days +18:26:40,1 days 01:33:20,5 days 07:06:40,qos_gpu-t4,extract,,2022_08_24_thomwolf_jz_logs.txt


# Data sanity checks

In [79]:
df[df['job_id']=='726762']

Unnamed: 0,job_id,num_V100_32GB,num_V100_16GB,num_A100_40GB,num_A100_80GB,alloc_cpu,alloc_mem,partition,group,start,end,elapsed,duration,duration_diff,qos,jobname,account,file
41517,726762,0,0,0,384,6144.0,22500G,gpu_p5,six,2022-03-25 11:09:19,2022-03-28 11:14:56,-10 days +16:54:23,3 days 00:05:37,12 days 07:11:14,qos_gpu-gc,tr11-176B-ml,,2022_08_02_thomas_wang_jz_logs.txt


In [80]:
df['duration']

0       0 days 18:37:03
1       0 days 00:00:00
2       0 days 00:00:00
3       0 days 22:55:58
4       0 days 06:26:29
              ...      
39026   0 days 00:09:47
39027   0 days 00:11:51
39028   0 days 00:20:09
39029   0 days 00:20:54
39030   0 days 00:10:45
Name: duration, Length: 177192, dtype: timedelta64[ns]

In [81]:
df['duration'].sum()

Timedelta('5816 days 13:22:40')

In [82]:
df['file'].value_counts()

2022_08_02_thomas_wang_jz_logs.txt      45967
2022_08_08_luciles_logs.txt             39031
2022_08_01_stas_jz_logs.txt             38111
2022_08_25_victorsanh_jz_logs.txt       29049
2022-08-24_tvn_jz_logs.txt              11385
2022_08_01_muennighoff_jz_logs.txt      5139 
2022_08_01_pierrec_jz_logs.txt          4813 
new_hugo_laurencon_jz_logs.txt          2185 
2022_08_24_thomwolf_jz_logs.txt         760  
2022_08_02_younesb_jz_logs.txt          603  
2022_08_24_roman_castagne_logs.txt      110  
2022_08_01_sylvainv_jz_logs.txt         34   
2022_08_08_danielhesslow_jz_logs.txt    5    
Name: file, dtype: int64

# Main Bloom Training

In [84]:
# select only final training runs
bloomdf = df[df['jobname'] == 'tr11-176B-ml']
bloomdf.head()

Unnamed: 0,job_id,num_V100_32GB,num_V100_16GB,num_A100_40GB,num_A100_80GB,alloc_cpu,alloc_mem,partition,group,start,end,elapsed,duration,duration_diff,qos,jobname,account,file
2184,406772,0,0,0,384,6144.0,22500G,gpu_p5,genhug01,2022-06-12 00:31:16,2022-06-12 00:31:17,0 days 00:00:01,0 days 00:00:01,0 days,qos_gpu-gc,tr11-176B-ml,,new_hugo_laurencon_jz_logs.txt
41501,417634,0,0,0,384,6144.0,22500G,gpu_p5,six,2022-03-11 18:09:15,2022-03-11 18:11:04,0 days 00:01:49,0 days 00:01:49,0 days,qos_gpu-gc,tr11-176B-ml,,2022_08_02_thomas_wang_jz_logs.txt
41502,417716,0,0,0,384,6144.0,22500G,gpu_p5,six,2022-03-11 18:18:33,2022-03-11 18:20:04,0 days 00:01:31,0 days 00:01:31,0 days,qos_gpu-gc,tr11-176B-ml,,2022_08_02_thomas_wang_jz_logs.txt
41503,417729,0,0,0,384,6144.0,22500G,gpu_p5,six,2022-03-11 18:21:31,2022-03-11 18:22:43,0 days 00:01:12,0 days 00:01:12,0 days,qos_gpu-gc,tr11-176B-ml,,2022_08_02_thomas_wang_jz_logs.txt
41504,417737,0,0,0,384,6144.0,22500G,gpu_p5,six,2022-03-11 18:27:31,2022-03-11 18:30:16,0 days 00:02:45,0 days 00:02:45,0 days,qos_gpu-gc,tr11-176B-ml,,2022_08_02_thomas_wang_jz_logs.txt


In [13]:
bloomdf['num_A100_80GB'].value_counts()

384    316
192    27 
0      20 
144    13 
288    8  
1      5  
32     4  
8      2  
16     2  
216    1  
Name: num_A100_80GB, dtype: int64

In [14]:
bloomdf.duration.sum()

Timedelta('118 days 05:40:42')

# Other Trainings

In [16]:
traindf = df[df['jobname'].str.startswith('tr')]
traindf = traindf[traindf['jobname'] != 'tr11-176B-ml']
traindf = traindf[~traindf['jobname'].str.contains('sync')]
traindf = traindf[~traindf['jobname'].str.contains('slurm')]
traindf = traindf[~traindf['jobname'].str.contains('move')]
traindf = traindf[~traindf['jobname'].str.contains('eval')]
traindf = traindf[~traindf['jobname'].str.contains('tokenizer')]
traindf.shape

(6430, 18)

In [17]:
traindf['jobname'].value_counts()

tr6e-1B3-prefix-lm                                        1435
tr6f-1B3-prefix-lm                                        1431
tr6g-1B3-prefix-lm                                        1008
tr7d-1B3-alibi                                            433 
tr11                                                      302 
tr11-200B-ml                                              204 
tr8-104B                                                  153 
tr11f-6B3-ml                                              146 
tr8b-104B-cl                                              131 
tr8b-104B-bnb                                             123 
tr8b-104B-emb-norm                                        117 
tr13f-6B3-ml-t0                                           102 
tr5c-1B3-multilingual-alpha-alibi                         81  
tr11c-2B5-ml                                              78  
tr5d-1B3-multilingual-equal-alibi                         73  
tr11e-350M-ml                                          

In [18]:
traindf.duration.sum()

Timedelta('488 days 13:34:21')

# Eval

In [20]:
evaldf = df[df['jobname'].str.contains('eval')]
evaldf.shape

(16041, 18)

In [21]:
evaldf['jobname'].value_counts()

eval-tr3                                                                            3643
evaluate_t0                                                                         2337
eval_finetune-t5-xxl-lm-d4-091621                                                   747 
score_eval_finetune-t5-xxl-lm-d4-091621                                             694 
score_eval_finetune-t5-xxl-lm-d4-all-091621                                         671 
eval_baseline                                                                       534 
eval_finetune-t5-xxl-lm-d4-all-091621                                               531 
eval_finetune-t5-xl-lm-d4-091621                                                    530 
eval_finetune-t5-xxl-lm-d4-091621-512                                               530 
eval_finetune-t5-xxl-lm-d4-gpt-091621                                               514 
bs-eval-bloom-176b                                                                  504 
score_eval_finetune-t

# Tokenizer

In [23]:
tokendf = df[df['jobname'].str.contains('tokenizer')]
tokendf.shape

(103, 18)

In [24]:
tokendf['jobname'].value_counts()

train_tokenizer                                                 67
modelling-metadata-example-load-model-and-tokenizer             11
modelling-metadata-website-desc-load-model-and-tokenizer        8 
modelling-metadata-html-download-tokenizer-and-model            6 
compare_tokenizers                                              3 
modelling-metadata-entity-beg-load-model-and-tokenizer          2 
modelling-metadata-exp1-subexp3-load-model-and-tokenizer        2 
modelling-metadata-entity-load-model-and-tokenizer              1 
modelling-metadata-exp1-subexp1-load-model-and-tokenizer        1 
modelling-metadata-exp1-subexp2-load-model-and-tokenizer        1 
modelling-metadata-website-desc-load-model-and-tokenizer-25k    1 
Name: jobname, dtype: int64

# DATA PROCESSING, DOWNLOADING, DEDUPLICATION

In [255]:
datadf = df[df['jobname'].str.contains('data')]
datadf = datadf[~datadf['jobname'].str.contains('token')]
datadf.shape


(29359, 18)

In [256]:
datadf['jobname'].value_counts()

download_all_catalogue_datasets                                                     4507
pseudo_crawl_clean_dataset                                                          4282
preprocess_all_catalogue_datasets                                                   3146
modelling-metadata-c4-dataset-toy-add-metadata-full                                 2925
modelling-metadata-c4-dataset-toy-add-website-desc                                  2844
modelling-metadata-c4-dataset-export-to-jsonlines                                   2801
modelling-metadata-c4-dataset-toy-add-metadata-full-v3                              2761
filter_short_document_all_catalogue_datasets                                        1900
deduplicate_all_catalogue_datasets                                                  1121
modelling-metadata-c4-dataset-toy-add-metadata-full-v2                              873 
convert_datasets_to_jsonl                                                           609 
yong-download_dataset