# Training Time Analysis
Estimate how long each rank-1 GPU cluster would take to train the average of the top 10 frontier models (ranked by training compute) published before that cluster became operational.

In [15]:
import pandas as pd
import numpy as np

## Load Datasets

In [16]:
# Read both raw datasets from the shared Data directory and report their sizes.
print("Loading datasets...")
frontier_models_csv = '../Data/ai_models_dataset/frontier_ai_models.csv'
gpu_clusters_csv = '../Data/gpu_clusters_dataset/gpu_clusters.csv'

frontier_models = pd.read_csv(frontier_models_csv)
gpu_clusters = pd.read_csv(gpu_clusters_csv)

n_models, n_clusters = len(frontier_models), len(gpu_clusters)
print(f"Initial frontier models: {n_models}")
print(f"Initial GPU clusters: {n_clusters}")

Loading datasets...
Initial frontier models: 131
Initial GPU clusters: 786


## Filter Frontier Models
Keep only models with known training compute

In [17]:
# Parse training compute BEFORE filtering: checking notna() first would let
# non-numeric entries through, and they would then turn into NaN on coercion.
training_compute = pd.to_numeric(
    frontier_models['Training compute (FLOP)'],
    errors='coerce'
)

# Keep only models with a known, strictly positive training compute
# (NaN compares False, and log10 is undefined for values <= 0).
has_usable_compute = training_compute > 0
frontier_models_filtered = frontier_models[has_usable_compute].copy()
frontier_models_filtered['Training compute (FLOP)'] = training_compute[has_usable_compute]

print(f"Frontier models with known training compute: {len(frontier_models_filtered)}")

# Store log10 of the compute alongside the raw value
frontier_models_filtered['Training compute (log)'] = np.log10(
    frontier_models_filtered['Training compute (FLOP)']
)

# Convert publication date to datetime; unparseable dates become NaT
frontier_models_filtered['Publication date'] = pd.to_datetime(
    frontier_models_filtered['Publication date'],
    errors='coerce'
)

Frontier models with known training compute: 125


In [18]:
# Preview the first ten filtered models, restricted to the columns this analysis uses
model_preview_columns = [
    'Model',
    'Organization',
    'Publication date',
    'Training compute (FLOP)',
    'Training compute (log)',
]
frontier_models_filtered[model_preview_columns].head(10)

Unnamed: 0,Model,Organization,Publication date,Training compute (FLOP),Training compute (log)
0,Theseus,Bell Laboratories,1950-07-02,40.0,1.60206
1,Perceptron Mark I,"Cornell Aeronautical Laboratory,Cornell Univer...",1957-01-01,694894.9,5.841919
2,Pandemonium (morse),Massachusetts Institute of Technology (MIT),1959-02-01,600000000.0,8.778151
3,Samuel Neural Checkers,IBM,1959-07-01,428400000.0,8.631849
4,Perceptron (1960),Cornell Aeronautical Laboratory,1960-03-30,720000000.0,8.857332
5,ADALINE,Stanford University,1960-06-30,6600.0,3.819544
6,Linear Decision Functions,Bell Laboratories,1962-06-01,1559250.0,6.192916
7,Print Recognition Logic,IBM,1963-01-01,22500000.0,7.352183
8,LTE speaker verification system,IBM,1966-11-01,105917100.0,8.024966
9,Neocognitron,NHK Broadcasting Science Research Laboratories,1980-04-01,273738200.0,8.437335


In [19]:
# Summarize the filtered frame: index range, column names, non-null counts and dtypes
frontier_models_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 125 entries, 0 to 130
Data columns (total 97 columns):
 #   Column                                           Non-Null Count  Dtype         
---  ------                                           --------------  -----         
 0   Model                                            125 non-null    object        
 1   Domain                                           124 non-null    object        
 2   Task                                             125 non-null    object        
 3   Authors                                          112 non-null    object        
 4   Notability criteria                              118 non-null    object        
 5   Notability criteria notes                        37 non-null     object        
 6   Model accessibility                              62 non-null     object        
 7   Link                                             125 non-null    object        
 8   Citations                                    

## Filter GPU Clusters
Keep only rank-1 clusters with known 16-bit OP/s

In [20]:
# Parse 16-bit OP/s BEFORE filtering: checking notna() first would let
# non-numeric entries through, and they would then turn into NaN on coercion.
ops_log_numeric = pd.to_numeric(
    gpu_clusters['16-bit OP/s (log)'],
    errors='coerce'
)

# Keep only clusters that were rank 1 when first operational and have a usable 16-bit OP/s value
is_rank_one_with_ops = (
    (gpu_clusters['Rank when first operational'] == 1) &
    ops_log_numeric.notna()
)
gpu_clusters_filtered = gpu_clusters[is_rank_one_with_ops].copy()
gpu_clusters_filtered['16-bit OP/s (log)'] = ops_log_numeric[is_rank_one_with_ops]

print(f"Rank-1 GPU clusters with known 16-bit OP/s: {len(gpu_clusters_filtered)}")

# Convert first operational date to datetime; unparseable dates become NaT
gpu_clusters_filtered['First Operational Date'] = pd.to_datetime(
    gpu_clusters_filtered['First Operational Date'],
    errors='coerce'
)

Rank-1 GPU clusters with known 16-bit OP/s: 18


In [21]:
# Show every filtered rank-1 cluster with the columns this analysis uses
gpu_clusters_filtered.loc[:, ['Name', 'Owner', 'First Operational Date', '16-bit OP/s (log)']]

Unnamed: 0,Name,Owner,First Operational Date,16-bit OP/s (log)
16,xAI Colossus Memphis Phase 2,xAI,2025-02-18,20.296413
24,OpenAI/Microsoft Goodyear Arizona,"Microsoft,OpenAI",2024-10-02,19.995372
25,Meta 100k,Meta AI,2024-10-30,19.995372
26,xAI Colossus Memphis Phase 1,xAI,2024-09-02,19.995372
47,CoreWeave H200s,CoreWeave,2024-08-28,19.618665
57,Meta GenAI 2024a,Meta AI,2024-03-12,19.385883
58,Meta GenAI 2024b,Meta AI,2024-03-12,19.385883
78,Microsoft GPT-4 cluster,"Microsoft,OpenAI",2022-04-30,18.892095
81,Microsoft Azure Eagle,Microsoft,2023-11-15,19.153734
83,Oak Ridge NL Summit,US Department of Energy,2018-06-08,18.538574


In [22]:
# List the rank-1 clusters in chronological order of first operation
cluster_overview = gpu_clusters_filtered[['Name', 'Owner', 'First Operational Date', '16-bit OP/s (log)']]
cluster_overview.sort_values(by='First Operational Date')

Unnamed: 0,Name,Owner,First Operational Date,16-bit OP/s (log)
208,Google TensorFlow Research Cloud,Google,2017-05-17,17.265572
90,Meta 2017 V100 Cluster,Meta AI,2017-10-01,18.439333
83,Oak Ridge NL Summit,US Department of Energy,2018-06-08,18.538574
617,Anonymized Chinese System,,2021-03-15,18.778151
132,Anonymized Chinese System,,2021-07-15,18.477121
78,Microsoft GPT-4 cluster,"Microsoft,OpenAI",2022-04-30,18.892095
95,Tesla 10k H100 Cluster,Tesla,2023-08-28,18.995372
97,Imbue 10k Cluster,Imbue,2023-09-07,18.995372
89,Microsoft Azure MLPerf 3.1 Submission,Microsoft,2023-11-08,19.026861
88,NVIDIA CoreWeave Eos-DFW Phase 1,"NVIDIA,CoreWeave",2023-11-08,19.026861


## Calculate Top 10 Models and Training Time for Each Cluster
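For each rank-1 cluster, find the top 10 models (ranked by training compute) released before that cluster's operational date. The training time is then estimated as $t = 10^{\,\overline{\log_{10} C} - \log_{10} P}$ seconds, where $\overline{\log_{10} C}$ is the mean of the log10 training compute (FLOP) of those models (i.e. the log of their geometric mean) and $P$ is the cluster's 16-bit OP/s. No utilization factor is applied.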

In [23]:
# Function to get top 10 models before a given date
def get_top_10_models_before_date(models_df, cutoff_date, top_n=10):
    """Return the largest models (by training compute) published before a date.

    Parameters
    ----------
    models_df : DataFrame with 'Publication date' and
        'Training compute (FLOP)' columns.
    cutoff_date : only rows whose 'Publication date' is strictly earlier
        than this value are considered.
    top_n : maximum number of rows to return (default 10).

    Returns
    -------
    DataFrame with at most ``top_n`` rows, ordered by descending
    'Training compute (FLOP)'. When fewer than ``top_n`` rows qualify, a
    warning is printed and all qualifying rows are returned.
    """
    published_earlier = models_df['Publication date'] < cutoff_date
    candidates = models_df[published_earlier]
    available = len(candidates)

    if available < top_n:
        print(f"Warning: Only {available} models found before {cutoff_date}")

    # Never request more rows than actually qualify
    return candidates.nlargest(min(available, top_n), 'Training compute (FLOP)')

# For every rank-1 cluster, estimate how long it would take to train the
# average of the largest models published before the cluster came online.
results = []

cluster_fields = gpu_clusters_filtered[
    ['Name', 'Owner', 'First Operational Date', '16-bit OP/s (log)']
]
for name, owner, operational_date, ops_log in cluster_fields.itertuples(index=False, name=None):
    # Largest models (by training compute) published before this cluster went live
    reference_models = get_top_10_models_before_date(frontier_models_filtered, operational_date)

    if not len(reference_models):
        print(f"No models found before {name} ({operational_date})")
        continue

    # NOTE(review): the duration below is derived from the mean of the log10
    # values (the geometric mean of compute); the arithmetic mean is only
    # stored for reference. Confirm this is the intended "average".
    mean_log_compute = reference_models['Training compute (log)'].mean()
    mean_compute = reference_models['Training compute (FLOP)'].mean()

    # log10(seconds) = log10(FLOP) - log10(OP/s); no utilization factor is applied
    duration_sec = 10 ** (mean_log_compute - ops_log)
    duration_min = duration_sec / 60
    duration_hrs = duration_min / 60
    duration_days = duration_hrs / 24

    record = {
        'Name': name,
        'Owner': owner,
        'First Operational Date': operational_date,
        '16-bit OP/s (log)': ops_log,
        'Num models available': len(reference_models),
        'Avg training compute (log)': mean_log_compute,
        'Avg training compute (FLOP)': mean_compute,
        'Training time (seconds)': duration_sec,
        'Training time (minutes)': duration_min,
        'Training time (hours)': duration_hrs,
        'Training time (days)': duration_days,
    }
    results.append(record)

# One row per cluster for which at least one earlier model exists
gpu_clusters_results = pd.DataFrame(results)

print(f"Calculated training times for {len(gpu_clusters_results)} clusters")
gpu_clusters_results

Calculated training times for 18 clusters


Unnamed: 0,Name,Owner,First Operational Date,16-bit OP/s (log),Num models available,Avg training compute (log),Avg training compute (FLOP),Training time (seconds),Training time (minutes),Training time (hours),Training time (days)
0,xAI Colossus Memphis Phase 2,xAI,2025-02-18,20.296413,10,25.385034,5.5575e+25,122636.89727,2043.948288,34.065805,1.419409
1,OpenAI/Microsoft Goodyear Arizona,"Microsoft,OpenAI",2024-10-02,19.995372,10,25.189622,2.0964e+25,156404.872723,2606.747879,43.445798,1.810242
2,Meta 100k,Meta AI,2024-10-30,19.995372,10,25.189622,2.0964e+25,156404.872723,2606.747879,43.445798,1.810242
3,xAI Colossus Memphis Phase 1,xAI,2024-09-02,19.995372,10,25.189622,2.0964e+25,156404.872723,2606.747879,43.445798,1.810242
4,CoreWeave H200s,CoreWeave,2024-08-28,19.618665,10,25.189622,2.0964e+25,372354.919686,6205.915328,103.431922,4.309663
5,Meta GenAI 2024a,Meta AI,2024-03-12,19.385883,10,24.817023,1.099855e+25,269860.927652,4497.682128,74.961369,3.12339
6,Meta GenAI 2024b,Meta AI,2024-03-12,19.385883,10,24.817023,1.099855e+25,269860.927652,4497.682128,74.961369,3.12339
7,Microsoft GPT-4 cluster,"Microsoft,OpenAI",2022-04-30,18.892095,10,23.949721,1.13828e+24,114189.463115,1903.157719,31.719295,1.321637
8,Microsoft Azure Eagle,Microsoft,2023-11-15,19.153734,10,24.627878,5.50455e+24,297950.233956,4965.837233,82.763954,3.448498
9,Oak Ridge NL Summit,US Department of Energy,2018-06-08,18.538574,10,21.059649,2.26746e+21,331.951713,5.532529,0.092209,0.003842


## Inspect Results

In [24]:
# Show the per-cluster training-time estimates in chronological order
gpu_clusters_results.sort_values(by='First Operational Date')

Unnamed: 0,Name,Owner,First Operational Date,16-bit OP/s (log),Num models available,Avg training compute (log),Avg training compute (FLOP),Training time (seconds),Training time (minutes),Training time (hours),Training time (days)
16,Google TensorFlow Research Cloud,Google,2017-05-17,17.265572,10,20.530759,1.211781e+21,1841.56158,30.692693,0.511545,0.021314
12,Meta 2017 V100 Cluster,Meta AI,2017-10-01,18.439333,10,20.785173,1.348342e+21,221.738003,3.695633,0.061594,0.002566
9,Oak Ridge NL Summit,US Department of Energy,2018-06-08,18.538574,10,21.059649,2.26746e+21,331.951713,5.532529,0.092209,0.003842
17,Anonymized Chinese System,,2021-03-15,18.778151,10,22.836532,9.04704e+22,11438.82032,190.647005,3.17745,0.132394
15,Anonymized Chinese System,,2021-07-15,18.477121,10,23.013686,2.422084e+23,34400.486139,573.341436,9.555691,0.398154
7,Microsoft GPT-4 cluster,"Microsoft,OpenAI",2022-04-30,18.892095,10,23.949721,1.13828e+24,114189.463115,1903.157719,31.719295,1.321637
13,Tesla 10k H100 Cluster,Tesla,2023-08-28,18.995372,10,24.573611,5.10597e+24,378650.958223,6310.849304,105.180822,4.382534
14,Imbue 10k Cluster,Imbue,2023-09-07,18.995372,10,24.600018,5.27727e+24,402388.963315,6706.482722,111.774712,4.65728
11,Microsoft Azure MLPerf 3.1 Submission,Microsoft,2023-11-08,19.026861,10,24.627878,5.50455e+24,399040.491905,6650.674865,110.844581,4.618524
10,NVIDIA CoreWeave Eos-DFW Phase 1,"NVIDIA,CoreWeave",2023-11-08,19.026861,10,24.627878,5.50455e+24,399040.491905,6650.674865,110.844581,4.618524


## Example: Check Top 10 Models for a Specific Cluster

In [25]:
# Spot-check one result: list the reference models behind the chronologically
# earliest cluster.
if len(gpu_clusters_results):
    chronological_results = gpu_clusters_results.sort_values('First Operational Date')
    example_cluster = chronological_results.iloc[0]
    example_date = example_cluster['First Operational Date']

    print(f"Cluster: {example_cluster['Name']}")
    print(f"Operational Date: {example_date}")
    print("\nTop 10 models released before this date:")

    top_10_example = get_top_10_models_before_date(frontier_models_filtered, example_date)
    example_columns = [
        'Model',
        'Organization',
        'Publication date',
        'Training compute (FLOP)',
        'Training compute (log)',
    ]
    print(top_10_example[example_columns])

Cluster: Google TensorFlow Research Cloud
Operational Date: 2017-05-17 00:00:00

Top 10 models released before this date:
                    Model                            Organization  \
67                   GNMT                                  Google   
69       NASv3 (CIFAR-10)                            Google Brain   
65            AlphaGo Lee                                DeepMind   
68               Xception                                  Google   
61            AlphaGo Fan                                DeepMind   
59               SNM-skip                                  Google   
66   BIG LSTM+CNN INPUTS                             Google Brain   
63           Inception v3  Google,University College London (UCL)   
58           Seq2Seq LSTM                                  Google   
64  DeepSpeech2 (English)  Baidu Research - Silicon Valley AI Lab   

   Publication date  Training compute (FLOP)  Training compute (log)  
67       2016-09-26             6.620000e+21   

In [26]:
## Summary Statistics

## Final Dataframes
- `frontier_models_filtered`: Frontier models with known training compute
- `gpu_clusters_results`: Rank-1 clusters with calculated training times based on their top 10 models

In [27]:
# Report median / mean / min / max training time across all clusters,
# each in hours and in days.
print("=== Training Time Summary Statistics ===")
time_in_hours = gpu_clusters_results['Training time (hours)']
time_in_days = gpu_clusters_results['Training time (days)']

for label, statistic in (("Median", "median"), ("Mean", "mean"), ("Min", "min"), ("Max", "max")):
    hours_value = getattr(time_in_hours, statistic)()
    days_value = getattr(time_in_days, statistic)()
    print(f"{label} training time: {hours_value:.2f} hours ({days_value:.2f} days)")

=== Training Time Summary Statistics ===
Median training time: 43.45 hours (1.81 days)
Mean training time: 54.68 hours (2.28 days)
Min training time: 0.06 hours (0.00 days)
Max training time: 111.77 hours (4.66 days)


In [28]:
# Descriptive statistics for the training-time estimates and the number of
# reference models used per cluster
distribution_columns = ['Training time (hours)', 'Training time (days)', 'Num models available']
gpu_clusters_results[distribution_columns].describe()

Unnamed: 0,Training time (hours),Training time (days),Num models available
count,18.0,18.0,18.0
mean,54.682461,2.278436,10.0
std,42.719248,1.779969,0.0
min,0.061594,0.002566,10.0
25%,15.096592,0.629025,10.0
50%,43.445798,1.810242,10.0
75%,98.26493,4.094372,10.0
max,111.774712,4.65728,10.0


## Save Results to CSV

In [29]:
# Write the per-cluster estimates to disk (row index omitted) and confirm the location
training_times_csv = '../Data/gpu_cluster_training_times.csv'
gpu_clusters_results.to_csv(training_times_csv, index=False)
print(f"Results saved to {training_times_csv}")

Results saved to ../Data/gpu_cluster_training_times.csv
