# Data Cleaning: GPU Clusters and AI Models
This notebook combines GPU cluster data with notable AI models data into a single dataset for visualization.

In [147]:
import pandas as pd
import re

## Load Datasets

In [148]:
gpu_clusters = pd.read_csv('../Data/gpu_clusters_dataset/gpu_clusters.csv')
notable_models = pd.read_csv('../Data/ai_models_dataset/notable_ai_models.csv')

In [149]:
# Check the original datasets
print(f"GPU Clusters shape: {gpu_clusters.shape}")
print(f"Notable Models shape: {notable_models.shape}")

GPU Clusters shape: (786, 55)
Notable Models shape: (939, 46)


In [150]:
gpu_clusters[gpu_clusters["Status"] == "Existing"]

Unnamed: 0,Name,Status,Certainty,Single cluster?,Max OP/s (log),H100 equivalents,Chip type (primary),Chip quantity (primary),Country,Owner,...,Cost Quote,Noteworthy,Decommissioned Date (if applicable),Largest existing cluster when first operational,% of largest cluster when first operational,Source 1,Source 2,Source 3,Source 4,Source 5
13,xAI Colossus Memphis Phase 3,Existing,Confirmed,Yes,20.737034,275795.856497,NVIDIA H100 SXM5 80GB,200000.0,United States of America,xAI,...,,True,,,,https://archive.ph/oHQSA,,,,
16,xAI Colossus Memphis Phase 2,Existing,Likely,Yes,20.597476,200000.000003,NVIDIA H100 SXM5 80GB,150000.0,United States of America,xAI,...,,,,xAI Colossus Memphis Phase 2,1.0,https://archive.ph/z39uN,https://web.archive.org/web/20241217184541/htt...,https://web.archive.org/web/20241217184541/htt...,https://www.youtube.com/watch?v=AUAJ82H12qs&t=...,
24,OpenAI/Microsoft Goodyear Arizona,Existing,Likely,Yes,20.296446,100000.000001,NVIDIA H100 SXM5 80GB,100000.0,United States of America,"Microsoft,OpenAI",...,,,,xAI Colossus Memphis Phase 1,1.0,https://web.archive.org/web/20241006181731/htt...,https://web.archive.org/web/20240609051243/htt...,https://www.youtube.com/watch?v=hobvps-H38o,,
25,Meta 100k,Existing,Likely,Yes,20.296446,100000.000001,NVIDIA H100 SXM5 80GB,100000.0,United States of America,Meta AI,...,,True,,xAI Colossus Memphis Phase 1,1.0,https://archive.ph/85KB1,https://web.archive.org/web/20240901171617/htt...,https://web.archive.org/web/2/https://www.toms...,https://web.archive.org/web/20241228090751/htt...,
26,xAI Colossus Memphis Phase 1,Existing,Confirmed,Yes,20.296446,100000.000001,NVIDIA H100 SXM5 80GB,100000.0,United States of America,xAI,...,,,,xAI Colossus Memphis Phase 1,1.0,https://archive.ph/Bh9Tq,https://web.archive.org/web/20240727065656/htt...,https://archive.ph/z39uN,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,Alibaba Cloud in Germany,Existing,Confirmed,No,,,,,Germany,Alibaba,...,,,,,,https://web.archive.org/web/20250820085405/htt...,,,,
776,Alibaba Cloud in United Kingdom,Existing,Confirmed,No,,,,,United Kingdom of Great Britain and Northern I...,Alibaba,...,,,,,,https://web.archive.org/web/20250820085405/htt...,,,,
777,Alibaba Cloud in UAE,Existing,Confirmed,No,,,,,United Arab Emirates,Alibaba,...,,,,,,https://web.archive.org/web/20250820085405/htt...,,,,
778,Alibaba Cloud in Saudi Arabia,Existing,Confirmed,No,,,,,Saudi Arabia,Alibaba,...,,,,,,https://pandaily.com/alibaba-clouds-jv-opens-t...,,,,


In [151]:
users = gpu_clusters["Users"]

for i in range(len(users)):
    print(users[i])

OpenAI,Microsoft
xAI
Meta
Mistral
Reliance Jio,Cloud
Cloud,Microsoft,OpenAI
OpenAI,Microsoft
Cloud,CoreWeave
Anthropic
OpenAI,Microsoft
Cloud
Cloud
Cloud
xAI
OpenAI
Cloud
xAI
Cloud,CoreWeave
xAI
Cloud
Cloud,Microsoft,OpenAI
nan
OpenAI,Microsoft
SK Telecom,Amazon
OpenAI
Meta
xAI
Tesla
Cloud,CoreWeave
Cloud
CoreWeave,Undisclosed Client
Cloud
nan
nan
nan
nan
nan
Cloud
Cloud
nan
Cloud
Argonne National Laboratory and general scientific community
Microsoft,Azure
Tesla
Tesla
US Government,Lawrence Livermore NL,Scientific Research
nan
CoreWeave
Tesla
NVIDIA
Cloud
nan
NVIDIA,CoreWeave
Cloud
US Government,Academia,Industry
Cloud
Cloud
Meta
Meta
nan
Inflection
nan
Taiwan National Science and Technology Council,TSMC,Foxconn
Cloud
Startups via cloud
nan
Cloud
nan
Cloud
Cloud
nan
Cloud,Cohere,xAI
Meta
Cloud
Cloud
Cloud,NVIDIA
Cloud
nan
OpenAI
Microsoft,Azure
Microsoft,Azure
Cloud
nan
US civilian research
nan
xAI,X
NVIDIA
Cloud
NVIDIA,CoreWeave
Microsoft
Meta
Swiss Universities
Cloud
Cloud
Google
Tes

## Prepare GPU Cluster Data

In [152]:
# Select and rename relevant columns for GPU clusters
# Only include existing clusters
gpu_data = gpu_clusters[gpu_clusters['Status'] == 'Existing'][['Name', 'Owner', 'First Operational Date', 'H100 equivalents']].copy()
gpu_data.columns = ['name', 'owner', 'date', 'total_compute_available']
gpu_data['type'] = 'cluster'
gpu_data['training_compute'] = pd.NA

print(f"Total existing GPU clusters: {len(gpu_data)}")
gpu_data.head()

Total existing GPU clusters: 609


Unnamed: 0,name,owner,date,total_compute_available,type,training_compute
13,xAI Colossus Memphis Phase 3,xAI,2025-07-22,275795.856497,cluster,
16,xAI Colossus Memphis Phase 2,xAI,2025-02-18,200000.000003,cluster,
24,OpenAI/Microsoft Goodyear Arizona,"Microsoft,OpenAI",2024-10-02,100000.000001,cluster,
25,Meta 100k,Meta AI,2024-10-30,100000.000001,cluster,
26,xAI Colossus Memphis Phase 1,xAI,2024-09-02,100000.000001,cluster,


## Prepare AI Model Data

In [153]:
# Select and rename relevant columns for AI models
model_data = notable_models[['Model', 'Organization', 'Publication date', 'Training compute (FLOP)']].copy()
model_data.columns = ['name', 'owner', 'date', 'training_compute']
model_data['type'] = 'model'
model_data['total_compute_available'] = pd.NA

model_data.head()

Unnamed: 0,name,owner,date,training_compute,type,total_compute_available
0,Veo 3.1,Google DeepMind,2025-10-15,,model,
1,Claude Haiku 4.5,Anthropic,2025-10-15,,model,
2,Ling-1T,Ant Group,2025-10-10,6.000001e+24,model,
3,GPT-5 Pro,OpenAI,2025-10-07,,model,
4,Sora 2.0,OpenAI,2025-09-30,,model,


## Combine Datasets

In [154]:
combined_data = pd.concat([model_data, gpu_data], ignore_index=True)
print(f"Combined dataset shape: {combined_data.shape}")
combined_data.head()

Combined dataset shape: (1548, 6)


  combined_data = pd.concat([model_data, gpu_data], ignore_index=True)


Unnamed: 0,name,owner,date,training_compute,type,total_compute_available
0,Veo 3.1,Google DeepMind,2025-10-15,,model,
1,Claude Haiku 4.5,Anthropic,2025-10-15,,model,
2,Ling-1T,Ant Group,2025-10-10,6.000001e+24,model,
3,GPT-5 Pro,OpenAI,2025-10-07,,model,
4,Sora 2.0,OpenAI,2025-09-30,,model,


## Clean Owner Names

In [155]:
def clean_owner_name(owner):
    if pd.isna(owner):
        return owner

    # Extract organization name from parentheses if present
    # e.g., "Stargate (OpenAI)" -> "OpenAI"
    paren_match = re.search(r'\(([^)]+)\)', owner)
    if paren_match:
        return paren_match.group(1)

    # Remove common suffixes and clean up
    owner = owner.strip()

    # Handle comma-separated multiple owners
    # Prioritize AI-first companies if present
    if ',' in owner:
        owners_list = [o.strip() for o in owner.split(',')]
        
        # AI-first companies to prioritize (in order of priority)
        ai_first_companies = ['OpenAI', 'Anthropic', 'xAI', 'Meta', 'Google', 'DeepMind',
                              'Alibaba', 'Mistral', 'Cohere', 'Inflection']
        
        # Check if any AI-first company is in the list
        for ai_company in ai_first_companies:
            for o in owners_list:
                if ai_company.lower() in o.lower():
                    owner = o
                    break
            else:
                continue
            break
        else:
            # No AI-first company found, take first owner
            owner = owners_list[0]

    # Common mappings to standardize names
    mappings = {
        'Microsoft': 'Microsoft',
        'Meta AI': 'Meta',  # Map Meta AI to Meta first
        'Meta': 'Meta',
        'Google DeepMind': 'Google DeepMind',  # Check full name first
        'DeepMind': 'Google DeepMind',  # Map DeepMind to Google DeepMind
        'Google': 'Google DeepMind',
        'Amazon': 'Amazon',
        'OpenAI': 'OpenAI',
        'Anthropic': 'Anthropic',
        'Ant Group': 'Alibaba',  # Ant Group is a subsidiary of Alibaba
        'Alibaba': 'Alibaba',
        'Tesla': 'Tesla',
        'xAI': 'xAI',
        'Oracle': 'Oracle',
        'Tencent': 'Tencent',
        'Mistral': 'Mistral',
        'Cohere': 'Cohere',
        'Inflection': 'Inflection AI'
    }

    # Check if owner contains any of the key organization names
    # Check for exact matches first, then partial matches
    for key, value in mappings.items():
        if owner.lower() == key.lower():
            return value
    
    for key, value in mappings.items():
        if key.lower() in owner.lower():
            return value

    return owner

In [156]:
# Apply cleaning to GPU cluster owners
mask_cluster = combined_data['type'] == 'cluster'
combined_data.loc[mask_cluster, 'owner'] = combined_data.loc[mask_cluster, 'owner'].apply(clean_owner_name)

## Data Type Conversions

In [157]:
# Convert date columns to datetime
combined_data['date'] = pd.to_datetime(combined_data['date'], errors='coerce')

# Convert training_compute to numeric
combined_data['training_compute'] = pd.to_numeric(combined_data['training_compute'], errors='coerce')

# Filter models to only include those with known training compute values
mask_model = combined_data['type'] == 'model'
mask_cluster = combined_data['type'] == 'cluster'

# Keep only models with non-null training_compute
models_with_compute = combined_data[mask_model & combined_data['training_compute'].notna()].copy()
all_clusters = combined_data[mask_cluster].copy()

print(f"Models with known training compute: {len(models_with_compute)}")
print(f"Total clusters: {len(all_clusters)}")

# Recombine
combined_data = pd.concat([models_with_compute, all_clusters], ignore_index=True)

# Filter to only include models/clusters with mutual ownership
# Get unique owners from each type
model_owners = set(combined_data[combined_data['type'] == 'model']['owner'].dropna().unique())
cluster_owners = set(combined_data[combined_data['type'] == 'cluster']['owner'].dropna().unique())

# Find intersection - owners that have both models and clusters
mutual_owners = model_owners & cluster_owners

print(f"\nOwners with both models (with compute) and existing clusters: {len(mutual_owners)}")
print(f"Mutual owners: {sorted(mutual_owners)}")

# Filter combined data to only include these mutual owners
combined_data = combined_data[combined_data['owner'].isin(mutual_owners)].copy()

print(f"\nAfter filtering:")
print(f"Total rows: {len(combined_data)}")
print(f"Models: {len(combined_data[combined_data['type'] == 'model'])}")
print(f"Clusters: {len(combined_data[combined_data['type'] == 'cluster'])}")

Models with known training compute: 505
Total clusters: 609

Owners with both models (with compute) and existing clusters: 21
Mutual owners: ['Alibaba', 'Amazon', 'EleutherAI', 'Google DeepMind', 'IBM', 'Inflection AI', 'Microsoft', 'NAVER', 'NVIDIA', 'OpenAI', 'Princeton University', 'Reka AI', 'Samsung', 'Saudi Aramco', 'Stability AI', 'Technology Innovation Institute', 'University of Cambridge', 'University of Oxford', 'University of Texas at Austin', 'Zhipu AI', 'xAI']

After filtering:
Total rows: 229
Models: 92
Clusters: 137


## Inspect Combined Data

In [158]:
print("\\n=== Final Dataset Summary ===")
print(f"Total rows: {len(combined_data)}")
print(f"Models: {len(combined_data[combined_data['type'] == 'model'])}")
print(f"Clusters: {len(combined_data[combined_data['type'] == 'cluster'])}")
print(f"\\nUnique owners: {combined_data['owner'].nunique()}")
print(f"\\nOwner distribution:")
combined_data.groupby('owner')['type'].value_counts().unstack(fill_value=0)

\n=== Final Dataset Summary ===
Total rows: 229
Models: 92
Clusters: 137
\nUnique owners: 21
\nOwner distribution:


type,cluster,model
owner,Unnamed: 1_level_1,Unnamed: 2_level_1
Alibaba,20,14
Amazon,7,5
EleutherAI,2,1
Google DeepMind,35,6
IBM,3,6
Inflection AI,2,2
Microsoft,19,8
NAVER,3,1
NVIDIA,22,10
OpenAI,4,19


In [159]:
# Display first few rows
combined_data#.head(10)

Unnamed: 0,name,owner,date,training_compute,type,total_compute_available
2,Qwen3-Omni-30B-A3B,Alibaba,2025-09-22,3.600000e+22,model,
3,AgentFounder-30B,Alibaba,2025-09-16,6.536700e+23,model,
4,Qwen3-Max,Alibaba,2025-09-05,1.512000e+25,model,
6,gpt-oss-120b,OpenAI,2025-08-05,4.940000e+24,model,
7,gpt-oss-20b,OpenAI,2025-08-05,5.490000e+23,model,
...,...,...,...,...,...,...
1108,Alibaba Cloud in Silicon Valley,Alibaba,2014-07-01,,cluster,
1109,Alibaba Cloud in Germany,Alibaba,2016-07-01,,cluster,
1110,Alibaba Cloud in United Kingdom,Alibaba,2018-07-01,,cluster,
1111,Alibaba Cloud in UAE,Alibaba,2016-07-01,,cluster,


In [160]:
# Check data types
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 229 entries, 2 to 1112
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   name                     229 non-null    object        
 1   owner                    229 non-null    object        
 2   date                     228 non-null    datetime64[ns]
 3   training_compute         92 non-null     float64       
 4   type                     229 non-null    object        
 5   total_compute_available  119 non-null    float64       
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 12.5+ KB


In [161]:
# Check for missing values
combined_data.isnull().sum()

name                         0
owner                        0
date                         1
training_compute           137
type                         0
total_compute_available    110
dtype: int64

In [162]:
# Sample of models
combined_data[combined_data['type'] == 'model']#.head()

Unnamed: 0,name,owner,date,training_compute,type,total_compute_available
2,Qwen3-Omni-30B-A3B,Alibaba,2025-09-22,3.600000e+22,model,
3,AgentFounder-30B,Alibaba,2025-09-16,6.536700e+23,model,
4,Qwen3-Max,Alibaba,2025-09-05,1.512000e+25,model,
6,gpt-oss-120b,OpenAI,2025-08-05,4.940000e+24,model,
7,gpt-oss-20b,OpenAI,2025-08-05,5.490000e+23,model,
...,...,...,...,...,...,...
488,NetTalk (transcription),Princeton University,1987-06-06,2.832800e+10,model,
489,NetTalk (dictionary),Princeton University,1987-06-06,2.766406e+10,model,
495,LTE speaker verification system,IBM,1966-11-01,1.059171e+08,model,
497,Print Recognition Logic,IBM,1963-01-01,2.250000e+07,model,


In [163]:
# Sample of clusters
combined_data[combined_data['type'] == 'cluster'].head()

Unnamed: 0,name,owner,date,training_compute,type,total_compute_available
505,xAI Colossus Memphis Phase 3,xAI,2025-07-22,,cluster,275795.856497
506,xAI Colossus Memphis Phase 2,xAI,2025-02-18,,cluster,200000.000003
507,OpenAI/Microsoft Goodyear Arizona,OpenAI,2024-10-02,,cluster,100000.000001
509,xAI Colossus Memphis Phase 1,xAI,2024-09-02,,cluster,100000.000001
523,AWS EC2 P5 UltraClusters,Amazon,2023-07-26,,cluster,20000.0


## Save Combined Dataset

In [None]:
combined_data.to_csv('./Data/combined_gpu_models.csv', index=False)
print("Data saved to ./Data/combined_gpu_models.csv")