In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative paths of the two dated CSV inputs that the following cells read.
celeb_heights_csv_name, notion_data_csv_name = (
    'data/celeb-heights-2025-03-13.csv',
    'data/notion-data-2025-12-23.csv',
)

In [3]:
# Load both input CSVs into DataFrames using pandas' default parsing options.
celeb_heights, notion_data = (
    pd.read_csv(csv_path)
    for csv_path in (celeb_heights_csv_name, notion_data_csv_name)
)

In [4]:
# Outer-join the two sources on Name. `_merge` records which side(s) each row came from.
# Both sides carry a Height column, so pandas' default suffixes apply:
# Height_x comes from celeb_heights (left), Height_y from notion_data (right).
celeb_subset = celeb_heights[['Name', 'Height', 'Image', 'URL']]
notion_subset = notion_data[['Name', 'Height', 'ID', 'Category']]
merged = celeb_subset.merge(notion_subset, on='Name', how='outer', indicator=True)

in_both = merged['_merge'] == 'both'
notion_only = merged['_merge'] == 'right_only'
has_notion_height = merged['Height_y'].notna()
keep_left_height = {'Height_x': 'Height'}

# Matched in both sources AND no height recorded on the Notion side:
# these rows take their Height (and Image/URL) from the celeb_heights columns.
combined = (
    merged[in_both & ~has_notion_height]
    .drop(columns=['Height_y'])
    .rename(columns=keep_left_height)
)

# Everything else that exists in Notion: rows found only there, plus any row
# whose Notion height is already filled in. Rows found only in celeb_heights
# satisfy neither selection and are left out of both frames.
unmatched = (
    merged[notion_only | has_notion_height]
    .drop(columns=['Height_y'])
    .rename(columns=keep_left_height)
)

In [5]:
# Column selection/order applied to every frame that is displayed or exported below.
new_order = [
    'ID',
    'Name',
    'Height',
    'Image',
    'Category',
    'Source',
]

In [6]:
# Tag every matched row with its provenance and keep only the export columns.
#
# Written as a single non-mutating expression so the cell can be re-run safely:
# the earlier in-place drop of `_merge` raised KeyError on a second run, and it
# was redundant because selecting `new_order` already leaves `_merge` (and URL) out.
# Each row gets its own one-element list instead of N references to one shared list.
# NOTE(review): Source is stored here as a Python list; confirm that this matches
# how the Source values loaded from the Notion CSV look once written to the final CSV.
combined = combined.assign(
    Source=[['Celeb Heights'] for _ in range(len(combined))]
)[new_order]
combined

Unnamed: 0,ID,Name,Height,Image,Category,Source
23,726.0,Aaron Paul,172.0,https://www.celebheights.com/tr/a/aaronpaul.jpg,[Acting],[Celeb Heights]
76,340.0,Adam Driver,189.0,https://www.celebheights.com/tr/a/adamdriver.jpg,[Acting],[Celeb Heights]
98,221.0,Adam Scott,175.0,https://www.celebheights.com/tr/a/adamscott.jpg,[Acting],[Celeb Heights]
120,282.0,Adolf Hitler,174.0,https://www.celebheights.com/tr/a/adolfhitler.jpg,"[Politics,Historical]",[Celeb Heights]
153,976.0,Afrojack,202.0,https://www.celebheights.com/tr/a/afrojack.jpg,[Music],[Celeb Heights]
...,...,...,...,...,...,...
14373,802.0,Woody Harrelson,177.0,https://www.celebheights.com/tr/w/woodyharrels...,[Acting],[Celeb Heights]
14382,912.0,XXXTentacion,168.0,https://www.celebheights.com/tr/x/xxxtentacion...,[Music],[Celeb Heights]
14436,957.0,Yung Gravy,198.0,https://www.celebheights.com/tr/y/yunggravy.jpg,[Music],[Celeb Heights]
14451,289.0,Zac Efron,173.0,https://www.celebheights.com/tr/z/zacefron.jpg,[Acting],[Celeb Heights]


In [7]:
# Pull the full Notion records for everyone that was NOT filled in from the
# celeb_heights match: people found only in Notion, or whose Notion height is set.
#
# The key names are derived from `merged` rather than from the current value of
# `unmatched`. This cell rebinds `unmatched`, and a later cell narrows it again,
# so reading `unmatched` here made a re-run produce an empty `custom`.
# The names are de-duplicated so a repeated name cannot multiply Notion rows in the join.
notion_sourced = (merged['_merge'] == 'right_only') | merged['Height_y'].notna()
notion_names = merged.loc[notion_sourced, 'Name'].drop_duplicates()

# Inner join on the shared `Name` column brings back every notion_data column.
unmatched = pd.merge(notion_names, notion_data)

# "Custom" entries are the Notion records that already have both a height and an image.
is_complete = unmatched['Height'].notna() & unmatched['Image'].notna()
custom = unmatched.loc[is_complete, new_order]
custom

Unnamed: 0,ID,Name,Height,Image,Category,Source
0,265,21 Savage,182.0,https://www.celebheights.com/tr/0/21savage.jpg,[Music],[Celeb Heights]
1,952,APL.DE.AP,168.0,https://upload.wikimedia.org/wikipedia/commons...,[Music],[IMDb]
2,14,Abraham Lincoln,192.0,https://www.celebheights.com/tr/a/abrahamlinco...,"[Presidents,Historical]",[Celeb Heights]
3,109,Adam Sandler,177.0,https://www.celebheights.com/tr/a/adamsandler.jpg,"[Acting,Comedy]",[Celeb Heights]
4,199,Adam Savage,179.0,https://www.celebheights.com/tr/a/adamsavage.jpg,"[Influencers,Acting]",[Celeb Heights]
...,...,...,...,...,...,...
416,740,Zack Snyder,170.0,https://upload.wikimedia.org/wikipedia/commons...,[Filmmakers],[IMDb]
417,156,Zendaya,174.0,https://www.celebheights.com/tr/z/zendaya.jpg,[Acting],[Celeb Heights]
418,1007,bbno$,175.0,https://upload.wikimedia.org/wikipedia/commons...,[Music],[Google]
419,791,deadmau5,175.0,https://upload.wikimedia.org/wikipedia/commons...,[Music],[IMDb]


In [8]:
# Keep only the records still missing a height or an image (the complement of
# the rows selected into `custom`), restricted to the export columns.
is_incomplete = ~(unmatched['Height'].notna() & unmatched['Image'].notna())
unmatched = unmatched.loc[is_incomplete, new_order]
unmatched

Unnamed: 0,ID,Name,Height,Image,Category,Source
14,671,Alexander the Great,,,[Historical],[]
24,180,Aristotle,,,[Historical],[]
32,888,Ben E. King,,,[Music],[]
39,982,Big Shaq,,,[Music],[]
60,665,Charles de Gaulle,,,[Historical],[]
80,189,Dev,171.0,,[Other],[Us]
83,97,Dixie D'Amelio,167.0,,[Influencers],[IMDb]
84,954,Doechii,157.0,,[Music],[IMDb]
96,501,Elizabeth I,,,[Historical],[]
109,950,Fergie,,,[Music],[]


In [9]:
# Stack the celeb_heights-filled rows on top of the complete Notion rows, order
# by ID, and cast ID to int (it is float in `combined`, as the displayed 726.0 shows).
final = (
    pd.concat([combined, custom])
    .sort_values(by='ID')
    .astype({'ID': int})
)

# Export without the DataFrame index.
final.to_csv('data/final_data.csv', index=False)

In [10]:
# Export the records still missing a height or an image, without the DataFrame index.
unmatched.to_csv(path_or_buf='data/unmatched_data.csv', index=False)