In [17]:
# Import pandas
import pandas as pd

In [19]:
# Load the dataframe and print the shape of the data
data = pd.read_csv('manhattan_race_ethnicity_complete_hierarchy.csv')
print(f"Existing data loaded: {data.shape}")

Existing data loaded: (310, 183)


In [20]:
# Load the ethnicity key and print the shape of the ethnicity key
ethnicity_key = pd.read_csv('ethnicity_key.csv')
print(f"Ethnicity key loaded: {ethnicity_key.shape}")

Ethnicity key loaded: (187, 4)


In [22]:
# Define racial group structure
racial_groups = {
    'H': [],      # Hispanic/Latino
    'W': [],      # White  
    'B': [],      # Black
    'A': [],      # Asian
    'AIANA': [],  # American Indian/Alaska Native
    'NHPI': [],   # Native Hawaiian/Pacific Islander
    'SOR': []     # Some Other Race
}

In [23]:
# Map columns to racial groups
for col in data.columns:
    if col not in ['geo_id', 'geo_name', 'total_population']:
        matching_rows = ethnicity_key[ethnicity_key['Code'] == col]
        if len(matching_rows) > 0:
            race = matching_rows.iloc[0]['Race']
            if race in racial_groups:
                racial_groups[race].append(col)

In [24]:
# Display mapping results
for race, cols in racial_groups.items():
    print(f"{race}: {len(cols)} columns")

H: 27 columns
W: 68 columns
B: 34 columns
A: 28 columns
AIANA: 15 columns
NHPI: 5 columns
SOR: 3 columns


In [25]:
# Calculate racial totals
racial_totals = {}
for race_code, columns in racial_groups.items():
    if columns:
        racial_total = data[columns].sum(axis=1)
        racial_totals[race_code] = racial_total

In [26]:
# Display calculated totals
for race_code, racial_total in racial_totals.items():
    total_pop = racial_total.sum()
    tracts_with_pop = (racial_total > 0).sum()
    print(f"{race_code}: {total_pop:,} people across {tracts_with_pop} tracts")

H: 680,618.0 people across 291 tracts
W: 1,820,164.0 people across 301 tracts
B: 289,552.0 people across 287 tracts
A: 465,032.0 people across 297 tracts
AIANA: 6,909.0 people across 81 tracts
NHPI: 1,320.0 people across 5 tracts
SOR: 5,165.0 people across 132 tracts


In [27]:
# Initialise new dataset
new_data = data[['geo_id', 'geo_name', 'total_population']].copy()

In [28]:
# Define race order
race_order = [
    ('H', 'HISPANIC/LATINO'),
    ('W', 'WHITE'), 
    ('B', 'BLACK/AFRICAN AMERICAN'),
    ('A', 'ASIAN'),
    ('AIANA', 'AMERICAN INDIAN/ALASKA NATIVE'),
    ('NHPI', 'NATIVE HAWAIIAN/PACIFIC ISLANDER'),
    ('SOR', 'SOME OTHER RACE')
]

In [29]:
# Make new dataset structure
for race_code, race_name in race_order:
    columns = racial_groups[race_code]
    
    if columns:
        if race_code in racial_totals:
            new_data[race_code] = racial_totals[race_code]
        
        for col in columns:
            new_data[col] = data[col]

print(f"New dataset structure: {new_data.shape}")

New dataset structure: (310, 190)


  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[col] = data[col]
  new_data[race_code] = racial_totals[race_code]

In [30]:
# Save new dataset
new_data.to_csv('manhattan_race_ethnicity.csv', index=False)
print("Enhanced dataset saved to: manhattan_race_ethnicity.csv")

Enhanced dataset saved to: manhattan_race_ethnicity.csv


In [31]:
# Display demographic hierarchy
for race_code, race_name in race_order:
    columns = racial_groups[race_code]
    
    if columns and race_code in racial_totals:
        total_pop = racial_totals[race_code].sum()
        total_tracts = (racial_totals[race_code] > 0).sum()
        
        print(f"\n{race_name}")
        print(f"  MAIN TOTAL ({race_code}): {total_pop:,} people across {total_tracts} tracts")
        
        key_subcategories = columns[:10]
        for col in key_subcategories:
            if col in new_data.columns:
                label_rows = ethnicity_key[ethnicity_key['Code'] == col]
                if len(label_rows) > 0:
                    label = label_rows.iloc[0]['Label']
                    col_total = new_data[col].sum()
                    col_tracts = (new_data[col] > 0).sum()
                    if col_total > 0:
                        print(f"    ├── {label} ({col}): {col_total:,} people across {col_tracts} tracts")
        
        if len(columns) > 10:
            print(f"    └── ... and {len(columns) - 10} more subcategories")


HISPANIC/LATINO
  MAIN TOTAL (H): 680,618.0 people across 291 tracts
    ├── Mexican (HMex): 41,471.0 people across 273 tracts
    ├── Central American (HCA): 7,613.0 people across 53 tracts
    ├── Costa Rican (HCACstRcn): 70.0 people across 3 tracts
    ├── Guatemalan (HCAGutmln): 855.0 people across 25 tracts
    ├── Honduran (HCAHndrn): 2,692.0 people across 69 tracts
    ├── Nicaraguan (HCANcrgn): 425.0 people across 14 tracts
    ├── Panamanian (HCAPnmn): 470.0 people across 17 tracts
    ├── Salvadoran (HCASlvdrn): 2,296.0 people across 57 tracts
    ├── South American (HSA): 37,373.0 people across 184 tracts
    ├── Argentinean (HSAArgntn): 3,253.0 people across 94 tracts
    └── ... and 17 more subcategories

WHITE
  MAIN TOTAL (W): 1,820,164.0 people across 301 tracts
    ├── European (WEur): 554,719.0 people across 299 tracts
    ├── Albanian (WEurAlbn): 2,046.0 people across 64 tracts
    ├── Armenian (WEurArmn): 1,569.0 people across 53 tracts
    ├── Austrian (WEurAstrn)

In [32]:
# Calculate total Manhattan population
total_manhattan_pop = new_data['total_population'].sum()
print(f"\nTotal Manhattan population: {total_manhattan_pop:,}")


Total Manhattan population: 1,694,251


In [33]:
# Display racial counts
for race_code in racial_totals:
    race_total = racial_totals[race_code].sum()
    print(f"{race_code} total: {race_total:,}")

H total: 680,618.0
W total: 1,820,164.0
B total: 289,552.0
A total: 465,032.0
AIANA total: 6,909.0
NHPI total: 1,320.0
SOR total: 5,165.0


In [34]:
# Summary
print(f"\nFinal dataset saved: manhattan_race_ethnicity_.csv ({new_data.shape})")


Final dataset saved: manhattan_race_ethnicity_.csv ((310, 190))
