In [41]:
import pandas as pd
from census import Census
import pprint
# from us import states

c = Census("c4f3ae4d0e06d23cd971a41b448944090bf72d92")

class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'
    
pp = pprint.PrettyPrinter(indent=4)

From https://waysandmeans.house.gov/wp-content/uploads/2017/02/20170215HR-Testimony-Kneebone.pdf

For the purposes of this analysis, “cities” refer to primary cities in the nation’s 100 most populous metropolitan statistical areas (MSAs). A primary city is defined as the first named city in the official MSA title and any other city in the MSA name that has a population of 100,000 or more, which means a region could have between one and three primary cities. The suburbs are defined as the remainder of the region within the MSA but outside of the primary city or cities. Small metro areas comprise all other MSAs outside the top 100, and rural areas include all counties that are not a part of an MSA.

## 1. Get 100 most populous metropolitan statistical areas (ACS-2015)

In [3]:
# total population (ACS5): B01003_001E
pop_key = 'B01003_001E'
msas_2015 = c.acs5.get(('NAME', pop_key), {'for': 'metropolitan statistical area/micropolitan statistical area:*'}, year=2015)
msas_2015 = pd.DataFrame(msas_2015)
msas_2015.columns = ['population', 'name', 'msa_code']

# filter to metro areas and sort by population
metros_2015 = msas_2015[msas_2015.name.str.contains('Metro Area')]
top_100_metros_2015 = msas_2015.sort_values(by='population', ascending=False).iloc[:100]

display(top_100_metros_2015.head())

Unnamed: 0,population,name,msa_code
608,19979950.0,"New York-Newark-Jersey City, NY-NJ-PA Metro Area",35620
499,13154457.0,"Los Angeles-Long Beach-Anaheim, CA Metro Area",31080
170,9534008.0,"Chicago-Naperville-Elgin, IL-IN-WI Metro Area",16980
218,6833420.0,"Dallas-Fort Worth-Arlington, TX Metro Area",19100
389,6346653.0,"Houston-The Woodlands-Sugar Land, TX Metro Area",26420


## 2. From those 100 areas, get the "primary cities" within those areas

In [75]:
states = c.acs5.get('NAME', {'for': 'state:*', 'in': 'metropolitan statistical area/micropolitan statistical area:35620'})

def _get_states(msa_id):
    return c.acs5.get(
        'NAME', 
        {'for': 'state:*', 
         'in': f'metropolitan statistical area/micropolitan statistical area:{msa_id}'}
    )

def _get_cities(msa_id, state_id):
    return c.acs5.get(
        ('NAME', pop_key),
        {'for': 'principal city:*', 
         'in': f'metropolitan statistical area/micropolitan statistical area:{msa_id} state: {state_id}'}
    )

def _clean_city_name(city_name):
    repls = ('city',
             'CDP',
             'metropolitan government',
             'metro government',
             '(balance)',
             'consolidated government')
    for r in repls:
        city_name = city_name.replace(f' {r}', '')

    if city_name == "San Buenaventura (Ventura)":
        city_name = 'Ventura'
        
    return city_name

def _is_primary_city(city_object):
    full_city_name, metro_name = city_object['NAME'].split('; ')
    city_name, state_name = full_city_name.split(', ')
    city_name = _clean_city_name(city_name)
    
#     if (city_object[pop_key] >= 1e5):
#         print(color.BOLD + f'{city_name}, {state_name}' + color.END)
#     else:
#         print(f'{city_name}, {state_name}')
        
    return (city_name in metro_name and city_object[pop_key] >= 1e5)

def _get_primary_cities_from_state(msa_id, state_id):
    cities = _get_cities(msa_id, state_id)
    return [city for city in cities if _is_primary_city(city)]

flatten = lambda l: [item for sublist in l for item in sublist]

def get_primary_cities(msa_id):
    states = _get_states(msa_id)
    primary_cities = [_get_primary_cities_from_state(msa_id, state_id=s['state']) for s in states]
    return flatten(primary_cities)

# for msa_code in top_100_metros_2015.msa_code.values[92:]:
#     pp.pprint(get_primary_cities(msa_code))
#     print()

primary_cities = flatten([get_primary_cities(msa_code) for msa_code in top_100_metros_2015.msa_code.values[2:3]])
primary_cities = pd.DataFrame(primary_cities)
display(primary_cities.head())

Unnamed: 0,B01003_001E,NAME,metropolitan statistical area/micropolitan statistical area,principal city,state
0,2714017.0,"Chicago city, IL; Chicago-Naperville-Elgin, IL...",16980,14000,17
1,111919.0,"Elgin city, IL; Chicago-Naperville-Elgin, IL-I...",16980,23074,17
2,145789.0,"Naperville city, IL; Chicago-Naperville-Elgin,...",16980,51622,17


In [73]:
msa_df = pd.read_excel('list1_Sep_2018.xls', header=2)
metros_df = msa_df[msa_df['Metropolitan/Micropolitan Statistical Area'] == 'Metropolitan Statistical Area']
metros_df[metros_df['CBSA Code'] == '35620']

Unnamed: 0,CBSA Code,Metropolitan Division Code,CSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,Metropolitan Division Title,CSA Title,County/County Equivalent,State Name,FIPS State Code,FIPS County Code,Central/Outlying County
1205,35620,35004.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Nassau County-Suffolk County, NY","New York-Newark, NY-NJ-CT-PA",Nassau County,New York,36.0,59.0,Central
1206,35620,35004.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Nassau County-Suffolk County, NY","New York-Newark, NY-NJ-CT-PA",Suffolk County,New York,36.0,103.0,Central
1207,35620,35084.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Newark, NJ-PA","New York-Newark, NY-NJ-CT-PA",Essex County,New Jersey,34.0,13.0,Central
1208,35620,35084.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Newark, NJ-PA","New York-Newark, NY-NJ-CT-PA",Hunterdon County,New Jersey,34.0,19.0,Central
1209,35620,35084.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Newark, NJ-PA","New York-Newark, NY-NJ-CT-PA",Morris County,New Jersey,34.0,27.0,Central
1210,35620,35084.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Newark, NJ-PA","New York-Newark, NY-NJ-CT-PA",Sussex County,New Jersey,34.0,37.0,Central
1211,35620,35084.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Newark, NJ-PA","New York-Newark, NY-NJ-CT-PA",Union County,New Jersey,34.0,39.0,Central
1212,35620,35084.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"Newark, NJ-PA","New York-Newark, NY-NJ-CT-PA",Pike County,Pennsylvania,42.0,103.0,Outlying
1213,35620,35154.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"New Brunswick-Lakewood, NJ","New York-Newark, NY-NJ-CT-PA",Middlesex County,New Jersey,34.0,23.0,Central
1214,35620,35154.0,408.0,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,"New Brunswick-Lakewood, NJ","New York-Newark, NY-NJ-CT-PA",Monmouth County,New Jersey,34.0,25.0,Central
