## This notebook prepare raw data into network data for feature extraction

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
from datetime import date
from glob import glob

from difflib import get_close_matches 

## Creating US air networks from source data

In [2]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}


def state_to_abbr(state):
    return(us_state_abbrev[state]).lower()

In [3]:
import re
import unicodedata

def strip_accents(text):
    """
    Strip accents from input String.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError): # unicode is a default on python 3 
        pass
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)

def text_to_id(text):
    """
    Convert input text to id.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    text = strip_accents(text.lower())
    text = re.sub(r"\d", "", text) 
    text=re.sub(r"^\s+", "", text) 
    text=re.sub(r"\s+$", "", text)
    text = re.sub(r"\s+","_", text, flags = re.I)
    #text = re.sub('[ ]+', '_', text)
    text = re.sub('[^a-zA-Z_-]', '', text)
    return text

In [4]:
# get distance dataframe
f = '../data/us_air_raw_data/770841524_T_T100D_MARKET_ALL_CARRIER-1.csv'   
df=pd.read_csv(f)
# df = df[df.MONTH==12]
df=df[['YEAR','MONTH','ORIGIN_CITY_NAME','DEST_CITY_NAME','PASSENGERS','DISTANCE']]
df=df.rename(index=str, columns={"ORIGIN_CITY_NAME": "source", "DEST_CITY_NAME": "target",'PASSENGERS':'weight','DISTANCE':'distance'})
df['source']=df.apply(lambda row: text_to_id(str(row.source)), axis=1)
df['target']=df.apply(lambda row: text_to_id(str(row.target)), axis=1)
# df=df[df.weight !=0 ].reset_index()
df=df[['source','target','weight','distance']]
df = df.drop_duplicates(subset=['source','target']).reset_index().drop(columns=['index'])
df.to_pickle('../data/us_air_distance.pkl')
df.head()

Unnamed: 0,source,target,weight,distance
0,new_orleans_la,islip_ny,0.0,1218.0
1,new_orleans_la,salt_lake_city_ut,0.0,1428.0
2,portland_or,fort_lauderdale_fl,0.0,2694.0
3,raleighdurham_nc,san_francisco_ca,0.0,2400.0
4,seattle_wa,wichita_ks,0.0,1430.0


In [5]:
import requests
import json

def get_city_opendata(city, state):
    cmd = f'https://public.opendatasoft.com/api/records/1.0/search/?dataset=cities-and-towns-of-the-united-states&q={city}&sort=pop_2010&facet=state&refine.state={state}'

    res = requests.get(cmd)
    dct = json.loads(res.content)
    records = dct['records']
#     if len(records)>1:
#         record = [i for i in records if i['fields']['name'].upper()== city.upper()]
#         output = record[0]['fields']['pop_2010']
#     else:
    output = records[0]['fields']['pop_2010']
#     return output
    return records


In [51]:
def get_population(node):    
    full_name = node.replace('/','-')
    search_name =  node.replace('/','-').replace(',',' ')
    cmd = f'https://api.censusreporter.org/1.0/geo/search?q={search_name}'
    res = requests.get(cmd)
    dct = json.loads(res.content)
#     print(dct)
    numbers = []
    for item in list(dct['results']):
        if item['full_name'].split(',')[-1]==full_name.split(',')[-1]:
            numbers.append(item['full_geoid'])
    pop = []
    for number in  numbers: 
        cmd = f'https://api.censusreporter.org/1.0/geo/tiger2019/{number}'
        res = requests.get(cmd)
        dct = json.loads(res.content)
        pop.append(dct['properties']['population'])
    return(max(pop))

In [12]:
f = '../data/us_air_raw_data/770841524_T_T100D_MARKET_ALL_CARRIER-1.csv'   
df=pd.read_csv(f)
df = df[df.MONTH==12]
df=df[['ORIGIN_CITY_NAME','DEST_CITY_NAME','PASSENGERS','DISTANCE']]
df=df.rename(index=str, columns={"ORIGIN_CITY_NAME": "source", "DEST_CITY_NAME": "target",'PASSENGERS':'weight','DISTANCE':'distance'})
cities = list(set(list(df.source)+list(df.target)))

In [13]:
cities[2]

'Muscle Shoals, AL'

In [14]:
get_population(cities[2])

13874

In [52]:
pop_data = dict()
failed_cities = []
for node in cities:
    try:
        pop_data[node]=get_population(node)
    except:
        failed_cities.append(node)

for node in failed_cities:
    try:
        city = node.split(',')[0].split('/')[0]
        state = node.split(',')[1]
        pop_data[node]=get_population()
    except:
        print(node)

Arcata/Eureka, CA
Ofu, TT
Lawton/Fort Sill, OK
Sarasota/Bradenton, FL
Beaumont/Port Arthur, TX
Sun Valley/Hailey/Ketchum, ID
Minneapolis/St. Paul, MN
Champaign/Urbana, IL
Naukiti, AK
New Bedford/Fall River, MA
Wake Island, TT
Oceana, VA
Charleston/Dunbar, WV
Ithaca/Cortland, NY
Raleigh/Durham, NC
West Point, AK
Manhattan/Ft. Riley, KS
Akun, AK
Youngstown/Warren, OH
Hattiesburg/Laurel, MS
Seal Bay, AK
Cape Newenham, AK
Clarksburg/Fairmont, WV
Elmira/Corning, NY
Trading Bay, AK
Granite Point, AK
Mission/McAllen/Edinburg, TX
Deadhorse, AK
Newburgh/Poughkeepsie, NY
Jacksonville/Camp Lejeune, NC
Augusta/Waterville, ME
Lexington, KY
Eagle Harbor, AK
Lebanon-Hanover, NH
Cape Lisburne, AK
Montrose/Delta, CO
Sitkinak, AK
Kizhuyak, AK
Martha's Vineyard, MA
Nashville, TN
Kuparuk, AK
Presque Isle/Houlton, ME
Kona, HI
Rota, TT
Port Bailey, AK
Minchumina, AK
Charlotte Amalie, VI
Kalskag, AK
Greensboro/High Point, NC
Fitiuta Village, TT
Chignik Bay, AK
Sandpoint, AK
Ashland, WV
Bloomington/Normal, IL

In [None]:
len(pop_data)/len(cities)

In [41]:
'Oceana'.split('/')[0]+', '+ "IL"

'Oceana, IL'

In [53]:
len(pop_data)/len(cities)

0.8273195876288659

In [49]:
get_population('Lexington, KY')

{'results': []}


ValueError: max() arg is an empty sequence

In [191]:
# get_city_opendata('Easton', 'PA')

In [175]:
cities

{'Aberdeen, SD',
 'Abilene, TX',
 'Adak Island, AK',
 'Aguadilla, PR',
 'Akhiok, AK',
 'Akiachak, AK',
 'Akiak, AK',
 'Akron, OH',
 'Akun, AK',
 'Akutan, AK',
 'Alakanuk, AK',
 'Alamosa, CO',
 'Albany, GA',
 'Albany, NY',
 'Albuquerque, NM',
 'Alexandria, LA',
 'Allakaket, AK',
 'Allentown/Bethlehem/Easton, PA',
 'Alliance, NE',
 'Alpena, MI',
 'Alpine, AK',
 'Altoona, PA',
 'Amarillo, TX',
 'Ambler, AK',
 'Anacortes, WA',
 'Anaktuvuk Pass, AK',
 'Anchorage, AK',
 'Angoon, AK',
 'Aniak, AK',
 'Anniston, AL',
 'Anvik, AK',
 'Appleton, WI',
 'Arcata/Eureka, CA',
 'Arctic Village, AK',
 'Arlington, TX',
 'Asheville, NC',
 'Ashland, WV',
 'Aspen, CO',
 'Atka, AK',
 'Atlanta, GA',
 'Atlantic City, NJ',
 'Atmautluak, AK',
 'Atqasuk, AK',
 'Augusta, GA',
 'Augusta/Waterville, ME',
 'Austin, TX',
 'Bainbridge, GA',
 'Bakersfield, CA',
 'Baltimore, MD',
 'Bangor, ME',
 'Bar Harbor, ME',
 'Barrow, AK',
 'Barter Island, AK',
 'Baton Rouge, LA',
 'Bay City, TX',
 'Beaumont/Port Arthur, TX',
 'Beav

In [155]:
#get population
population = pd.read_csv('../data/Airports2.csv').sort_values('Fly_date')
population.head()

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long
730469,SEA,ORD,"Seattle, WA","Chicago, IL",1713,4410,30,1721,1990-01-01,5154164,16395048,47.449001,-122.308998,41.9786,-87.9048
448453,CLE,EWR,"Cleveland, OH","Newark, NJ",1476,4619,31,404,1990-01-01,2103367,16868983,41.411701,-81.8498,40.692501,-74.168701
1458454,CRW,ROA,"Charleston, WV","Roanoke, VA",388,2100,21,114,1990-01-01,307480,269195,38.3731,-81.593201,37.3255,-79.975403
448454,CLE,EWR,"Cleveland, OH","Newark, NJ",1337,3348,31,404,1990-01-01,2103367,16868983,41.411701,-81.8498,40.692501,-74.168701
448455,CLE,EWR,"Cleveland, OH","Newark, NJ",2787,4888,52,404,1990-01-01,2103367,16868983,41.411701,-81.8498,40.692501,-74.168701


In [161]:
f = '../data/us_air_raw_data/770841524_T_T100D_MARKET_ALL_CARRIER-1.csv'   
df=pd.read_csv(f)
df=df[df.MONTH==12]
df.head()

Unnamed: 0,PASSENGERS,DISTANCE,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,YEAR,MONTH,Unnamed: 14
7640,0.0,1167.0,WN,Southwest Airlines Co.,11618,EWR,"Newark, NJ",NJ,13495,MSY,"New Orleans, LA",LA,2018,12,
7641,0.0,470.0,WN,Southwest Airlines Co.,11697,FLL,"Fort Lauderdale, FL",FL,10994,CHS,"Charleston, SC",SC,2018,12,
7642,0.0,562.0,WN,Southwest Airlines Co.,11996,GSP,"Greer, SC",SC,13232,MDW,"Chicago, IL",IL,2018,12,
7643,0.0,2306.0,WN,Southwest Airlines Co.,12264,IAD,"Washington, DC",VA,14747,SEA,"Seattle, WA",WA,2018,12,
7644,0.0,333.0,WN,Southwest Airlines Co.,12278,ICT,"Wichita, KS",KS,11259,DAL,"Dallas, TX",TX,2018,12,


In [166]:
A = set(list(df.ORIGIN_CITY_NAME)+list(df.DEST_CITY_NAME))
B = set(list(population.Origin_city)+list(population.Destination_city))

In [167]:
len([i for i in A if i not in B])/len(A)

0.5773195876288659

In [157]:
edges = pd.read_pickle('../data/ranked_edges.pkl')
nodes = list(set(list(edges[0])+list(edges[1])))
nodes_full_name = list(set(list(data.source.unique())+list(data.target.unique())))
nodes_key_name = [text_to_id(str(i)) for i in nodes_full_name]
nodes_city_name = [i[:-4].split('/') for i in nodes_full_name]
nodes_state_name = [i[-2:] for i in nodes_full_name]

In [71]:
nodes

['gambell_ak',
 'nuiqsut_ak',
 'albany_ga',
 'savoonga_ak',
 'salmon_id',
 'ambler_ak',
 'koyukuk_ak',
 'gulfportbiloxi_ms',
 'miami_fl',
 'west_palm_beachpalm_beach_fl',
 'chicago_il',
 'kenmore_wa',
 'charlotte_nc',
 'kearney_ne',
 'russian_mission_ak',
 'thermal_ca',
 'laredo_tx',
 'port_alsworth_ak',
 'anaktuvuk_pass_ak',
 'indianapolis_in',
 'kingman_az',
 'augusta_ga',
 'trenton_nj',
 'manley_hot_springs_ak',
 'coffman_cove_ak',
 'shemya_ak',
 'detroit_mi',
 'rapid_city_sd',
 'gainesville_fl',
 'rochester_mn',
 'valparaiso_fl',
 'tanana_ak',
 'tyonek_ak',
 'red_dog_ak',
 'bradford_pa',
 'parkersburg_wv',
 'alakanuk_ak',
 'pascokennewickrichland_wa',
 'ogden_ut',
 'plattsburgh_ny',
 'boulder_city_nv',
 'hancockhoughton_mi',
 'alpine_ak',
 'gulkana_ak',
 'reading_pa',
 'kwigillingok_ak',
 'dickinson_nd',
 'pueblo_co',
 'sheldon_point_ak',
 'hollis_ak',
 'sandpoint_ak',
 'lanai_hi',
 'baton_rouge_la',
 'yakima_wa',
 'trading_bay_ak',
 'fresno_ca',
 'sheridan_wy',
 'chignik_ak',
 'ty

In [47]:
new_nodes_city_name = []
for cities in nodes_city_name:
    new_cities = []
    new_cities = new_cities+cities
    for city in cities:
        cs = city.replace('Island','').replace('North','').replace('Bay','')\
        .replace('East','').replace('West','').replace('South','').replace('Port','').replace('Lake','').strip()
        if city!=cs:
            new_cities.append(cs)
        for c in cs.replace('-',' ').split(' '):
            if c!=cs:
                new_cities.append(c)
    new_nodes_city_name.append(new_cities)
nodes_city_name = new_nodes_city_name

In [49]:
nodes_population = {}
not_found = []
for key,cities,state in zip(nodes_key_name,nodes_city_name,nodes_state_name):
    if key in nodes:
        nodes_population[key] = []
        if len(cities)>1:
            for city in cities:
                try:
                    nodes_population[key].append(get_city_opendata(city, state))
                except:
                    not_found.append(key)
                    print(key,city,state)
                    nodes_population[key].append(np.nan)
        else:
            city = cities[0]
            try:
                nodes_population[key].append(get_city_opendata(city, state))
            except:
                not_found.append(key)
                print(key,city,state)
                nodes_population[key].append(np.nan)

utopia_ak Utopia AK
north_kingstown_ri North Kingstown RI
north_kingstown_ri Kingstown RI
cape_lisburne_ak Cape Lisburne AK
cape_lisburne_ak Lisburne AK
fort_drum_ny Fort Drum NY
fort_drum_ny Drum NY
zachar_bay_ak Zachar Bay AK
zachar_bay_ak Zachar AK
granite_point_ak Granite Point AK
granite_point_ak Granite AK
mammoth_lakes_ca Mammoth s CA
adak_island_ak Adak Island AK
twin_hills_ak Twin Hills AK
twin_hills_ak Twin AK
twin_hills_ak Hills AK
manhattanft_riley_ks Ft. Riley KS
manhattanft_riley_ks Ft. KS
akun_ak Akun AK
alpine_ak Alpine AK
hoolehua_hi Hoolehua HI
birch_creek_ak Birch Creek AK
birch_creek_ak Birch AK
tinian_tt Tinian TT
cape_romanzof_ak Cape Romanzof AK
cape_romanzof_ak Romanzof AK
westerly_ri erly RI
kuparuk_ak Kuparuk AK
east_farmingdale_ny East Farmingdale NY
prospect_creek_ak Prospect Creek AK
prospect_creek_ak Prospect AK
fitiuta_village_tt Fitiuta Village TT
fitiuta_village_tt Fitiuta TT
fitiuta_village_tt Village TT
port_bailey_ak Port Bailey AK
port_bailey_ak Bai

In [51]:
population = {}
for n,p in nodes_population.items():
    if len(p)==0:
        num = np.nan
    else:
        num = np.nan
        for i in p:
            if i>0:
                num = i
                break
    population[n]=num 
#     population[n]=np.nanmean(p)

In [52]:
manual_nodes = [i for i in nodes if i not in nodes_key_name]
manual_population = [16075,113383,57941,76639]
for n, p in zip(manual_nodes,manual_population):
    population[n]=p

In [53]:
len(nodes)

775

In [55]:
len([i for i in population.values() if i>0])

713

In [56]:
pd.DataFrame.from_dict(population,orient='index').to_pickle('../data/us_air_population_3.pkl')

In [14]:
edges = pd.read_pickle('../data/ranked_edges.pkl')

In [15]:
nodes = list(set(list(edges[0])+list(edges[1])))

In [236]:
population = pd.read_csv('../data/us_pop_2010.csv',encoding='latin-1').sort_values(by="NAME")
population['STNAME'] = population['STNAME'].apply(state_to_abbr)
population['NAME'] = population.apply(lambda row: text_to_id(str(row.NAME))+'_'+str(row.STNAME), axis=1)

In [314]:
population.head(5)

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,NAME,STNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011
36216,61,27,111,0,100,0,aastad_township_mn,mn,213,213,213,213
67925,50,45,1,0,0,0,abbeville_county_sc,sc,25417,25417,25354,25161
67656,162,45,0,100,0,0,abbeville_city_sc,sc,5237,5237,5224,5183
1,162,1,0,124,0,0,abbeville_city_al,al,2688,2688,2689,2704
27235,157,22,113,100,0,0,abbeville_city_la,la,12257,12258,12277,12316


In [362]:
# list(population.NAME.unique())

In [437]:
city= 'San+Francisco'
state = 'CA'
# cmd = f'https://public.opendatasoft.com/api/records/1.0/search/?dataset=cities-and-towns-of-the-united-states&q={city}&sort=-pop_2010&facet=state&refine.state={state}'
cmd = 'https://public.opendatasoft.com/api/records/1.0/search/?dataset=cities-and-towns-of-the-united-states&q=santa+fe&sort=pop_2010&facet=state&refine.state=NM'
res = requests.get(cmd)
dct = json.loads(res.content)
records = dct['records']
if len(records)>1:
    record = [i['fields']['name'].upper() for i in records if i['fields']['name'].upper()== city.upper()]
    output = record['fields']['pop_2010']
else:
    output = records[0]['fields']['pop_2010']
output

67947.0

In [None]:
{'bloomingtonnormal_il':76610,
 'marthas_vineyard_ma': 15000,
 'lawtonfort_sill_ok': 96867,
 'north_bendcoos_bay_or':9695,
 'midlandodessa_tx':99940,
 'bismarckmandan_nd':61272,
}

In [22]:
out_list = []
for n in nodes:
    name_str = n.split('_')
    city = ' '.join(name_str[:-1])
    state = name_str[-1].upper()
    try:
        out = get_city_opendata(city, state)
    except:
        print(n)
        out = np.nan
    out_list.append(out)

bloomingtonnormal_il
guam_tt
kuparuk_ak
marthas_vineyard_ma
dallasfort_worth_tx
sault_ste_marie_mi
lawtonfort_sill_ok
north_bendcoos_bay_or
kamuela_hi
kiluda_bay_ak
port_williams_ak
cape_newenham_ak
midlandodessa_tx
chignik_bay_ak
bismarckmandan_nd
st_marys_ak
sheldon_point_ak
utopia_ak
alpine_ak
st_michael_ak
ashland_wv
lebanon-hanover_nh
wake_island_tt
cedar_rapidsiowa_city_ia
van_nuys_ca
bendredmond_or
kitoi_bay_ak
birch_creek_ak
princetonbluefield_wv
oceana_va
fitiuta_village_tt
barter_island_ak
sitkinak_ak
kapalua_hi
naukiti_ak
lazy_bay_ak
new_bedfordfall_river_ma
youngstownwarren_oh
block_island_ri
hattiesburglaurel_ms
cape_lisburne_ak
nanwalek_ak
prospect_creek_ak
sarasotabradenton_fl
newport_newswilliamsburg_va
danger_bay_ak
eagle_harbor_ak
beaumontport_arthur_tx
whale_pass_ak
twin_hills_ak
zachar_bay_ak
scrantonwilkes-barre_pa
allentownbethlehemeaston_pa
manhattanft_riley_ks
north_kingstown_ri
port_alexander_ak
tinian_tt
uganik_ak
pago_pago_tt
chicagoromeoville_il
sandpoint_ak

0.8141935483870968

In [19]:
len(nodes)

775

In [449]:
len([i for i in data_out.values() if i>0])

512

In [24]:
data_out = dict(zip(nodes,out_list))
pd.DataFrame.from_dict(data_out,orient='index').to_pickle('../data/us_air_population_2.pkl')

In [366]:
Y,N=0,0
for n in nodes:
    state = n[-2::]
    city = n[::-3]
    df_temp = population.copy()
    df_temp = df_temp[df_temp.STNAME==state]
    match_df = df_temp[df_temp.NAME.str.contains(city)]
    match_city = list(match_df.NAME)
    match_pop = list(match_df.CENSUS2010POP)
    if len(match_city)!=0:
        Y+=1
#         print(n,'******',list(zip(match_city,match_pop)))
    else:
        N +=1
        print(n,'>>>>>>>')

peru_in >>>>>>>
valparaiso_in >>>>>>>
greenville_ms >>>>>>>
port_protection_ak >>>>>>>
hoolehua_hi >>>>>>>
adak_island_ak >>>>>>>
wrangell_ak >>>>>>>
shageluk_ak >>>>>>>
kiana_ak >>>>>>>
marietta_ga >>>>>>>
vero_beach_fl >>>>>>>
concord_nc >>>>>>>
bakersfield_ca >>>>>>>
santa_fe_nm >>>>>>>
chenega_ak >>>>>>>
perryville_ak >>>>>>>
santa_rosa_ca >>>>>>>
akron_oh >>>>>>>
roanoke_va >>>>>>>
hibbing_mn >>>>>>>
columbus_oh >>>>>>>
key_west_fl >>>>>>>
niagara_falls_ny >>>>>>>
rockland_me >>>>>>>
corvallis_or >>>>>>>
helena_mt >>>>>>>
colorado_springs_co >>>>>>>
boca_raton_fl >>>>>>>
milwaukee_wi >>>>>>>
anchorage_ak >>>>>>>
north_platte_ne >>>>>>>
norfolk_va >>>>>>>
los_angeles_ca >>>>>>>
nashville_tn >>>>>>>
green_bay_wi >>>>>>>
alakanuk_ak >>>>>>>
gillette_wy >>>>>>>
togiak_ak >>>>>>>
cape_lisburne_ak >>>>>>>
franklinoil_city_pa >>>>>>>
sanford_fl >>>>>>>
cape_girardeau_mo >>>>>>>
excursion_inlet_ak >>>>>>>
farmington_nm >>>>>>>
sun_valleyhaileyketchum_id >>>>>>>
allentownbethlehemeaston_pa

In [359]:
print(Y,N)

33 742


Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,NAME,STNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011
1265,50,2,20,0,0,0,anchorage_municipality_ak,ak,291826,291826,293356,295570
1112,162,2,0,3000,0,0,anchorage_municipality_ak,ak,291826,291826,293356,295570
1266,157,2,20,3000,0,0,anchorage_municipality_ak,ak,291826,291826,293356,295570
1325,157,2,130,99990,0,0,balance_of_ketchikan_gateway_borough_ak,ak,5016,5016,5035,5059
1124,162,2,0,12680,0,0,chefornak_city_ak,ak,418,418,420,429
1271,157,2,50,12680,0,0,chefornak_city_ak,ak,418,418,420,429
1414,157,2,270,13230,0,0,chevak_city_ak,ak,938,938,943,963
1125,162,2,0,13230,0,0,chevak_city_ak,ak,938,938,943,963
1335,157,2,164,13550,0,0,chignik_city_ak,ak,91,91,91,93
1126,162,2,0,13550,0,0,chignik_city_ak,ak,91,91,91,93


In [288]:
city_pop_dict = population[['NAME','CENSUS2010POP']].set_index('NAME').to_dict()['CENSUS2010POP']
cities  = list(city_pop_dict.keys())

def get_pop_by_city(c):
    if c in cities:
        return int(city_pop_dict[c])
    else:
        return np.nan
    
def city_get_population(c):
    matching_cities = get_close_matches(c,[i for i in cities if i[-2::]==c[-2::]])
#     print([i[0:length] for i in cities if i[-2::]==c[-2::]])
    print(c,matching_cities)
    if len(matching_cities)>0:
        return(get_pop_by_city(matching_cities[0]))
    else:
        return np.nan

In [289]:
get_close_matches('denver',cities)

['denver_town_in', 'denver_city_ia', 'denver_city_co']

In [290]:
# nodes = data[['source','ORIGIN']].set_index('source').to_dict()['ORIGIN']
# nodes_temp = data[['target','DEST']].set_index('target').to_dict()['DEST']
# nodes.update(nodes_temp)

In [305]:
out_data = {}
for n in nodes:
    out_data[n] = city_get_population(n)

peru_in ['peru_city_in', 'peru_city_pt_in', 'perry_county_in']
valparaiso_in ['valparaiso_city_in', 'valparaiso_city_pt_in', 'earl_park_town_in']
greenville_ms ['greenville_city_ms', 'eden_village_ms', 'big_creek_village_ms']
port_protection_ak ['north_pole_city_ak', 'port_lions_city_ak', 'point_hope_city_ak']
hoolehua_hi []
adak_island_ak ['adak_city_ak', 'kodiak_island_borough_ak']
wrangell_ak []
shageluk_ak ['shageluk_city_ak', 'alaska_ak']
kiana_ak ['kiana_city_ak', 'kivalina_city_ak']
marietta_ga ['marietta_city_ga', 'martin_town_ga', 'darien_city_ga']
vero_beach_fl ['vero_beach_city_fl', 'riviera_beach_city_fl', 'beverly_beach_town_fl']
concord_nc ['concord_city_nc', 'macon_county_nc', 'conover_city_nc']
bakersfield_ca ['bakersfield_city_ca', 'fairfield_city_ca']
santa_fe_nm ['santa_fe_city_nm', 'santa_fe_county_nm', 'santa_rosa_city_nm']
chenega_ak []
perryville_ak []
santa_rosa_ca ['santa_rosa_city_ca', 'santa_maria_city_ca', 'santa_clara_city_ca']
akron_oh ['akron_city_oh', 'm

ValueError: invalid literal for int() with base 10: 'X'

In [312]:
len(out_data)

600

In [310]:
pd.DataFrame.from_dict(out_data,orient='index').to_pickle('../data/us_air_population_1.pkl')

In [308]:
out_data

{'peru_in': 11417,
 'valparaiso_in': 31730,
 'greenville_ms': 34400,
 'port_protection_ak': 2117,
 'hoolehua_hi': nan,
 'adak_island_ak': 326,
 'wrangell_ak': nan,
 'shageluk_ak': 83,
 'kiana_ak': 361,
 'marietta_ga': 56579,
 'vero_beach_fl': 15220,
 'concord_nc': 79066,
 'bakersfield_ca': 347483,
 'santa_fe_nm': 67947,
 'chenega_ak': nan,
 'perryville_ak': nan,
 'santa_rosa_ca': 167815,
 'akron_oh': 199110,
 'roanoke_va': 97032,
 'hibbing_mn': 16361,
 'columbus_oh': 787033,
 'key_west_fl': 24649,
 'niagara_falls_ny': 50193,
 'rockland_me': 7297,
 'corvallis_or': 54462,
 'helena_mt': 28190,
 'colorado_springs_co': 416427,
 'boca_raton_fl': 84392,
 'milwaukee_wi': 0,
 'anchorage_ak': 291826,
 'north_platte_ne': 24733,
 'norfolk_va': 242803,
 'los_angeles_ca': 3792621,
 'nashville_tn': 4530,
 'green_bay_wi': 2035,
 'alakanuk_ak': 677,
 'gillette_wy': 29087,
 'togiak_ak': 817,
 'cape_lisburne_ak': nan,
 'franklinoil_city_pa': 6545,
 'sanford_fl': 53570,
 'cape_girardeau_mo': 37941,
 'excu

In [259]:
# get_close_matches('denver',cities)

In [260]:
# population[population.STNAME=='nv']

In [311]:
len(out_data)-len([i for i in out_data.values() if type(i)==int])

72

In [153]:
pop = pd.read_csv('../data/Airports2.csv').sort_values('Fly_date')
# pop['year'] = pop.apply(lambda row: int(str(row.Fly_date)[0:4]), axis=1)
# pop['Origin_city']=pop.apply(lambda row: text_to_id(str(row.Origin_city)), axis=1)
# pop['Destination_city']=pop.apply(lambda row: text_to_id(str(row.Destination_city)), axis=1)

In [154]:
pop.head()

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long
730469,SEA,ORD,"Seattle, WA","Chicago, IL",1713,4410,30,1721,1990-01-01,5154164,16395048,47.449001,-122.308998,41.9786,-87.9048
448453,CLE,EWR,"Cleveland, OH","Newark, NJ",1476,4619,31,404,1990-01-01,2103367,16868983,41.411701,-81.8498,40.692501,-74.168701
1458454,CRW,ROA,"Charleston, WV","Roanoke, VA",388,2100,21,114,1990-01-01,307480,269195,38.3731,-81.593201,37.3255,-79.975403
448454,CLE,EWR,"Cleveland, OH","Newark, NJ",1337,3348,31,404,1990-01-01,2103367,16868983,41.411701,-81.8498,40.692501,-74.168701
448455,CLE,EWR,"Cleveland, OH","Newark, NJ",2787,4888,52,404,1990-01-01,2103367,16868983,41.411701,-81.8498,40.692501,-74.168701


In [202]:
city_pop_dict = pop[['Origin_city','Origin_population']].set_index('Origin_city').to_dict()['Origin_population']
city_temp_dict = pop[['Destination_city','Destination_population']].set_index('Destination_city').to_dict()['Destination_population']
city_pop_dict.update(city_temp_dict)

cities  = list(city_pop_dict.keys())

def get_pop_by_city(c):
    if c in cities:
        return city_pop_dict[c]
    else:
        return 0
    
def city_get_population(c):
    matching_cities = get_close_matches(c,[i for i in cities if i[-2::]==c[-2::]])
    print(c,matching_cities)
    if len(matching_cities)>0:
        return(get_pop_by_city(matching_cities[0]))
    else:
        return 0

airport_pop_dict = pop[['Origin_airport','Origin_population']].groupby('Origin_airport').max().to_dict()['Origin_population']
airport_temp_dict = pop[['Destination_airport','Destination_population']].groupby('Destination_airport').max().to_dict()['Destination_population']
airport_pop_dict.update(airport_temp_dict)

def get_population(airport,c):
    if airport in list(airport_pop_dict.keys()):
        return(airport_pop_dict[airport])
    else:
        return city_get_population(c)
    
# def get_population(airport,city):
#     numbers = list(pop[pop.Origin_airport==airport].sort_values(by='Fly_date')['Origin_population'])
#     if len(numbers)>0:
#         return numbers[-1]
#     else:
#         numbers = list(pop[pop.Destination_airport==airport].sort_values(by='Fly_date')['Destination_population'])
#         if len(numbers)>0:
#             return numbers[-1]
#         else:
#             return city_get_population(city)

In [203]:
data.head()

Unnamed: 0,weight,distance,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,source,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,target,DEST_STATE_ABR,YEAR,MONTH,Unnamed: 14
0,1.0,1491.0,DL,Delta Air Lines Inc.,11292,DEN,denver_co,CO,10821,BWI,baltimore_md,MD,2018,11,
1,1.0,236.0,DL,Delta Air Lines Inc.,12889,LAS,las_vegas_nv,NV,12892,LAX,los_angeles_ca,CA,2018,11,
2,1.0,522.0,DL,Delta Air Lines Inc.,14771,SFO,san_francisco_ca,CA,10713,BOI,boise_id,ID,2018,11,
3,1.0,842.0,DL,Delta Air Lines Inc.,15304,TPA,tampa_fl,FL,10821,BWI,baltimore_md,MD,2018,11,
4,1.0,184.0,MQ,Envoy Air,10721,BOS,boston_ma,MA,12953,LGA,new_york_ny,NY,2018,11,


In [204]:
pop.head()

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long
0,MHK,AMW,manhattan_ks,ames_ia,21,30,1,254,2008-10-01,122049,86219,39.140999,-96.670799,,
1,EUG,RDM,eugene_or,bend_or,41,396,22,103,1990-11-01,284093,76034,44.124599,-123.211998,44.254101,-121.150001
2,EUG,RDM,eugene_or,bend_or,88,342,19,103,1990-12-01,284093,76034,44.124599,-123.211998,44.254101,-121.150001
3,EUG,RDM,eugene_or,bend_or,11,72,4,103,1990-10-01,284093,76034,44.124599,-123.211998,44.254101,-121.150001
4,MFR,RDM,medford_or,bend_or,0,18,1,156,1990-02-01,147300,76034,42.374199,-122.873001,44.254101,-121.150001


In [210]:
sorted(set(list(pop.Origin_city)))

['aberdeen_sd',
 'abilene_tx',
 'akron_oh',
 'alamogordo_nm',
 'albany_ga',
 'albany_ny',
 'albany_or',
 'albuquerque_nm',
 'alexandria_la',
 'alexandria_mn',
 'allentown_pa',
 'alpena_mi',
 'altoona_pa',
 'altus_ok',
 'amarillo_tx',
 'americus_ga',
 'ames_ia',
 'anchorage_ak',
 'anderson_in',
 'anderson_sc',
 'anniston_al',
 'appleton_wi',
 'ardmore_ok',
 'asheville_nc',
 'astoria_or',
 'athens_ga',
 'athens_oh',
 'athens_tn',
 'atlanta_ga',
 'atlantic_city_nj',
 'auburn_al',
 'auburn_in',
 'augusta_ga',
 'augusta_me',
 'austin_tx',
 'bakersfield_ca',
 'baltimore_md',
 'bangor_me',
 'batesville_ar',
 'baton_rouge_la',
 'battle_creek_mi',
 'bay_city_tx',
 'beaumont_tx',
 'beckley_wv',
 'bedford_in',
 'bellefontaine_oh',
 'bellingham_wa',
 'bemidji_mn',
 'bend_or',
 'bennington_vt',
 'big_rapids_mi',
 'big_spring_tx',
 'billings_mt',
 'binghamton_ny',
 'birmingham_al',
 'bishop_ca',
 'bismarck_nd',
 'bloomington_il',
 'bloomington_in',
 'blytheville_ar',
 'borger_tx',
 'boston_ma',
 'bo

In [None]:
data[data.Origin_population==0]

In [None]:
data[data.Destination_population==0]

In [122]:
data.head()

Unnamed: 0,weight,distance,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,source,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,target,DEST_STATE_ABR,YEAR,MONTH,Unnamed: 14,Origin_population,Destination_population
0,1.0,1491.0,DL,Delta Air Lines Inc.,11292,DEN,denver_co,CO,10821,BWI,baltimore_md,MD,2018,11,,0,2690886
1,1.0,236.0,DL,Delta Air Lines Inc.,12889,LAS,las_vegas_nv,NV,12892,LAX,los_angeles_ca,CA,2018,11,,1902834,25749594
2,1.0,522.0,DL,Delta Air Lines Inc.,14771,SFO,san_francisco_ca,CA,10713,BOI,boise_id,ID,2018,11,,8635706,0
3,1.0,842.0,DL,Delta Air Lines Inc.,15304,TPA,tampa_fl,FL,10821,BWI,baltimore_md,MD,2018,11,,2747272,2690886
4,1.0,184.0,MQ,Envoy Air,10721,BOS,boston_ma,MA,12953,LGA,new_york_ny,NY,2018,11,,9177360,38139592


In [5]:
data = data[['source','target','weight','distance']]

In [123]:
data.head()

Unnamed: 0,weight,distance,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,source,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,target,DEST_STATE_ABR,YEAR,MONTH,Unnamed: 14,Origin_population,Destination_population
0,1.0,1491.0,DL,Delta Air Lines Inc.,11292,DEN,denver_co,CO,10821,BWI,baltimore_md,MD,2018,11,,0,2690886
1,1.0,236.0,DL,Delta Air Lines Inc.,12889,LAS,las_vegas_nv,NV,12892,LAX,los_angeles_ca,CA,2018,11,,1902834,25749594
2,1.0,522.0,DL,Delta Air Lines Inc.,14771,SFO,san_francisco_ca,CA,10713,BOI,boise_id,ID,2018,11,,8635706,0
3,1.0,842.0,DL,Delta Air Lines Inc.,15304,TPA,tampa_fl,FL,10821,BWI,baltimore_md,MD,2018,11,,2747272,2690886
4,1.0,184.0,MQ,Envoy Air,10721,BOS,boston_ma,MA,12953,LGA,new_york_ny,NY,2018,11,,9177360,38139592


In [72]:
df = pd.read_csv(f)
df.head()

Unnamed: 0,PASSENGERS,DISTANCE,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,YEAR,MONTH,Unnamed: 14
0,0.0,1218.0,WN,Southwest Airlines Co.,13495,MSY,"New Orleans, LA",LA,12391,ISP,"Islip, NY",NY,2018,11,
1,0.0,1428.0,WN,Southwest Airlines Co.,13495,MSY,"New Orleans, LA",LA,14869,SLC,"Salt Lake City, UT",UT,2018,11,
2,0.0,2694.0,WN,Southwest Airlines Co.,14057,PDX,"Portland, OR",OR,11697,FLL,"Fort Lauderdale, FL",FL,2018,11,
3,0.0,2400.0,WN,Southwest Airlines Co.,14492,RDU,"Raleigh/Durham, NC",NC,14771,SFO,"San Francisco, CA",CA,2018,11,
4,0.0,1430.0,WN,Southwest Airlines Co.,14747,SEA,"Seattle, WA",WA,12278,ICT,"Wichita, KS",KS,2018,11,


In [95]:
df[df.Destination_population==np.nan]

Unnamed: 0,weight,distance,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,source,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,target,DEST_STATE_ABR,YEAR,MONTH,Unnamed: 14,Origin_population,Destination_population


In [7]:
pop = data.groupby(['source','target']).sum()['weight']

In [8]:
pop.to_pickle('../data/us_air_mirgration.pkl')

In [78]:
pop = pd.read_csv('../data/Airports2.csv')
pop.head()

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long
0,MHK,AMW,"Manhattan, KS","Ames, IA",21,30,1,254,2008-10-01,122049,86219,39.140999,-96.670799,,
1,EUG,RDM,"Eugene, OR","Bend, OR",41,396,22,103,1990-11-01,284093,76034,44.124599,-123.211998,44.254101,-121.150001
2,EUG,RDM,"Eugene, OR","Bend, OR",88,342,19,103,1990-12-01,284093,76034,44.124599,-123.211998,44.254101,-121.150001
3,EUG,RDM,"Eugene, OR","Bend, OR",11,72,4,103,1990-10-01,284093,76034,44.124599,-123.211998,44.254101,-121.150001
4,MFR,RDM,"Medford, OR","Bend, OR",0,18,1,156,1990-02-01,147300,76034,42.374199,-122.873001,44.254101,-121.150001


In [34]:
pop = pop[['Origin_city','Destination_city','Origin_population','Destination_population']]
pop['Origin_city']=pop.apply(lambda row: text_to_id(str(row.Origin_city)), axis=1)
pop['Destination_city']=pop.apply(lambda row: text_to_id(str(row.Destination_city)), axis=1)

In [35]:
pop.head()

Unnamed: 0,Origin_city,Destination_city,Origin_population,Destination_population
0,manhattan_ks,ames_ia,122049,86219
1,eugene_or,bend_or,284093,76034
2,eugene_or,bend_or,284093,76034
3,eugene_or,bend_or,284093,76034
4,medford_or,bend_or,147300,76034


In [44]:
pop_dict.update(dict2)

In [58]:
a[-2::]

'il'

In [59]:
cities  = list(pop_dict.keys())
matching_cities = {}
for e in ranked_edges:
    a,b = e
    matching_cities[a]=get_close_matches(a,[i for i in cities if i[-2::]==a[-2::]])[0]
    matching_cities[b]=get_close_matches(b,[i for i in cities if i[-2::]==b[-2::]])[0]


#     print(get_pop(a),get_pop(b))

gulfportbiloxi_ms ['gulfport_ms']
springfield_mo ['springfield_mo']
sanford_fl ['orlando_fl']
flint_mi ['flint_mi', 'lansing_mi']
fort_lauderdale_fl ['fort_lauderdale_fl']
asheville_nc ['asheville_nc', 'statesville_nc', 'jacksonville_nc']
austin_tx ['austin_tx', 'houston_tx', 'abilene_tx']
waco_tx ['waco_tx', 'laredo_tx']
kona_hi []
van_nuys_ca ['san_jose_ca', 'santa_ana_ca']
sanford_fl ['orlando_fl']
clarksburgfairmont_wv ['clarksburg_wv', 'parkersburg_wv']
savoonga_ak []
unalakleet_ak []
shreveport_la ['shreveport_la']
midlandodessa_tx ['midland_tx', 'andrews_tx']
minneapolis_mn ['minneapolis_mn']
hopkinsville_ky ['danville_ky', 'madisonville_ky']
burbank_ca ['eureka_ca']
pontiac_mi []
galena_ak []
unalakleet_ak []
ontario_ca ['oxnard_ca', 'santa_rosa_ca', 'el_centro_ca']
billings_mt ['billings_mt']
fort_lauderdale_fl ['fort_lauderdale_fl']
plattsburgh_ny ['plattsburgh_ny']
bullhead_city_az []
eugene_or ['eugene_or', 'bend_or', 'roseburg_or']
los_angeles_ca ['los_angeles_ca']
provo_u

In [9]:
data.to_pickle('/home/weihua/Research/Link_Dynamics/data/us_air_distance.pkl')

In [10]:
def removed_end(text):
    return '_'.join(text.split('_')[0:-2])

In [17]:
ranked_edges = pd.read_pickle('../data/ranked_edges.pkl').values.tolist()
ranked_edges = [tuple(i) for i in ranked_edges]

In [18]:
ranked_edges[0:2]

[('gulfportbiloxi_ms', 'springfield_mo'), ('sanford_fl', 'flint_mi')]

In [225]:
population  = pd.read_csv('../data/usa_population.csv',encoding='latin-1').sort_values(by='NAME')
population['NAME'] = population['NAME'].apply(text_to_id)
population['STNAME'] = population['STNAME'].apply(state_to_abbr)
population

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,...,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019
36175,61,27,111,0,100,0,1,A,aastad_township,mn,...,213,213,212,212,212,211,212,212,213,213
67558,50,45,1,0,0,0,0,A,abbeville_county,sc,...,25328,25081,25019,24899,24795,24796,24657,24567,24587,24527
67287,162,45,0,100,0,0,0,A,abbeville_city,sc,...,5233,5178,5162,5131,5109,5089,5051,5035,5032,5014
1,162,1,0,124,0,0,0,A,abbeville_city,al,...,2699,2694,2643,2628,2608,2600,2584,2575,2571,2560
27279,157,22,113,100,0,0,1,A,abbeville_city,la,...,12225,12181,12326,12379,12356,12357,12357,12246,12177,12038
7425,157,13,315,184,0,0,1,A,abbeville_city,ga,...,2982,2951,2932,2933,2862,2929,2767,2783,2800,2684
5998,162,13,0,184,0,0,0,A,abbeville_city,ga,...,2982,2951,2932,2933,2862,2929,2767,2783,2800,2684
26595,162,22,0,100,0,0,0,A,abbeville_city,la,...,12225,12181,12326,12379,12356,12357,12357,12246,12177,12038
759,157,1,67,124,0,0,1,A,abbeville_city,al,...,2699,2694,2643,2628,2608,2600,2584,2575,2571,2560
67559,157,45,1,100,0,0,1,A,abbeville_city,sc,...,5233,5178,5162,5131,5109,5089,5051,5035,5032,5014


In [222]:
population = population[population.SUMLEV>=170]

In [219]:
population.head(20)

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,...,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019
4653,170,9,0,0,0,47500,0,A,milford_city,ct,...,52797,52874,53054,53159,53487,53648,54065,54355,54697,54747
6536,170,13,0,0,0,3436,0,A,athens-clarke_county_unified_government,ga,...,117403,118229,119891,120888,120386,123561,124905,126815,127268,128331
6537,170,13,0,0,0,4200,0,A,augusta-richmond_county_consolidated_government,ga,...,201115,200568,201742,201335,201287,201451,201949,201673,201667,202518
15885,170,18,0,0,0,36000,0,A,indianapolis_city,in,...,830841,836991,844910,854203,859730,863319,868704,873179,880739,886220
21896,170,20,0,0,0,28410,0,A,greeley_county_unified_government,ks,...,1190,1187,1199,1213,1231,1232,1208,1160,1163,1166
25833,170,21,0,0,0,48003,0,A,louisvillejefferson_county_metro_government,ky,...,742054,746361,751629,759183,761725,764946,767464,769828,768101,766757
41720,170,30,0,0,0,11390,0,A,butte-silver_bow,mt,...,34230,34405,34474,34574,34752,34691,34696,34811,34739,34915
70602,170,47,0,0,0,52004,0,A,nashville-davidson_metropolitan_government,tn,...,627746,635918,649344,660179,669611,679793,685829,687159,690516,694144


In [223]:
list(population.NAME)

['milford_city',
 'athens-clarke_county_unified_government',
 'augusta-richmond_county_consolidated_government',
 'indianapolis_city',
 'greeley_county_unified_government',
 'louisvillejefferson_county_metro_government',
 'butte-silver_bow',
 'nashville-davidson_metropolitan_government']

In [74]:
city = data.source[10]
cities = list(population['NAME'])

In [51]:
get_close_matches(city[0:-3],cities)

NameError: name 'city' is not defined

In [89]:
population[population.NAME=='south_bend_city'].T

Unnamed: 0,15789,18434,76204,76469
SUMLEV,162,157,162,157
STATE,18,18,53,53
COUNTY,0,141,0,49
PLACE,71000,71000,65625,65625
COUSUB,0,0,0,0
CONCIT,0,0,0,0
PRIMGEO_FLAG,0,0,0,1
FUNCSTAT,A,A,A,A
NAME,south_bend_city,south_bend_city,south_bend_city,south_bend_city
STNAME,Indiana,Indiana,Washington,Washington


In [88]:
cities = list(population['NAME'])
out_data = {}
for key in set(list(data.source)+list(data.target)):
    city = key[0:-3]
    match = get_close_matches(city,cities)
    if len(match)>0:
        out_data[key]= (match[0],list(population[population.NAME==match[0]].POPESTIMATE2019)[0])
        print(key,match[0],list(population[population.NAME==match[0]].STNAME)[0],end='\n')
    else:
        out_data[key] = None

yakutat_ak akutan_city Alaska
fairfield_ia fairfield_town Connecticut
tupelo_ms tupelo_town Arkansas
las_cruces_nm las_cruces_city New Mexico
augustawaterville_me augusta_village Illinois
augusta_ga augusta_town Missouri


KeyboardInterrupt: 

In [83]:
out_data

{'yakutat_ak': ('akutan_city', 'POPESTIMATE2019'),
 'fairfield_ia': ('fairfield_town', 'POPESTIMATE2019'),
 'tupelo_ms': ('tupelo_town', 'POPESTIMATE2019'),
 'las_cruces_nm': ('las_cruces_city', 'POPESTIMATE2019'),
 'augustawaterville_me': ('augusta_village', 'POPESTIMATE2019'),
 'augusta_ga': ('augusta_town', 'POPESTIMATE2019'),
 'fort_wayne_in': ('fort_wayne_city', 'POPESTIMATE2019'),
 'nikolai_ak': ('nikolai_city', 'POPESTIMATE2019'),
 'lynchburg_va': ('lynchburg_town', 'POPESTIMATE2019'),
 'burlington_ia': ('burlington_town', 'POPESTIMATE2019'),
 'mcgrath_ak': ('mcgrath_city', 'POPESTIMATE2019'),
 'lopez_island_wa': ('louds_island_ut', 'POPESTIMATE2019'),
 'may_creek_ak': ('sandy_creek_town', 'POPESTIMATE2019'),
 'sun_valleyhaileyketchum_id': None,
 'santa_ana_ca': ('santa_ana_city', 'POPESTIMATE2019'),
 'savannah_ga': ('savannah_town', 'POPESTIMATE2019'),
 'tenakee_ak': None,
 'farewell_ak': ('farwell_city', 'POPESTIMATE2019'),
 'fort_meade_md': ('fort_meade_city', 'POPESTIMATE201

In [80]:
pd.DataFrame(out_data).to_pickle('/home/weihua/Research/Link_Dynamics/data/us_air_population.pkl')