In [24]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import folium

In [25]:
df = pd.read_csv("processed/selected_cities_20241117.csv")
full = pd.read_csv("processed/full_calculation_20241117.csv")

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   FID                      326 non-null    int64  
 1   GMI_ADMIN                326 non-null    object 
 2   ADMIN_NAME               326 non-null    object 
 3   FIPS_CNTRY               326 non-null    object 
 4   CNTRY_NAME               326 non-null    object 
 5   STATUS                   326 non-null    object 
 6   POP                      326 non-null    int64  
 7   POP_RANK                 326 non-null    int64  
 8   POP_CLASS                326 non-null    object 
 9   POP_SOURCE               326 non-null    object 
 10  geometry                 326 non-null    object 
 11  CITY_NAME                326 non-null    object 
 12  POP_updated              326 non-null    int64  
 13  POP_updated_year         325 non-null    float64
 14  capital                  3

In [27]:
# Convert the 'geometry' column from WKT to shapely geometries
df['geometry'] = df['geometry'].apply(wkt.loads)

In [28]:
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:3857')
gdf = gdf.to_crs(epsg=4326) # Transform to WGS84 (EPSG:4326)

In [29]:
# Extract longitude and latitude from the transformed geometries
gdf['lon'] = gdf.geometry.x
gdf['lat'] = gdf.geometry.y

In [30]:
gdf['label'] = gdf['CITY_NAME']+", "+gdf['CNTRY_NAME']+": "+gdf['city_type']

In [31]:
gdf[gdf.capital==1]

Unnamed: 0,FID,GMI_ADMIN,ADMIN_NAME,FIPS_CNTRY,CNTRY_NAME,STATUS,POP,POP_RANK,POP_CLASS,POP_SOURCE,...,ISO_CC__sea,SHAPE_Length__sea,SHAPE_Area__sea,distance_sea,POP_rank_updated,city_type,pop_diff,lon,lat,label
0,296,ABW,Aruba,AA,Aruba,National and provincial capital,28295,7,"Less than 50,000",UN_Data_2010_2020,...,,1.283472,0.053652,1.449000,6,coast,,-70.026003,12.525003,"Oranjestad, Aruba: coast"
1,1556,ATG,Antigua & Barbuda,AC,Antigua & Barbuda,National capital,0,0,,,...,,1.896968,0.045993,0.684450,5,coast,,-61.842177,17.124777,"Saint John's, Antigua & Barbuda: coast"
2,2257,ARE-AZA,Abu Zaby,AE,United Arab Emirates,National and provincial capital,1539000,2,"1,000,000 to 4,999,999",UN_Habitat_2020,...,,3.565310,0.047519,2.347458,2,coast,,54.370998,24.476004,"Abu Dhabi, United Arab Emirates: coast"
3,1072,AFG-KAB,Kabol,AF,Afghanistan,National and provincial capital,4136000,2,"1,000,000 to 4,999,999",UN_Habitat_2020,...,,0.702247,0.009304,1192.107975,2,inland,,69.136758,34.530912,"Kabul, Afghanistan: inland"
4,1910,DZA-ALG,Alger,AG,Algeria,National and provincial capital,3608000,2,"1,000,000 to 4,999,999",UN_Habitat_2020,...,,1.359198,0.047538,1.553963,2,coast,,3.050001,36.783297,"Algiers, Algeria: coast"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,17,WSM,Samoa,WS,Samoa,National capital,37391,7,"Less than 50,000",UN_Data_2010_2020,...,,1.146609,0.045098,0.000000,7,coast,,-171.739494,-13.837023,"Apia, Samoa: coast"
321,2458,SWZ-HHH,Hhohho,WZ,Eswatini,National capital,0,0,,,...,,0.553750,0.010466,157.163254,6,inland,,31.191300,-26.303382,"Mbabane, Eswatini: inland"
322,865,YEM-SAN,San'a',YM,Yemen,National and provincial capital,3820000,2,"1,000,000 to 4,999,999",UN_Habitat_2020,...,,1.752070,0.069197,149.616375,7,inland,,44.209503,15.361443,"Sanaa, Yemen: inland"
324,2382,ZMB-LUS,Lusaka,ZA,Zambia,National and provincial capital,1747152,2,"1,000,000 to 4,999,999",UN_Data_2010_2020,...,,0.521008,0.008197,897.500478,5,inland,,28.170000,-15.429996,"Lusaka, Zambia: inland"


In [32]:
city_map = folium.Map(zoom_start=6, min_zoom=2)
for i in range(len(gdf)):
    if gdf.capital.iloc[i]==1: # capital
        folium.Marker(location=[gdf.lat.iloc[i], gdf.lon.iloc[i]], popup=gdf.label.iloc[i], icon=folium.Icon(color="darkred", icon="flag")).add_to(city_map)
    else: # non capital
        folium.Marker(location=[gdf.lat.iloc[i], gdf.lon.iloc[i]], popup=gdf.label.iloc[i], icon=folium.Icon(color="darkgreen", icon="flag")).add_to(city_map)

In [33]:
city_map

In [34]:
# WGS84 (EPSG:4326)
sgdf = gdf[["FID","CITY_NAME","CNTRY_NAME","FIPS_CNTRY","geometry","POP_updated","POP_rank_updated","capital","distance_sea","city_type","pop_diff"]]

In [35]:
sgdf

Unnamed: 0,FID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,geometry,POP_updated,POP_rank_updated,capital,distance_sea,city_type,pop_diff
0,296,Oranjestad,Aruba,AA,POINT (-70.02600 12.52500),81915,6,1,1.449000,coast,
1,1556,Saint John's,Antigua & Barbuda,AC,POINT (-61.84218 17.12478),185565,5,1,0.684450,coast,
2,2257,Abu Dhabi,United Arab Emirates,AE,POINT (54.37100 24.47600),1202756,2,1,2.347458,coast,
3,1072,Kabul,Afghanistan,AF,POINT (69.13676 34.53091),4434600,2,1,1192.107975,inland,
4,1910,Algiers,Algeria,AG,POINT (3.05000 36.78330),2364230,2,1,1.553963,coast,
...,...,...,...,...,...,...,...,...,...,...,...
321,2458,Mbabane,Eswatini,WZ,POINT (31.19130 -26.30338),60691,6,1,157.163254,inland,
322,865,Sanaa,Yemen,YM,POINT (44.20950 15.36144),33908,7,1,149.616375,inland,
323,866,Al Hudaydah,Yemen,YM,POINT (42.94600 14.79500),383786,4,0,0.594437,coast,349878.0
324,2382,Lusaka,Zambia,ZA,POINT (28.17000 -15.43000),207350,5,1,897.500478,inland,


In [36]:
sgdf.to_csv("processed/selected_cities_s_20241117.csv", index=False)

In [37]:
# 229 countries in total
sgdf['FIPS_CNTRY'].value_counts()

FIPS_CNTRY
SL    2
LY    2
PP    2
GB    2
GA    2
     ..
JO    1
KG    1
KR    1
KT    1
ZI    1
Name: count, Length: 229, dtype: int64

In [38]:
# 229 capital cities
sgdf[sgdf.capital==1]

Unnamed: 0,FID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,geometry,POP_updated,POP_rank_updated,capital,distance_sea,city_type,pop_diff
0,296,Oranjestad,Aruba,AA,POINT (-70.02600 12.52500),81915,6,1,1.449000,coast,
1,1556,Saint John's,Antigua & Barbuda,AC,POINT (-61.84218 17.12478),185565,5,1,0.684450,coast,
2,2257,Abu Dhabi,United Arab Emirates,AE,POINT (54.37100 24.47600),1202756,2,1,2.347458,coast,
3,1072,Kabul,Afghanistan,AF,POINT (69.13676 34.53091),4434600,2,1,1192.107975,inland,
4,1910,Algiers,Algeria,AG,POINT (3.05000 36.78330),2364230,2,1,1.553963,coast,
...,...,...,...,...,...,...,...,...,...,...,...
320,17,Apia,Samoa,WS,POINT (-171.73949 -13.83702),16585,7,1,0.000000,coast,
321,2458,Mbabane,Eswatini,WZ,POINT (31.19130 -26.30338),60691,6,1,157.163254,inland,
322,865,Sanaa,Yemen,YM,POINT (44.20950 15.36144),33908,7,1,149.616375,inland,
324,2382,Lusaka,Zambia,ZA,POINT (28.17000 -15.43000),207350,5,1,897.500478,inland,


In [39]:
# number of non capital cities
326-229

97

In [96]:
# countries that only have one city (selected cities)
country_city_counts = sgdf['FIPS_CNTRY'].value_counts()
#sgdf[sgdf['FIPS_CNTRY'].isin(country_city_counts[country_city_counts==1].index)]
fips_132 = sgdf[sgdf['FIPS_CNTRY'].isin(country_city_counts[country_city_counts==1].index)].FIPS_CNTRY

In [97]:
len(fips_132)

132

In [74]:
(229-132)*2+132

326

In [94]:
# countries that only have one city (original dataset) -- including countries without capitals
counts = full['FIPS_CNTRY'].value_counts()
#full[full['FIPS_CNbTRY'].isin(counts[counts==1].index)]
fips_82 = full[full['FIPS_CNTRY'].isin(counts[counts==1].index)].FIPS_CNTRY

In [95]:
len(fips_82)

82

In [76]:
132-82

50

In [77]:
# 82 one-city countries in the original dataset. 50 countries due to selection.

In [110]:
# the 50 countries
fips_132[~fips_132.isin(fips_82)]

2      AE
3      AF
6      AJ
18     AU
20     BA
22     BC
24     BE
31     BL
36     BO
40     BT
41     BU
43     BY
46     CB
47     CD
52     CG
67     CT
93     ET
94     EZ
109    GG
134    HU
149    IZ
155    JO
158    KG
167    KZ
168    LA
175    LO
177    LT
178    LU
186    MG
188    MI
191    ML
199    MU
209    NG
217    NP
226    OD
227    PA
248    RO
249    RP
250    RQ
253    RW
273    ST
281    SZ
285    TI
290    TP
297    TX
300    UG
303    UP
306    UV
309    UZ
321    WZ
324    ZA
325    ZI
Name: FIPS_CNTRY, dtype: object

In [99]:
set(fips_132) - set(fips_82)

{'AE',
 'AF',
 'AJ',
 'AU',
 'BA',
 'BC',
 'BE',
 'BL',
 'BO',
 'BT',
 'BU',
 'BY',
 'CB',
 'CD',
 'CG',
 'CT',
 'ET',
 'EZ',
 'GG',
 'HU',
 'IZ',
 'JO',
 'KG',
 'KZ',
 'LA',
 'LO',
 'LT',
 'LU',
 'MG',
 'MI',
 'ML',
 'MU',
 'NG',
 'NP',
 'OD',
 'PA',
 'RO',
 'RP',
 'RQ',
 'RW',
 'ST',
 'SZ',
 'TI',
 'TP',
 'TX',
 'UG',
 'UP',
 'UV',
 'UZ',
 'WZ',
 'ZA',
 'ZI'}

In [100]:
len(set(fips_132) - set(fips_82))

52

In [103]:
sgdf.FIPS_CNTRY.unique()

array(['AA', 'AC', 'AE', 'AF', 'AG', 'AJ', 'AL', 'AM', 'AN', 'AO', 'AQ',
       'AR', 'AS', 'AU', 'AV', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG',
       'BH', 'BK', 'BL', 'BM', 'BN', 'BO', 'BP', 'BR', 'BT', 'BU', 'BX',
       'BY', 'CA', 'CB', 'CD', 'CE', 'CF', 'CG', 'CH', 'CI', 'CJ', 'CK',
       'CM', 'CN', 'CO', 'CQ', 'CS', 'CT', 'CU', 'CV', 'CW', 'CY', 'DA',
       'DJ', 'DO', 'DR', 'EC', 'EG', 'EI', 'EK', 'EN', 'ER', 'ES', 'ET',
       'EZ', 'FG', 'FI', 'FJ', 'FK', 'FM', 'FO', 'FP', 'FR', 'GA', 'GB',
       'GG', 'GH', 'GI', 'GJ', 'GK', 'GL', 'GM', 'GP', 'GQ', 'GR', 'GT',
       'GV', 'GY', 'HA', 'HO', 'HR', 'HU', 'IC', 'ID', 'IM', 'IN', 'IR',
       'IS', 'IT', 'IV', 'IZ', 'JA', 'JE', 'JM', 'JO', 'KE', 'KG', 'KN',
       'KR', 'KS', 'KT', 'KU', 'KZ', 'LA', 'LE', 'LG', 'LH', 'LI', 'LO',
       'LS', 'LT', 'LU', 'LY', 'MA', 'MB', 'MD', 'MF', 'MG', 'MH', 'MI',
       'MJ', 'MK', 'ML', 'MN', 'MO', 'MP', 'MR', 'MT', 'MU', 'MX', 'MY',
       'MZ', 'NC', 'NE', 'NF', 'NG', 'NH', 'NI', 'N

In [104]:
len(sgdf.FIPS_CNTRY.unique())

229

In [105]:
len(full.FIPS_CNTRY.unique()) 

231

In [106]:
# two countries in the original dataset do not have capital cities
set(full.FIPS_CNTRY.unique()) - set(sgdf.FIPS_CNTRY.unique())

{'GZ', 'WE'}

In [113]:
full[full.FIPS_CNTRY=="GZ"]
#full[full.FIPS_CNTRY=="WE"]

Unnamed: 0,FID,GMI_ADMIN,ADMIN_NAME,FIPS_CNTRY,CNTRY_NAME,STATUS,POP,POP_RANK,POP_CLASS,POP_SOURCE,...,Name1__sea,Name2__sea,Name3__sea,TYPE__sea,ISO_CC__sea,SHAPE_Length__sea,SHAPE_Area__sea,distance_sea,POP_rank_updated,city_type
813,813,ISR-GAZ,Gaza Strip,GZ,Gaza Strip,Other,0,0,,,...,Mediterranean Sea,Atlantic Ocean,,Ocean or Sea,,1.011,0.058621,1.174571,3,coast
