In [1]:
import warnings

import folium
import geopandas as gpd
import pandas as pd
from geopy.exc import GeopyError
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from shapely import wkt

In [2]:
# Load the selected-cities table (snapshot 20241204 per the file name) from the processed/ folder.
code_selection = pd.read_csv("processed/selected_cities_s_20241204.csv")

In [3]:
code_selection

Unnamed: 0,FID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,geometry,POP_updated,POP_rank_updated,capital,distance_sea,city_type,pop_diff,all_coast,all_inland,single_city_cntry,first_batch,UN
0,1556,Saint John's,Antigua & Barbuda,AC,POINT (-6884239.60382339 1935354.668958882),185565.0,5,1,0.684450,coast,,1,0,1,0,1
1,2257,Abu Dhabi,United Arab Emirates,AE,POINT (6052551.857334056 2811519.478346775),1202756.0,2,1,2.347458,coast,,1,0,0,0,1
2,1072,Kabul,Afghanistan,AF,POINT (7696268.728135633 4100315.268073557),4434600.0,2,1,1192.107975,inland,,0,1,0,0,1
3,1910,Algiers,Algeria,AG,POINT (339524.5968838515 4408944.066085605),2364230.0,2,1,1.553963,coast,,0,0,0,1,1
4,1927,Batna,Algeria,AG,POINT (687064.1558494179 4239835.49229873),1119791.0,2,0,171.612108,inland,1244439.0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,2458,Mbabane,Eswatini,WZ,POINT (3472199.669953191 -3036704.873815354),60691.0,6,1,157.163254,inland,,0,1,0,0,1
284,865,Sanaa,Yemen,YM,POINT (4921379.394375705 1730890.67994842),1707531.0,2,1,149.616375,inland,,0,0,0,1,1
285,2214,Aden,Yemen,YM,POINT (5013996.773199519 1433039.157505845),588938.0,3,0,0.129441,coast,1118593.0,0,0,0,1,1
286,2382,Lusaka,Zambia,ZA,POINT (3135870.088754392 -1738805.992643869),2212301.0,2,1,897.500478,inland,,0,1,0,0,1


In [4]:
# Keep only the fields that are carried into the merged city table;
# geometry, ranking and per-country helper flags are discarded here.
code_selection = code_selection.drop(
    labels=[
        "geometry",
        "POP_rank_updated",
        "distance_sea",
        "pop_diff",
        "all_coast",
        "all_inland",
        "single_city_cntry",
        "UN",
    ],
    axis="columns",
)

In [5]:
# Load the manual city selection table (snapshot 20241204 per the file name) from the processed/ folder.
manual_selection = pd.read_csv("processed/manual_selection_PhD_20241204.csv")

In [6]:
manual_selection

Unnamed: 0,CITY_NAME,FIPS_CNTRY,POP_updated,POP_updated_year,POP_SOURCE_updated,city_type,Unnamed: 6
0,Al Ain,AE,496205.0,2015.0,CP,inland,
1,Ganja,AJ,299100.0,2019.0,CP,inland,capital w lake
2,Sevan,AM,18705.0,2022.0,CP,lake,
3,Bregenz,AU,29643.0,2024.0,CP,lake,
4,Bruges,BE,119869.0,2024.0,CP,inland,
5,Neum,BK,4653.0,2013.0,CP,coast,
6,Oruro,BL,264943.0,2012.0,CP,lake,
7,Salihorsk,BO,98590.0,2023.0,CP,lake,
8,Varna,BU,314607.0,2023.0,CP,coast,
9,Rumonge,BY,35931.0,2008.0,CP,lake,we change the capital to Gitega


In [7]:
# The trailing header-less CSV column ("Unnamed: 6") carries free-text remarks; name it "notes".
# Reassign instead of mutating in place so the cell produces a new frame from its input.
manual_selection = manual_selection.rename(columns={"Unnamed: 6": "notes"})

In [8]:
# Every manually selected row starts out flagged as non-capital and outside the first batch.
manual_selection = manual_selection.assign(capital=0, first_batch=0)

In [9]:
# Discard the final row of the file (a notes row rather than a city, per the original comment).
# Gotcha: this cell is not idempotent - each re-run removes one more row.
manual_selection = manual_selection.head(-1)

In [10]:
# The population year and population source columns are not carried into the merged table.
manual_selection = manual_selection.drop(
    ["POP_updated_year", "POP_SOURCE_updated"],
    axis=1,
)

In [11]:
# Stack both selections row-wise. Columns present in only one input (e.g. FID, CNTRY_NAME,
# notes) come out as NaN for the other input's rows. Row labels of both inputs are kept,
# so labels repeat until the index is reset after sorting.
df = pd.concat((code_selection, manual_selection), axis="index")

In [12]:
# Build a FIPS_CNTRY -> CNTRY_NAME lookup from the rows that already have a country name.
cntry_mapping = (
    df.loc[df["CNTRY_NAME"].notnull(), ["FIPS_CNTRY", "CNTRY_NAME"]]
    .drop_duplicates()
)
# If one code were paired with several names, the last pair wins.
fips_to_cntry = cntry_mapping.set_index("FIPS_CNTRY")["CNTRY_NAME"].to_dict()

In [13]:
# Fill missing CNTRY_NAME values from the FIPS_CNTRY -> CNTRY_NAME lookup.
# Vectorised replacement for a row-wise apply: codes that have no entry in
# fips_to_cntry map to NaN, so those rows simply stay missing, and rows that
# already have a name are left untouched.
df["CNTRY_NAME"] = df["CNTRY_NAME"].fillna(df["FIPS_CNTRY"].map(fips_to_cntry))

In [14]:
df

Unnamed: 0,FID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,POP_updated,capital,city_type,first_batch,notes
0,1556.0,Saint John's,Antigua & Barbuda,AC,185565.0,1,coast,0,
1,2257.0,Abu Dhabi,United Arab Emirates,AE,1202756.0,1,coast,0,
2,1072.0,Kabul,Afghanistan,AF,4434600.0,1,inland,0,
3,1910.0,Algiers,Algeria,AG,2364230.0,1,coast,1,
4,1927.0,Batna,Algeria,AG,1119791.0,0,inland,1,
...,...,...,...,...,...,...,...,...,...
51,,Kariba,Zimbabwe,ZI,27450.0,0,lake,0,
52,,Vatican City,,VT,453.0,0,inland,0,
53,,Ramallah,,WE,38998.0,0,inland,0,
54,,Al-Burayj,,WE,43515.0,0,coast,0,this is a refugee camp


In [15]:
# Number of distinct country codes (a missing value, if any, counts as one).
df["FIPS_CNTRY"].nunique(dropna=False)

195

In [26]:
# Number of distinct country names (a missing value, if any, counts as one).
df["CNTRY_NAME"].nunique(dropna=False)

195

In [16]:
# Country names assigned by hand, keyed by FIPS code.
country_name_by_fips = {
    'MV': 'Maldives',
    'WE': 'Palestine',
    'VT': 'Vatican City',
}
for fips_code, country_name in country_name_by_fips.items():
    df.loc[df['FIPS_CNTRY'] == fips_code, 'CNTRY_NAME'] = country_name

# Cities flagged as capitals by hand.
manual_capitals = ['Ramallah', 'Maale', 'Vatican City']
df.loc[df['CITY_NAME'].isin(manual_capitals), 'capital'] = 1

In [17]:
df

Unnamed: 0,FID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,POP_updated,capital,city_type,first_batch,notes
0,1556.0,Saint John's,Antigua & Barbuda,AC,185565.0,1,coast,0,
1,2257.0,Abu Dhabi,United Arab Emirates,AE,1202756.0,1,coast,0,
2,1072.0,Kabul,Afghanistan,AF,4434600.0,1,inland,0,
3,1910.0,Algiers,Algeria,AG,2364230.0,1,coast,1,
4,1927.0,Batna,Algeria,AG,1119791.0,0,inland,1,
...,...,...,...,...,...,...,...,...,...
51,,Kariba,Zimbabwe,ZI,27450.0,0,lake,0,
52,,Vatican City,Vatican City,VT,453.0,1,inland,0,
53,,Ramallah,Palestine,WE,38998.0,1,inland,0,
54,,Al-Burayj,Palestine,WE,43515.0,0,coast,0,this is a refugee camp


In [18]:
# Order rows alphabetically by country name (CNTRY_NAME, not the FIPS code); within a
# country, rows with capital == 1 come first. The index is then renumbered from 0.
df = (
    df
    .sort_values(by=["CNTRY_NAME", "capital"], ascending=[True, False])
    .reset_index(drop=True)
)

In [19]:
df

Unnamed: 0,FID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,POP_updated,capital,city_type,first_batch,notes
0,1072.0,Kabul,Afghanistan,AF,4434600.0,1,inland,0,
1,631.0,Tirana,Albania,AL,418495.0,1,inland,1,
2,605.0,Durres,Albania,AL,113249.0,0,coast,1,
3,1910.0,Algiers,Algeria,AG,2364230.0,1,coast,1,
4,1927.0,Batna,Algeria,AG,1119791.0,0,inland,1,
...,...,...,...,...,...,...,...,...,...
339,2214.0,Aden,Yemen,YM,588938.0,0,coast,1,
340,2382.0,Lusaka,Zambia,ZA,2212301.0,1,inland,0,
341,,Siavonga,Zambia,ZA,26951.0,0,lake,0,
342,2312.0,Harare,Zimbabwe,ZI,1491754.0,1,inland,0,


In [20]:
# Count the rows flagged as capital.
int((df["capital"] == 1).sum())

195

In [21]:
# Sequential 1-based identifier that follows the current row order.
df = df.assign(CITY_ID=range(1, len(df) + 1))

In [22]:
df

Unnamed: 0,FID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,POP_updated,capital,city_type,first_batch,notes,CITY_ID
0,1072.0,Kabul,Afghanistan,AF,4434600.0,1,inland,0,,1
1,631.0,Tirana,Albania,AL,418495.0,1,inland,1,,2
2,605.0,Durres,Albania,AL,113249.0,0,coast,1,,3
3,1910.0,Algiers,Algeria,AG,2364230.0,1,coast,1,,4
4,1927.0,Batna,Algeria,AG,1119791.0,0,inland,1,,5
...,...,...,...,...,...,...,...,...,...,...
339,2214.0,Aden,Yemen,YM,588938.0,0,coast,1,,340
340,2382.0,Lusaka,Zambia,ZA,2212301.0,1,inland,0,,341
341,,Siavonga,Zambia,ZA,26951.0,0,lake,0,,342
342,2312.0,Harare,Zimbabwe,ZI,1491754.0,1,inland,0,,343


In [23]:
# Column layout of the final table: identifier and descriptive fields first, FID and notes last.
columns_order = [
    'CITY_ID',
    'CITY_NAME',
    'CNTRY_NAME',
    'FIPS_CNTRY',
    'POP_updated',
    'capital',
    'city_type',
    'first_batch',
    'FID',
    'notes',
]
df = df.loc[:, columns_order]

In [24]:
df

Unnamed: 0,CITY_ID,CITY_NAME,CNTRY_NAME,FIPS_CNTRY,POP_updated,capital,city_type,first_batch,FID,notes
0,1,Kabul,Afghanistan,AF,4434600.0,1,inland,0,1072.0,
1,2,Tirana,Albania,AL,418495.0,1,inland,1,631.0,
2,3,Durres,Albania,AL,113249.0,0,coast,1,605.0,
3,4,Algiers,Algeria,AG,2364230.0,1,coast,1,1910.0,
4,5,Batna,Algeria,AG,1119791.0,0,inland,1,1927.0,
...,...,...,...,...,...,...,...,...,...,...
339,340,Aden,Yemen,YM,588938.0,0,coast,1,2214.0,
340,341,Lusaka,Zambia,ZA,2212301.0,1,inland,0,2382.0,
341,342,Siavonga,Zambia,ZA,26951.0,0,lake,0,,
342,343,Harare,Zimbabwe,ZI,1491754.0,1,inland,0,2312.0,


In [25]:
# Write the merged city table to CSV; index=False keeps the row index out of the file.
# NOTE(review): this runs before the lon/lat/label columns are added in the cells below,
# so the saved file does not contain coordinates - confirm that is intended.
df.to_csv("processed/cities_20241204.csv", index=False)

In [None]:
# Initialize geolocator
geolocator = Nominatim(user_agent="city_geocoder")

# Route every lookup through geopy's RateLimiter: it spaces calls at least one
# second apart, retries transient service errors and, after the retries are
# exhausted, returns None instead of raising (swallow_exceptions defaults to True).
geocode_throttled = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def get_lat_lon(city_name, country_name):
    """Geocode a city and return its coordinates as ``(longitude, latitude)``.

    Parameters
    ----------
    city_name : str
        City name, used as the first part of the query.
    country_name : str
        Country name, appended to the query after a comma.

    Returns
    -------
    tuple
        ``(longitude, latitude)`` of the first match, or ``(None, None)`` when
        either input is missing, nothing is found, or the lookup fails.
    """
    # A missing name would be formatted into the query as the text "nan";
    # skip the request so the row shows up as un-geocoded instead.
    if pd.isnull(city_name) or pd.isnull(country_name):
        return None, None

    query = f"{city_name}, {country_name}"
    try:
        location = geocode_throttled(query)
    except GeopyError as exc:
        # Stay best-effort (one bad row must not abort the whole run), but
        # report the failure instead of hiding it.
        warnings.warn(f"Geocoding failed for {query!r}: {exc}")
        return None, None

    if location is None:
        return None, None
    return location.longitude, location.latitude

In [None]:
# Geocode every row; get_lat_lon returns (lon, lat), which becomes the two new columns.
coordinates = df.apply(
    lambda city: pd.Series(get_lat_lon(city['CITY_NAME'], city['CNTRY_NAME'])),
    axis=1,
)
df[['lon', 'lat']] = coordinates

In [None]:
df

In [None]:
# Popup text in the form "<city>, <country>: <city type>".
city_and_country = df['CITY_NAME'] + ", " + df['CNTRY_NAME']
df['label'] = city_and_country + ": " + df['city_type']

In [None]:
# Build a map with one flag marker per city: dark red for capitals, dark green otherwise.
city_map = folium.Map(zoom_start=6, min_zoom=2)

# folium rejects markers whose location contains NaN/None, so cities that could
# not be geocoded are left off the map instead of aborting the loop.
located_cities = df.dropna(subset=["lat", "lon"])

for city in located_cities.itertuples(index=False):
    marker_color = "darkred" if city.capital == 1 else "darkgreen"
    folium.Marker(
        location=[city.lat, city.lon],
        popup=city.label,
        icon=folium.Icon(color=marker_color, icon="flag"),
    ).add_to(city_map)

In [None]:
# Show the cities for which geocoding produced no longitude or no latitude.
missing_coords = df['lon'].isnull() | df['lat'].isnull()
df.loc[missing_coords]