## **Section 4:** Add Capitals

In [2]:
# Import a few packages
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
import os
# Raise the pandas display limit to 300 rows so that tables displayed by
# later cells are not truncated as early.
pd.options.display.max_rows = 300

In [3]:
# Print the starting directory, switch to the project root, and print again
# to confirm the change.
# NOTE: this absolute path is machine-specific. The relative paths used to
# read and write files in the rest of the notebook are resolved against it.
print(os.getcwd())
os.chdir('G:/My Drive/Clark')
print(os.getcwd())

G:\My Drive\Clark\GIS Tutorials\Geog-312\Geog-312\1.Geopandas
G:\My Drive\Clark


In [4]:
# # Data source: https://github.com/Stefie/geojson-world
# capitals_file = "GIS Tutorials/GeoPy/activity_files/capitals.geojson"
# capitals_gdf = gpd.read_file(capitals_file, driver="GeoJSON")
# print(capitals_gdf.head())

In [5]:
# Bring in capital city population data
# Source: https://gist.github.com/ofou/df09a6834a8421b4f376c875194915c9#file-country-capital-lat-long-population-csv
# The path is relative to the working directory set with os.chdir above.
# NOTE(review): in the preview below, names containing accented characters
# come back garbled. The manual-edit cells further down compare against those
# garbled spellings, so changing how this file is read or decoded means
# updating those string comparisons too.
table_path = "GIS Tutorials/Geog-312/Geog-312/1.Geopandas/inputData/capitals-lat-long-population.csv"
capTable = gpd.read_file(table_path)
# Row count as a quick sanity check, then a preview of the first 20 rows
print(len(capTable))
capTable.head(20)

234


Unnamed: 0,Country,Capital City,Latitude,Longitude,Population,Capital Type
0,Afghanistan,Kabul,34.5289,69.1725,4011770,Capital
1,Albania,TiranÃ« (Tirana),41.3275,19.8189,475577,Capital
2,Algeria,El DjazaÃ¯r (Algiers),36.7525,3.042,2693542,Capital
3,American Samoa,Pago Pago,-14.2781,-170.7025,48526,Capital
4,Andorra,Andorra la Vella,42.5078,1.5211,22614,Capital
5,Angola,Luanda,-8.8368,13.2343,7774200,Capital
6,Anguilla,The Valley,18.217,-63.0578,1402,Capital
7,Antigua and Barbuda,St. John's,17.1172,-61.8457,20764,Capital
8,Argentina,Buenos Aires,-34.6051,-58.4004,14966530,Capital
9,Armenia,Yerevan,40.182,44.5146,1080324,Capital


In [6]:
# The CSV only has Latitude/Longitude columns, so build the point geometry
# from them (x = longitude, y = latitude) and wrap the table in a GeoDataFrame.
# A CSV does not come with a CRS; the coordinates are latitude/longitude
# degrees, so the points are tagged as WGS 84 (EPSG:4326).
capTable = gpd.GeoDataFrame(
    capTable.assign(
        geometry=gpd.points_from_xy(
            x=capTable['Longitude'],
            y=capTable['Latitude'],
            crs="EPSG:4326",
        )
    ),
    geometry='geometry',
)

# Also turn population into a numeric field; values that cannot be parsed become NaN
capTable['Population'] = pd.to_numeric(capTable['Population'], errors='coerce')

# Display the GeoDataFrame to confirm
capTable.head(40)

Unnamed: 0,Country,Capital City,Latitude,Longitude,Population,Capital Type,geometry
0,Afghanistan,Kabul,34.5289,69.1725,4011770,Capital,POINT (69.1725 34.5289)
1,Albania,TiranÃ« (Tirana),41.3275,19.8189,475577,Capital,POINT (19.8189 41.3275)
2,Algeria,El DjazaÃ¯r (Algiers),36.7525,3.042,2693542,Capital,POINT (3.042 36.7525)
3,American Samoa,Pago Pago,-14.2781,-170.7025,48526,Capital,POINT (-170.7025 -14.2781)
4,Andorra,Andorra la Vella,42.5078,1.5211,22614,Capital,POINT (1.5211 42.5078)
5,Angola,Luanda,-8.8368,13.2343,7774200,Capital,POINT (13.2343 -8.8368)
6,Anguilla,The Valley,18.217,-63.0578,1402,Capital,POINT (-63.0578 18.217)
7,Antigua and Barbuda,St. John's,17.1172,-61.8457,20764,Capital,POINT (-61.8457 17.1172)
8,Argentina,Buenos Aires,-34.6051,-58.4004,14966530,Capital,POINT (-58.4004 -34.6051)
9,Armenia,Yerevan,40.182,44.5146,1080324,Capital,POINT (44.5146 40.182)


In [7]:
# Check that the point locations are accurate by drawing every capital on an
# interactive map, with the marker size driven by population.
import folium
from folium import Popup, CircleMarker
from shapely.geometry import Point

# Marker sizing: one unit of radius per PEOPLE_PER_RADIUS_UNIT inhabitants,
# clamped to [MIN_RADIUS, MAX_RADIUS] so small capitals stay visible and the
# largest ones do not swamp the map. Adjust scaling as needed.
MIN_RADIUS = 5
MAX_RADIUS = 30
PEOPLE_PER_RADIUS_UNIT = 100000

# Center of the map (choose appropriate coordinates for your map)
map_center = [20, 0]  # Adjust as needed
m = folium.Map(location=map_center, zoom_start=2)

# Loop through each row in the DataFrame to add markers
for idx, row in capTable.iterrows():
    geom = row['geometry']
    if not isinstance(geom, Point):  # Only point geometries can be placed as a marker
        continue

    # folium expects (lat, lon); shapely stores x = longitude, y = latitude
    lat, lon = geom.y, geom.x

    # Population was converted with errors='coerce', so it can be NaN.
    # int(NaN) raises ValueError, so fall back to the minimum marker size
    # and a placeholder label instead of aborting the whole map.
    population = row['Population']
    if pd.notna(population):
        radius = max(MIN_RADIUS, min(population / PEOPLE_PER_RADIUS_UNIT, MAX_RADIUS))
        population_text = f"{int(population):,}"
    else:
        radius = MIN_RADIUS
        population_text = "unknown"

    # Create the tooltip text (HTML)
    tooltip_text = f"""
            <b>Country:</b> {row['Country']}<br>
            <b>Capital:</b> {row['Capital City']}<br>
            <b>Population:</b> {population_text}
        """

    # Add a circle marker with a tooltip to the map
    CircleMarker(
        location=(lat, lon),
        radius=radius,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        tooltip=tooltip_text
    ).add_to(m)

# Display the map
m

In [8]:
# Clean up the "Capital City" and "Country" columns
import re  # no longer used by this cell; left in place in case another cell relies on it

# Rename the column
capTable = capTable.rename(columns={"Capital City": "Capital"})

# Where the capital is written as "<name> (<other name>)", keep only the text
# inside the first pair of parentheses (e.g. "... (Tirana)" -> "Tirana").
# Names without a complete "(...)" group, and missing values, are left
# unchanged instead of raising.
capTable['Capital'] = (
    capTable['Capital']
    .str.extract(r'\((.*?)\)', expand=False)
    .fillna(capTable['Capital'])
)

# And the Country column to remove parentheticals (plus the whitespace before them)
capTable['Country'] = capTable['Country'].str.replace(r'\s*\(.*?\)', '', regex=True)

# Display the updated GeoDataFrame
capTable

Unnamed: 0,Country,Capital,Latitude,Longitude,Population,Capital Type,geometry
0,Afghanistan,Kabul,34.5289,69.1725,4011770,Capital,POINT (69.1725 34.5289)
1,Albania,Tirana,41.3275,19.8189,475577,Capital,POINT (19.8189 41.3275)
2,Algeria,Algiers,36.7525,3.042,2693542,Capital,POINT (3.042 36.7525)
3,American Samoa,Pago Pago,-14.2781,-170.7025,48526,Capital,POINT (-170.7025 -14.2781)
4,Andorra,Andorra la Vella,42.5078,1.5211,22614,Capital,POINT (1.5211 42.5078)
5,Angola,Luanda,-8.8368,13.2343,7774200,Capital,POINT (13.2343 -8.8368)
6,Anguilla,The Valley,18.217,-63.0578,1402,Capital,POINT (-63.0578 18.217)
7,Antigua and Barbuda,St. John's,17.1172,-61.8457,20764,Capital,POINT (-61.8457 17.1172)
8,Argentina,Buenos Aires,-34.6051,-58.4004,14966530,Capital,POINT (-58.4004 -34.6051)
9,Armenia,Yerevan,40.182,44.5146,1080324,Capital,POINT (44.5146 40.182)


In [9]:
# Manual edits to 'Country' field:
capTable.loc[capTable['Country'] == 'Brunei Darussalam', 'Country'] = 'Brunei'
#capTable.loc[capTable['Country'] == 'Caribbean Netherlands', 'Country'] = 'Bonaire'
# These two rows are identified by capital and given their own country name.
# St. Helier is the capital of Jersey and St. Peter Port is the capital of
# Guernsey (the two assignments were previously swapped).
capTable.loc[capTable['Capital'] == 'St. Helier', 'Country'] = 'Jersey'
capTable.loc[capTable['Capital'] == 'St. Peter Port', 'Country'] = 'Guernsey'
# Replace long-form or alternative country names with short names, one exact
# match at a time. The names are later used as the join key against the
# COUNTRY column of the countries table.
# NOTE: names with accented characters are matched in the garbled form in
# which they appear in the loaded table, so these literals must stay as-is.
capTable.loc[capTable['Country'] == "CÃ´te d'Ivoire", 'Country'] = 'Ivory Coast'
capTable.loc[capTable['Country'] == "CuraÃ§ao", 'Country'] = 'Curacao'
capTable.loc[capTable['Country'] == "Dem. People's Republic of Korea", 'Country'] = 'North Korea'
capTable.loc[capTable['Country'] == "Faeroe Islands", 'Country'] = 'Faroe Islands'


capTable.loc[capTable['Country'] == "Holy See", 'Country'] = 'Vatican City'
capTable.loc[capTable['Country'] == "Lao People's Democratic Republic", 'Country'] = 'Laos'
capTable.loc[capTable['Country'] == "Republic of Korea", 'Country'] = 'South Korea'
capTable.loc[capTable['Country'] == "Republic of Moldova", 'Country'] = 'Moldova'
capTable.loc[capTable['Country'] == "RÃ©union", 'Country'] = 'Reunion'
capTable.loc[capTable['Country'] == "Russian Federation", 'Country'] = 'Russia'
capTable.loc[capTable['Country'] == "State of Palestine", 'Country'] = 'Palestine'
capTable.loc[capTable['Country'] == "Syrian Arab Republic", 'Country'] = 'Syria'
capTable.loc[capTable['Country'] == "TFYR Macedonia", 'Country'] = 'North Macedonia'
capTable.loc[capTable['Country'] == "United Republic of Tanzania", 'Country'] = 'Tanzania'
capTable.loc[capTable['Country'] == "Viet Nam", 'Country'] = 'Vietnam'
capTable.loc[capTable['Country'] == "Timor-Leste", 'Country'] = 'Timor Leste'
capTable.loc[capTable['Country'] == "Cabo Verde", 'Country'] = 'Cape Verde'
capTable.loc[capTable['Country'] == "Swaziland", 'Country'] = 'Eswatini'
capTable.loc[capTable['Country'] == "Gambia", 'Country'] = 'The Gambia'
capTable.loc[capTable['Country'] == "United States of America", 'Country'] = 'United States'
capTable.loc[capTable['Country'] == "Congo", 'Country'] = 'Republic of the Congo'



# The countries table does not have Sint Maarten separate from the Caribbean
# Netherlands, so drop that row (keeps every row whose Country is not 'Sint Maarten')
capTable = capTable[capTable['Country'] != 'Sint Maarten']

# Disabled: these three rows are kept; a later cell renames them to
# 'Hong Kong', 'Macau' and 'Taiwan' instead of deleting them.
# # Delete Hong Kong and Macau (Not in countries_w_pop)
# capTable = capTable[capTable['Country'] != 'China, Hong Kong SAR']
# capTable = capTable[capTable['Country'] != 'China, Macao SAR']
# capTable = capTable[capTable['Country'] != 'China, Taiwan Province of China']

In [10]:
# Manual edits to 'Capital' field:
# Each line replaces one exact capital spelling with a plain-ASCII or more
# common form. Names with accented characters are matched in the garbled form
# in which they appear in the loaded table, so these literals must stay as-is.
# `Point` is in scope from the import in the map-check cell above; it is
# constructed as Point(x, y), i.e. (longitude, latitude).
capTable.loc[capTable['Capital'] == 'Bruxelles-Brussel', 'Capital'] = 'Brussels'
capTable.loc[capTable['Capital'] == 'BrasÃ­lia', 'Capital'] = 'Brasilia'
capTable.loc[capTable['Capital'] == 'YaoundÃ©', 'Capital'] = 'Yaounde'
capTable.loc[capTable['Capital'] == 'Ottawa-Gatineau', 'Capital'] = 'Ottawa'
capTable.loc[capTable['Capital'] == "N'DjamÃ©na", 'Capital'] = "N'Djamena"
capTable.loc[capTable['Capital'] == "Taibei", 'Capital'] = "Taipei"
capTable.loc[capTable['Capital'] == "BogotÃ¡", 'Capital'] = "Bogota"
capTable.loc[capTable['Capital'] == "San JosÃ©", 'Capital'] = "San Jose"
capTable.loc[capTable['Capital'] == "P'yongyang", 'Capital'] = "Pyongyang"
capTable.loc[capTable['Capital'] == "TÃ³rshavn", 'Capital'] = "Torshavn"
capTable.loc[capTable['Capital'] == "GodthÃ¥b", 'Capital'] = "Nuuk"
capTable.loc[capTable['Capital'] == "HagÃ¥tÃ±a", 'Capital'] = "Hagatna"
capTable.loc[capTable['Capital'] == "ReykjavÃ­k", 'Capital'] = "Reykjavik"
capTable.loc[capTable['Capital'] == "Brades Estate", 'Capital'] = "Brades"
capTable.loc[capTable['Capital'] == "Nay Pyi Taw", 'Capital'] = "Naypyidaw"
capTable.loc[capTable['Capital'] == "NoumÃ©a", 'Capital'] = "Noumea"
capTable.loc[capTable['Capital'] == "AsunciÃ³n", 'Capital'] = "Asuncion"
capTable.loc[capTable['Capital'] == "ChiÅŸinÄƒu", 'Capital'] = "Chisinau"
capTable.loc[capTable['Capital'] == "SÃ£o TomÃ©", 'Capital'] = "Sao Tome"
# Order matters for the next two lines: the capital is renamed to Ramallah
# first, then the row is selected by its new name to move the point.
capTable.loc[capTable['Capital'] == "Al-Quds[East Jerusalem]", 'Capital'] = "Ramallah"
capTable.loc[capTable['Capital'] == "Ramallah", 'geometry'] = Point (35.204283, 31.904980)
capTable.loc[capTable['Capital'] == "LomÃ©", 'Capital'] = "Lome"
capTable.loc[capTable['Capital'] == "Kiev", 'Capital'] = "Kyiv"
capTable.loc[capTable['Capital'] == "Washington, D.C.", 'Capital'] = "Washington, DC"
# Selected by country: this relies on the Country edits cell having already
# renamed "Viet Nam" to "Vietnam".
capTable.loc[capTable['Country'] == "Vietnam", 'Capital'] = "Hanoi"
# Same rename-then-relocate pattern as for Ramallah above
capTable.loc[capTable['Capital'] == "El AaiÃºn", 'Capital'] = "Laayoune"
capTable.loc[capTable['Capital'] == "Laayoune", 'geometry'] = Point (-13.194692, 27.148249)
capTable.loc[capTable['Capital'] == "Sana'a'", 'Capital'] = "Sanaa"

In [11]:
# This cell adds a couple of capital city locations that exist in the
# countries gdf but are missing from capTable.

from shapely.geometry import Point

# One record per capital to add. Point is constructed as Point(x, y), i.e.
# (longitude, latitude). Columns not listed here are left empty in the new rows.
extra_capitals = [
    {
        'Country': 'South Georgia and the South Sandwich Islands',
        'Capital': 'King Edward Point',
        'Population': 0,
        # King Edward Point lies at roughly 54.28 S, 36.51 W, so x = -36.5089
        # and y = -54.2805 (the two values were previously passed in lat/lon order)
        'geometry': Point(-36.5089, -54.2805),
    },
    {
        'Country': 'Kosovo',
        'Capital': 'Pristina',
        'Population': 227154,
        'geometry': Point(21.1655, 42.6629),  # Approximate coordinates for Pristina
    },
]

# Only append countries that are not in the table yet, so re-running this cell
# does not create duplicate rows.
countries_present = set(capTable['Country'])
rows_to_add = [rec for rec in extra_capitals if rec['Country'] not in countries_present]

if rows_to_add:
    # Build the new rows in capTable's CRS and append them in a single concat
    extra_capitals_gdf = gpd.GeoDataFrame(rows_to_add, crs=capTable.crs)
    capTable = pd.concat([capTable, extra_capitals_gdf], ignore_index=True)


In [12]:
# Some modifications are needed to capital city names and locations as well
# Even with something as seemingly fundamental as capital city there can be lots of ambiguity!
# Pattern used throughout: set the capital name by country, then select the row
# by that new capital name to update its point and population.
# Every Point is constructed as Point(x, y), i.e. (longitude, latitude).

# Yamoussoukro (not Abidjan)
capTable.loc[capTable['Country'] == "Ivory Coast", 'Capital'] = "Yamoussoukro"
capTable.loc[capTable['Capital'] == "Yamoussoukro", 'geometry'] = Point (-5.256789, 6.797897)
capTable.loc[capTable['Capital'] == "Yamoussoukro", 'Population'] = 279977

# # Pretoria (not Cape Town)
# capTable.loc[capTable['Country'] == "South Africa", 'Capital'] = "Pretoria"
# capTable.loc[capTable['Capital'] == "Pretoria", 'geometry'] = Point (28.185449, -25.751299)
# capTable.loc[capTable['Capital'] == "Pretoria", 'Population'] = 2818000

# Porto-Novo (not Cotonou)
# The point and population must be written to the Porto-Novo row. Selecting
# "Yamoussoukro" here would overwrite Ivory Coast's values and leave Benin's
# location and population unchanged.
capTable.loc[capTable['Country'] == "Benin", 'Capital'] = "Porto-Novo"
capTable.loc[capTable['Capital'] == "Porto-Novo", 'geometry'] = Point (2.623842, 6.487343)
capTable.loc[capTable['Capital'] == "Porto-Novo", 'Population'] = 264320

# Change capital of Burundi to Gitega, modify location, population
capTable.loc[capTable['Capital'] == 'Bujumbura', 'Capital'] = 'Gitega'
capTable.loc[capTable['Capital'] == 'Gitega', 'geometry'] = Point(29.924305, -3.428702)
capTable.loc[capTable['Capital'] == "Gitega", 'Population'] = 135467

# Update Dodoma, Tanzania population
capTable.loc[capTable['Capital'] == 'Dodoma', 'Population'] = 765179

# Brades, Montserrat (no effect if the 'Capital' edits cell already renamed it)
capTable.loc[capTable['Capital'] == 'Brades Estate', 'Capital'] = 'Brades'

# Naypyidaw, Myanmar (no effect if the 'Capital' edits cell already renamed it)
capTable.loc[capTable['Capital'] == 'Nay Pyi Taw', 'Capital'] = 'Naypyidaw'

# Palestine, set Ramallah as the capital
capTable.loc[capTable['Country'] == 'State of Palestine', 'Country'] = 'Palestine'
capTable.loc[capTable['Country'] == 'Palestine', 'Capital'] = 'Ramallah'
capTable.loc[capTable['Capital'] == 'Ramallah', 'geometry'] = Point (35.2075, 31.8995)
capTable.loc[capTable['Capital'] == 'Ramallah', 'Population'] = 43880

# capTable has a country called Nauru, but no capital city
# The coords are for the middle of the island, not the actual city location, let's correct that
capTable.loc[capTable['Country'] == 'Nauru', 'Capital'] = 'Yaren'
capTable.loc[capTable['Capital'] == 'Yaren', 'geometry'] = Point (166.917207, -0.547485)
capTable.loc[capTable['Capital'] == 'Yaren', 'Population'] = 803

# Palau
capTable.loc[capTable['Country'] == 'Palau', 'Capital'] = 'Ngerulmud'
capTable.loc[capTable['Capital'] == 'Ngerulmud', 'geometry'] = Point (134.624097, 7.500407)
capTable.loc[capTable['Capital'] == 'Ngerulmud', 'Population'] = 0

# Grenada minor typo
# It seems to be written everywhere with "St.", not "Saint"
capTable.loc[capTable['Capital'] == "St.George's", 'Capital'] = "St. George's"

# Shorten the remaining long-form country names
# Hong Kong
capTable.loc[capTable['Country'] == 'China, Hong Kong SAR', 'Country'] = 'Hong Kong'
# Macau
capTable.loc[capTable['Country'] == 'China, Macao SAR', 'Country'] = 'Macau'
# Taiwan
capTable.loc[capTable['Country'] == 'China, Taiwan Province of China', 'Country'] = 'Taiwan'
# Saint Helena, Ascension and Tristan da Cunha
capTable.loc[capTable['Country'] == 'Saint Helena', 'Country'] = 'Saint Helena, Ascension and Tristan da Cunha'
# US Virgin Islands
capTable.loc[capTable['Country'] == 'United States Virgin Islands', 'Country'] = 'US Virgin Islands'

# # Falkland Islands
# capTable.loc[capTable['Country'] == 'Falkland Islands (Malvinas)', 'Country'] = 'Falkland Islands'
# # Iran (Islamic Republic of)
# capTable.loc[capTable['Country'] == 'Iran (Islamic Republic of)', 'Country'] = 'Iran'
# # Christmas Island
# capTable.loc[capTable['Capital'] == 'The Settlement', 'Capital'] = 'Flying Fish Cove'



In [13]:
# Clean up fields for merge: give the population and point columns
# capital-specific names (the countries table has its own 'Population' and
# 'geometry' columns) and drop the column that is not carried forward.
capTable = (
    capTable
    .rename(columns={'Population': 'capPop', 'geometry': 'capLoc'})
    .drop(columns=['Capital Type'])
)

In [14]:
# Row count after the manual edits, followed by the table itself
print(capTable.shape[0])
capTable

235


Unnamed: 0,Country,Capital,Latitude,Longitude,capPop,capLoc
0,Afghanistan,Kabul,34.5289,69.1725,4011770,POINT (69.1725 34.5289)
1,Albania,Tirana,41.3275,19.8189,475577,POINT (19.8189 41.3275)
2,Algeria,Algiers,36.7525,3.042,2693542,POINT (3.042 36.7525)
3,American Samoa,Pago Pago,-14.2781,-170.7025,48526,POINT (-170.7025 -14.2781)
4,Andorra,Andorra la Vella,42.5078,1.5211,22614,POINT (1.5211 42.5078)
5,Angola,Luanda,-8.8368,13.2343,7774200,POINT (13.2343 -8.8368)
6,Anguilla,The Valley,18.217,-63.0578,1402,POINT (-63.0578 18.217)
7,Antigua and Barbuda,St. John's,17.1172,-61.8457,20764,POINT (-61.8457 17.1172)
8,Argentina,Buenos Aires,-34.6051,-58.4004,14966530,POINT (-58.4004 -34.6051)
9,Armenia,Yerevan,40.182,44.5146,1080324,POINT (44.5146 40.182)


In [15]:
# Alphabetical view by capital name, to eyeball the cleaned-up spellings
capTable_sorted = capTable.sort_values('Capital')
capTable_sorted.head(20)

Unnamed: 0,Country,Capital,Latitude,Longitude,capPop,capLoc
218,United Arab Emirates,Abu Dhabi,24.4648,54.3618,1419699,POINT (54.3618 24.4648)
152,Nigeria,Abuja,9.0574,7.4898,2918518,POINT (7.4898 9.0574)
83,Ghana,Accra,5.556,-0.1969,2439389,POINT (-0.1969 5.556)
71,Ethiopia,Addis Ababa,9.025,38.7469,4399674,POINT (38.7469 9.025)
2,Algeria,Algiers,36.7525,3.042,2693542,POINT (3.042 36.7525)
153,Niue,Alofi,-19.0585,-169.9213,727,POINT (-169.9213 -19.0585)
109,Jordan,Amman,31.9552,35.945,2064582,POINT (35.945 31.9552)
147,Netherlands,Amsterdam,52.374,4.8897,1131690,POINT (4.8897 52.374)
4,Andorra,Andorra la Vella,42.5078,1.5211,22614,POINT (1.5211 42.5078)
212,Turkey,Ankara,39.9199,32.8543,4919074,POINT (32.8543 39.9199)


### Read back in countries gdf from parquet file

In [16]:
# Load the Parquet file written at checkpoint 3 (the countries GeoDataFrame).
# The path is relative to the working directory set with os.chdir near the top.
countries_gdf = gpd.read_parquet("GIS Tutorials/Geog-312/geopandas_Files/checkpoint3/reproj.parquet")
# Preview the first rows
countries_gdf.head()

Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centroid,centroidOut
0,Afghanistan,AF,Afghanistan,34262840,Islamic Emirate of Afghanistan,"POLYGON ((74.88986 37.23409, 74.88962 37.23314...",POINT (65.98786 33.75477),0.0
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",POINT (33.36206 34.83019),1.0
2,Albania,AL,Albania,2402113,Republic of Albania,"MULTIPOLYGON (((20.0789 42.5558, 20.07939 42.5...",POINT (20.06737 41.12698),0.0
3,Algeria,DZ,Algeria,46700000,People's Democratic Republic of Algeria,"MULTIPOLYGON (((8.64188 36.94206, 8.64196 36.9...",POINT (2.68012 27.89881),0.0
4,American Samoa,AS,United States,49710,,"MULTIPOLYGON (((-168.17253 -14.55294, -168.173...",POINT (-170.3959 -14.21789),1.0


In [17]:
# Display the countries table to review the COUNTRY names used as the merge key below
countries_gdf

Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centroid,centroidOut
0,Afghanistan,AF,Afghanistan,34262840,Islamic Emirate of Afghanistan,"POLYGON ((74.88986 37.23409, 74.88962 37.23314...",POINT (65.98786 33.75477),0.0
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",POINT (33.36206 34.83019),1.0
2,Albania,AL,Albania,2402113,Republic of Albania,"MULTIPOLYGON (((20.0789 42.5558, 20.07939 42.5...",POINT (20.06737 41.12698),0.0
3,Algeria,DZ,Algeria,46700000,People's Democratic Republic of Algeria,"MULTIPOLYGON (((8.64188 36.94206, 8.64196 36.9...",POINT (2.68012 27.89881),0.0
4,American Samoa,AS,United States,49710,,"MULTIPOLYGON (((-168.17253 -14.55294, -168.173...",POINT (-170.3959 -14.21789),1.0
5,Andorra,AD,Andorra,86398,Principality of Andorra,"POLYGON ((1.7258 42.5044, 1.71149 42.49224, 1....",POINT (1.57627 42.5454),0.0
6,Angola,AO,Angola,35121734,Republic of Angola,"MULTIPOLYGON (((23.99928 -10.89013, 23.99943 -...",POINT (17.54511 -12.24251),0.0
7,Anguilla,AI,United Kingdom,15780,,"MULTIPOLYGON (((-62.93497 18.30292, -62.93414 ...",POINT (-63.0639 18.2312),0.0
8,Antarctica,AQ,,0,,"MULTIPOLYGON (((-57.28856 -63.22003, -57.25061...",POINT (27.94983 -75.41679),0.0
9,Antigua and Barbuda,AG,Antigua and Barbuda,103603,Antigua and Barbuda,"MULTIPOLYGON (((-61.66928 17.07375, -61.66839 ...",POINT (-61.79912 17.27757),1.0


In [18]:
# Left-join the capital attributes onto the countries: 'COUNTRY' in
# countries_gdf is matched against 'Country' in capTable. Only the columns
# needed downstream are taken from capTable. Countries without a matching
# capital row keep empty values in the capital columns.
countries_w_caps = countries_gdf.merge(
    capTable[['Country', 'Capital', 'capPop', 'capLoc']],
    how='left',
    left_on='COUNTRY',
    right_on='Country',
)

# Show the result: row count, then the merged table ordered by country name
print(len(countries_w_caps))
countries_w_caps.sort_values(by='COUNTRY')

248


Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centroid,centroidOut,Country,Capital,capPop,capLoc
0,Afghanistan,AF,Afghanistan,34262840,Islamic Emirate of Afghanistan,"POLYGON ((74.88986 37.23409, 74.88962 37.23314...",POINT (65.98786 33.75477),0.0,Afghanistan,Kabul,4011770.0,POINT (69.1725 34.5289)
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",POINT (33.36206 34.83019),1.0,,,,
2,Albania,AL,Albania,2402113,Republic of Albania,"MULTIPOLYGON (((20.0789 42.5558, 20.07939 42.5...",POINT (20.06737 41.12698),0.0,Albania,Tirana,475577.0,POINT (19.8189 41.3275)
3,Algeria,DZ,Algeria,46700000,People's Democratic Republic of Algeria,"MULTIPOLYGON (((8.64188 36.94206, 8.64196 36.9...",POINT (2.68012 27.89881),0.0,Algeria,Algiers,2693542.0,POINT (3.042 36.7525)
4,American Samoa,AS,United States,49710,,"MULTIPOLYGON (((-168.17253 -14.55294, -168.173...",POINT (-170.3959 -14.21789),1.0,American Samoa,Pago Pago,48526.0,POINT (-170.7025 -14.2781)
5,Andorra,AD,Andorra,86398,Principality of Andorra,"POLYGON ((1.7258 42.5044, 1.71149 42.49224, 1....",POINT (1.57627 42.5454),0.0,Andorra,Andorra la Vella,22614.0,POINT (1.5211 42.5078)
6,Angola,AO,Angola,35121734,Republic of Angola,"MULTIPOLYGON (((23.99928 -10.89013, 23.99943 -...",POINT (17.54511 -12.24251),0.0,Angola,Luanda,7774200.0,POINT (13.2343 -8.8368)
7,Anguilla,AI,United Kingdom,15780,,"MULTIPOLYGON (((-62.93497 18.30292, -62.93414 ...",POINT (-63.0639 18.2312),0.0,Anguilla,The Valley,1402.0,POINT (-63.0578 18.217)
8,Antarctica,AQ,,0,,"MULTIPOLYGON (((-57.28856 -63.22003, -57.25061...",POINT (27.94983 -75.41679),0.0,,,,
9,Antigua and Barbuda,AG,Antigua and Barbuda,103603,Antigua and Barbuda,"MULTIPOLYGON (((-61.66928 17.07375, -61.66839 ...",POINT (-61.79912 17.27757),1.0,Antigua and Barbuda,St. John's,20764.0,POINT (-61.8457 17.1172)


In [19]:
# A handful of countries in countries_gdf have no row in capTable; after the
# left merge they are the rows whose 'Capital' is still missing.
no_match_cap = countries_w_caps.loc[countries_w_caps['Capital'].isna()]
print(no_match_cap.shape[0])
no_match_cap

14


Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centroid,centroidOut,Country,Capital,capPop,capLoc
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",POINT (33.36206 34.83019),1.0,,,,
8,Antarctica,AQ,,0,,"MULTIPOLYGON (((-57.28856 -63.22003, -57.25061...",POINT (27.94983 -75.41679),0.0,,,,
29,Bouvet Island,BV,Norway,0,,"MULTIPOLYGON (((3.41075 -54.40417, 3.41331 -54...",POINT (3.40985 -54.43381),0.0,,,,
31,British Indian Ocean Territory,IO,United Kingdom,0,,"MULTIPOLYGON (((72.44417 -7.23714, 72.44958 -7...",POINT (72.15185 -6.67208),1.0,,,,
47,Christmas Island,CX,Australia,1692,,"POLYGON ((105.70453 -10.41506, 105.70453 -10.4...",POINT (105.63636 -10.4854),0.0,,,,
48,Cocos Islands,CC,Australia,593,,"MULTIPOLYGON (((96.92292 -12.19583, 96.92042 -...",POINT (96.84159 -12.13494),1.0,,,,
77,French Southern and Antarctic Lands,TF,France,0,,"MULTIPOLYGON (((77.54511 -37.80956, 77.54839 -...",POINT (68.60178 -48.86756),1.0,,,,
95,Heard Island and McDonald Islands,HM,Australia,0,,"MULTIPOLYGON (((73.30075 -52.97333, 73.30325 -...",POINT (73.51013 -53.09405),0.0,,,,
162,Norfolk Island,NF,Australia,2188,,"MULTIPOLYGON (((167.91208 -29.00419, 167.91333...",POINT (167.93851 -29.04268),0.0,,,,
175,Pitcairn Islands,PN,United Kingdom,35,,"MULTIPOLYGON (((-124.78083 -24.66214, -124.777...",POINT (-128.4846 -24.43028),1.0,,,,


In [20]:
# Print the distinct 'COUNTRY' names that still lack a capital, as a plain list.
# Give this list to ChatGPT with an explanation, and ask for a dictionary of the
# missing info for each of these countries/territories.
print(no_match_cap['COUNTRY'].drop_duplicates().tolist())

['Akrotiri and Dhekelia', 'Antarctica', 'Bouvet Island', 'British Indian Ocean Territory', 'Christmas Island', 'Cocos Islands', 'French Southern and Antarctic Lands', 'Heard Island and McDonald Islands', 'Norfolk Island', 'Pitcairn Islands', 'Saint Barthelemy', 'Saint Martin', 'Svalbard and Jan Mayen', 'Wallis and Futuna']


In [21]:
from shapely import wkt
import geopandas as gpd


def _capital_info(capital=None, population=None, location_wkt=None):
    """Build one `missing_data` entry.

    The WKT string, when given, is parsed into a Shapely geometry; with no
    arguments every field is None.
    """
    return {
        'Capital': capital,
        'capPop': population,
        'capLoc': wkt.loads(location_wkt) if location_wkt is not None else None,
    }


# Capital name, population and location for the countries/territories that
# found no match in the merge. Entries built with no arguments stay empty.
missing_data = {
    'Akrotiri and Dhekelia': _capital_info('Episkopi Cantonment', 1500, 'POINT (32.6242 34.6769)'),
    'Antarctica': _capital_info(),
    'Bouvet Island': _capital_info(),
    'British Indian Ocean Territory': _capital_info('Diego Garcia', 4000, 'POINT (72.4231 -7.3139)'),
    'Christmas Island': _capital_info('Flying Fish Cove', 1843, 'POINT (105.6795 -10.4214)'),
    'Cocos Islands': _capital_info('West Island', 500, 'POINT (96.8292 -12.1888)'),
    'French Southern and Antarctic Lands': _capital_info('Port-aux-Francais', 45, 'POINT (70.2181 -49.3517)'),
    'Heard Island and McDonald Islands': _capital_info(),
    'Norfolk Island': _capital_info('Kingston', 1748, 'POINT (167.9556 -29.0569)'),
    'Pitcairn Islands': _capital_info('Adamstown', 50, 'POINT (-130.1015 -25.0660)'),
    'Saint Barthelemy': _capital_info('Gustavia', 3592, 'POINT (-62.8508 17.8962)'),
    'Saint Martin': _capital_info('Marigot', 5777, 'POINT (-63.0830 18.0731)'),
    'Svalbard and Jan Mayen': _capital_info('Longyearbyen', 2144, 'POINT (15.6333 78.2232)'),
    'Wallis and Futuna': _capital_info('Mata-Utu', 1150, 'POINT (-176.1999 -13.2825)'),
}

# Fill only the values that are still missing in `countries_w_caps`, looking
# each one up by country name. Existing values are never overwritten.
for column in ('Capital', 'capPop'):
    lookup = {country: info[column] for country, info in missing_data.items()}
    countries_w_caps[column] = countries_w_caps[column].fillna(countries_w_caps['COUNTRY'].map(lookup))

# For `capLoc`, the fill values are wrapped in a GeoSeries of geometries
location_lookup = {country: info['capLoc'] for country, info in missing_data.items()}
countries_w_caps['capLoc'] = countries_w_caps['capLoc'].fillna(
    gpd.GeoSeries(countries_w_caps['COUNTRY'].map(location_lookup), crs="EPSG:4326")
)

# Verify the result
countries_w_caps.loc[countries_w_caps['COUNTRY'].isin(missing_data.keys())]

Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centroid,centroidOut,Country,Capital,capPop,capLoc
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",POINT (33.36206 34.83019),1.0,,Episkopi Cantonment,1500.0,POINT (32.6242 34.6769)
8,Antarctica,AQ,,0,,"MULTIPOLYGON (((-57.28856 -63.22003, -57.25061...",POINT (27.94983 -75.41679),0.0,,,,
29,Bouvet Island,BV,Norway,0,,"MULTIPOLYGON (((3.41075 -54.40417, 3.41331 -54...",POINT (3.40985 -54.43381),0.0,,,,
31,British Indian Ocean Territory,IO,United Kingdom,0,,"MULTIPOLYGON (((72.44417 -7.23714, 72.44958 -7...",POINT (72.15185 -6.67208),1.0,,Diego Garcia,4000.0,POINT (72.4231 -7.3139)
47,Christmas Island,CX,Australia,1692,,"POLYGON ((105.70453 -10.41506, 105.70453 -10.4...",POINT (105.63636 -10.4854),0.0,,Flying Fish Cove,1843.0,POINT (105.6795 -10.4214)
48,Cocos Islands,CC,Australia,593,,"MULTIPOLYGON (((96.92292 -12.19583, 96.92042 -...",POINT (96.84159 -12.13494),1.0,,West Island,500.0,POINT (96.8292 -12.1888)
77,French Southern and Antarctic Lands,TF,France,0,,"MULTIPOLYGON (((77.54511 -37.80956, 77.54839 -...",POINT (68.60178 -48.86756),1.0,,Port-aux-Francais,45.0,POINT (70.2181 -49.3517)
95,Heard Island and McDonald Islands,HM,Australia,0,,"MULTIPOLYGON (((73.30075 -52.97333, 73.30325 -...",POINT (73.51013 -53.09405),0.0,,,,
162,Norfolk Island,NF,Australia,2188,,"MULTIPOLYGON (((167.91208 -29.00419, 167.91333...",POINT (167.93851 -29.04268),0.0,,Kingston,1748.0,POINT (167.9556 -29.0569)
175,Pitcairn Islands,PN,United Kingdom,35,,"MULTIPOLYGON (((-124.78083 -24.66214, -124.777...",POINT (-128.4846 -24.43028),1.0,,Adamstown,50.0,POINT (-130.1015 -25.066)


In [22]:
# Same countries as before; each should have capital data now, unless it is uninhabited
countries_w_caps.loc[countries_w_caps['COUNTRY'].isin(list(missing_data))]

Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centroid,centroidOut,Country,Capital,capPop,capLoc
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",POINT (33.36206 34.83019),1.0,,Episkopi Cantonment,1500.0,POINT (32.6242 34.6769)
8,Antarctica,AQ,,0,,"MULTIPOLYGON (((-57.28856 -63.22003, -57.25061...",POINT (27.94983 -75.41679),0.0,,,,
29,Bouvet Island,BV,Norway,0,,"MULTIPOLYGON (((3.41075 -54.40417, 3.41331 -54...",POINT (3.40985 -54.43381),0.0,,,,
31,British Indian Ocean Territory,IO,United Kingdom,0,,"MULTIPOLYGON (((72.44417 -7.23714, 72.44958 -7...",POINT (72.15185 -6.67208),1.0,,Diego Garcia,4000.0,POINT (72.4231 -7.3139)
47,Christmas Island,CX,Australia,1692,,"POLYGON ((105.70453 -10.41506, 105.70453 -10.4...",POINT (105.63636 -10.4854),0.0,,Flying Fish Cove,1843.0,POINT (105.6795 -10.4214)
48,Cocos Islands,CC,Australia,593,,"MULTIPOLYGON (((96.92292 -12.19583, 96.92042 -...",POINT (96.84159 -12.13494),1.0,,West Island,500.0,POINT (96.8292 -12.1888)
77,French Southern and Antarctic Lands,TF,France,0,,"MULTIPOLYGON (((77.54511 -37.80956, 77.54839 -...",POINT (68.60178 -48.86756),1.0,,Port-aux-Francais,45.0,POINT (70.2181 -49.3517)
95,Heard Island and McDonald Islands,HM,Australia,0,,"MULTIPOLYGON (((73.30075 -52.97333, 73.30325 -...",POINT (73.51013 -53.09405),0.0,,,,
162,Norfolk Island,NF,Australia,2188,,"MULTIPOLYGON (((167.91208 -29.00419, 167.91333...",POINT (167.93851 -29.04268),0.0,,Kingston,1748.0,POINT (167.9556 -29.0569)
175,Pitcairn Islands,PN,United Kingdom,35,,"MULTIPOLYGON (((-124.78083 -24.66214, -124.777...",POINT (-128.4846 -24.43028),1.0,,Adamstown,50.0,POINT (-130.1015 -25.066)


In [23]:
# The 'Country' column brought in by the merge duplicates 'COUNTRY', so drop it.
# Shapefile field names can be at most 10 characters long, so the
# 11-character 'centroidOut' is shortened to 'centOut'.
countries_w_caps = (
    countries_w_caps
    .drop(columns=['Country'])
    .rename(columns={'centroidOut': 'centOut'})
)

In [24]:
# Check the data types of the columns in the GeoDataFrame
# (three geometry columns are expected: geometry, centroid and capLoc)
print(countries_w_caps.dtypes)

COUNTRY         object
ISO             object
COUNTRYAFF      object
Population       int64
formalName      object
geometry      geometry
centroid      geometry
centOut        float64
Capital         object
capPop         float64
capLoc        geometry
dtype: object


In [25]:
# Population field needs to be an Int64
# ('Int64' with a capital I is pandas' nullable integer dtype, as opposed to
# the NumPy 'int64' reported by the dtypes check above)
countries_w_caps['Population'] = countries_w_caps['Population'].astype('Int64')

In [26]:
# Final look at the merged table before saving
countries_w_caps

Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centroid,centOut,Capital,capPop,capLoc
0,Afghanistan,AF,Afghanistan,34262840,Islamic Emirate of Afghanistan,"POLYGON ((74.88986 37.23409, 74.88962 37.23314...",POINT (65.98786 33.75477),0.0,Kabul,4011770.0,POINT (69.1725 34.5289)
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",POINT (33.36206 34.83019),1.0,Episkopi Cantonment,1500.0,POINT (32.6242 34.6769)
2,Albania,AL,Albania,2402113,Republic of Albania,"MULTIPOLYGON (((20.0789 42.5558, 20.07939 42.5...",POINT (20.06737 41.12698),0.0,Tirana,475577.0,POINT (19.8189 41.3275)
3,Algeria,DZ,Algeria,46700000,People's Democratic Republic of Algeria,"MULTIPOLYGON (((8.64188 36.94206, 8.64196 36.9...",POINT (2.68012 27.89881),0.0,Algiers,2693542.0,POINT (3.042 36.7525)
4,American Samoa,AS,United States,49710,,"MULTIPOLYGON (((-168.17253 -14.55294, -168.173...",POINT (-170.3959 -14.21789),1.0,Pago Pago,48526.0,POINT (-170.7025 -14.2781)
5,Andorra,AD,Andorra,86398,Principality of Andorra,"POLYGON ((1.7258 42.5044, 1.71149 42.49224, 1....",POINT (1.57627 42.5454),0.0,Andorra la Vella,22614.0,POINT (1.5211 42.5078)
6,Angola,AO,Angola,35121734,Republic of Angola,"MULTIPOLYGON (((23.99928 -10.89013, 23.99943 -...",POINT (17.54511 -12.24251),0.0,Luanda,7774200.0,POINT (13.2343 -8.8368)
7,Anguilla,AI,United Kingdom,15780,,"MULTIPOLYGON (((-62.93497 18.30292, -62.93414 ...",POINT (-63.0639 18.2312),0.0,The Valley,1402.0,POINT (-63.0578 18.217)
8,Antarctica,AQ,,0,,"MULTIPOLYGON (((-57.28856 -63.22003, -57.25061...",POINT (27.94983 -75.41679),0.0,,,
9,Antigua and Barbuda,AG,Antigua and Barbuda,103603,Antigua and Barbuda,"MULTIPOLYGON (((-61.66928 17.07375, -61.66839 ...",POINT (-61.79912 17.27757),1.0,St. John's,20764.0,POINT (-61.8457 17.1172)


#### Amazing. Now let's save our work
##### Again, divided into shapefiles, and a multi-geometry parquet

In [27]:
# Split the multi-geometry table into three single-geometry layers. Each layer
# keeps all attribute columns and exactly one of the geometry columns.

# country outlines: 'geometry' stays the geometry column
countries_geom = countries_w_caps.drop(columns=['centroid', 'capLoc'])

# centroids: 'centroid' becomes the geometry column
countries_centroids = (
    countries_w_caps
    .drop(columns=['geometry', 'capLoc'])
    .set_geometry('centroid')
)

# capitals: 'capLoc' becomes the geometry column
countries_caps = (
    countries_w_caps
    .drop(columns=['geometry', 'centroid'])
    .set_geometry('capLoc')
)

In [54]:
# Preview the outline layer: only 'geometry' should remain as a geometry column
countries_geom.head()

Unnamed: 0,COUNTRY,ISO,COUNTRYAFF,Population,formalName,geometry,centOut,Capital,capPop
0,Afghanistan,AF,Afghanistan,34262840,Islamic Emirate of Afghanistan,"POLYGON ((74.88986 37.23409, 74.88962 37.23314...",0.0,Kabul,4011770.0
1,Akrotiri and Dhekelia,,United Kingdom,18195,,"MULTIPOLYGON (((33.90121 35.09612, 33.90185 35...",1.0,Episkopi Cantonment,1500.0
2,Albania,AL,Albania,2402113,Republic of Albania,"MULTIPOLYGON (((20.0789 42.5558, 20.07939 42.5...",0.0,Tirana,475577.0
3,Algeria,DZ,Algeria,46700000,People's Democratic Republic of Algeria,"MULTIPOLYGON (((8.64188 36.94206, 8.64196 36.9...",0.0,Algiers,2693542.0
4,American Samoa,AS,United States,49710,,"MULTIPOLYGON (((-168.17253 -14.55294, -168.173...",1.0,Pago Pago,48526.0


In [28]:
# Save each single-geometry layer as its own ESRI Shapefile in the checkpoint4 folder
for layer_gdf, layer_name in (
    (countries_caps, "countries_caps"),
    (countries_centroids, "countries_centroids"),
    (countries_geom, "countries_geom"),
):
    layer_gdf.to_file(
        f"GIS Tutorials/Geog-312/geopandas_Files/checkpoint4/{layer_name}.shp",
        driver='ESRI Shapefile',
    )


In [29]:
# And as multi-geometry parquet: the full table, with the geometry, centroid
# and capLoc columns together, is written to a single file
countries_w_caps.to_parquet("GIS Tutorials/Geog-312/geopandas_Files/checkpoint4/countries.parquet")