In [6]:
%pip install --upgrade geonamescache


Note: you may need to restart the kernel to use updated packages.


In [3]:
import geonamescache
import pandas as pd


In [4]:
# Initialize cache and pull cities dict
gc = geonamescache.GeonamesCache()
cities_dict = gc.get_cities()

# Convert to DataFrame
df_cities = pd.DataFrame(cities_dict).T   # transpose so each row is one city
df_cities = df_cities[[
    'name',
    'countrycode',
    'latitude',
    'longitude',
    'population'
]]

# Save for reproducibility
df_cities.to_csv('../data/world_cities_updated.csv', index=False)

# Preview
df_cities.head()


Unnamed: 0,name,countrycode,latitude,longitude,population
3040051,les Escaldes,AD,42.50729,1.53414,15853
3041563,Andorra la Vella,AD,42.50779,1.52109,20430
290594,Umm Al Quwain City,AE,25.56473,55.55517,62747
291074,Ras Al Khaimah City,AE,25.78953,55.9432,351943
291580,Zayed City,AE,23.65416,53.70522,63482


In [5]:
import pandas as pd

# 1. Reload the data
df = pd.read_csv('../data/world_cities_updated.csv')

# 2. Quick overview
print(df.info())
display(df.describe())

# 3. Check for missing or zero values
print("Missing values:\n", df.isna().sum())
print("Zero populations:", (df['population'] == 0).sum())

# 4. Drop any rows with missing coords or name
df = df.dropna(subset=['name','latitude','longitude'])

# 5. Convert types explicitly
df['population'] = df['population'].astype(int)
df['latitude']   = df['latitude'].astype(float)
df['longitude']  = df['longitude'].astype(float)

# 6. Filter out very small towns (e.g., pop < 5k)
df = df[df['population'] >= 5000].reset_index(drop=True)

# 7. Feature engineering: log-population as proxy for urban size
import numpy as np
df['log_population'] = np.log1p(df['population'])

# Preview cleaned data
display(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26463 entries, 0 to 26462
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         26463 non-null  object 
 1   countrycode  26449 non-null  object 
 2   latitude     26463 non-null  float64
 3   longitude    26463 non-null  float64
 4   population   26463 non-null  int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 1.0+ MB
None


Unnamed: 0,latitude,longitude,population
count,26463.0,26463.0,26463.0
mean,27.859795,17.95317,123019.5
std,22.624551,73.026715,527923.2
min,-54.81084,-176.17453,0.0
25%,15.73213,-46.31347,21986.5
50%,34.05,16.46829,35604.0
75%,43.78596,77.61194,74433.0
max,78.22334,179.36451,22315470.0


Missing values:
 name            0
countrycode    14
latitude        0
longitude       0
population      0
dtype: int64
Zero populations: 14


Unnamed: 0,name,countrycode,latitude,longitude,population,log_population
0,les Escaldes,AD,42.50729,1.53414,15853,9.671177
1,Andorra la Vella,AD,42.50779,1.52109,20430,9.924809
2,Umm Al Quwain City,AE,25.56473,55.55517,62747,11.046882
3,Ras Al Khaimah City,AE,25.78953,55.9432,351943,12.771227
4,Zayed City,AE,23.65416,53.70522,63482,11.058527
