# Generate Source Data

In [None]:
from geonamescache import GeonamesCache
from geonamescache.mappers import country
from collections import defaultdict
import polars as pl
from utils import CityCountry
from pathlib import Path
import inflect
import pandas as pd
import numpy as np
import random
# Absolute path two levels above the current working directory; the dataset
# paths used in this notebook are built by prefixing this string.
parent_dir = str(Path.cwd().resolve().parents[1])

## 1. Extract <City, Country> pairs from GeoNames

In [17]:
country_mapper = country(from_key='iso', to_key='name')
engine = inflect.engine()
gc = GeonamesCache()

# A country name containing any of these substrings (compared
# case-insensitively) is prefixed with "the", e.g. "the United Kingdom".
# NOTE(review): this is a substring match, so every name containing "South",
# "North" or "of" also receives the article; kept as-is so the wording of the
# generated statements does not change.
ARTICLE_KEYWORDS = tuple(
    keyword.lower()
    for keyword in ['Republic', 'United', 'Kingdom', 'of', 'Territory',
                    'South', 'North', 'Island', 'Islands', 'Coast', 'Central', 'Netherlands']
)
# Only cities with strictly more inhabitants than this are kept.
MIN_POPULATION = 30_000
# Fields a city record must carry to be usable.
REQUIRED_FIELDS = ('name', 'countrycode', 'population')

# All distinct, non-empty city names in the GeoNames cache.
city_set = {entry['name'] for entry in gc.get_cities().values() if entry['name'] != ''}

city_dict = defaultdict(set)        # city name -> country strings it is located in
city_population = defaultdict(int)  # city name -> max population associated with the city

# Walk the city records once, in the cache's own order. Looking every name up
# with gc.get_cities_by_name() returns exactly the records whose 'name' equals
# that name, so this visits the same records -- without one lookup per name and
# without iterating a set, whose order changes between interpreter runs.
for city in gc.get_cities().values():
    if not all(field in city for field in REQUIRED_FIELDS):
        continue
    if city['name'] == '':
        continue

    country_name = country_mapper(city['countrycode'])
    population = city['population']
    if country_name is None or population is None:
        continue
    if population <= MIN_POPULATION:
        continue

    # US cities are disambiguated by their alphabetic admin1 code.
    admin1code = city.get('admin1code')
    if country_name == 'United States' and admin1code is not None and admin1code.isalpha():
        country_name = f"{country_name}, {admin1code}"

    # Add the definite article for keyword matches and for names that
    # inflect recognises as plural nouns (singular_noun() returns False
    # when the word is not a plural).
    lowered = country_name.lower()
    if any(keyword in lowered for keyword in ARTICLE_KEYWORDS):
        country_name = 'the ' + country_name
    elif engine.singular_noun(country_name) is not False:
        country_name = 'the ' + country_name

    city_dict[city['name']].add(country_name)
    city_population[city['name']] = max(city_population[city['name']], population)

# Sorted lists instead of list(set): the order of a city's countries is then
# identical on every run.
city_dict = {name: sorted(countries) for name, countries in city_dict.items()}
len(city_dict)

14680

In [None]:
### Build the data-generation object from the city -> countries mapping
db = CityCountry(city_dict, category='cities')

# Full dataset with all cities and countries, written out as JSON.
data = db.generate_full_dataset()
full_dataset_path = f"{parent_dir}/datasets/generators/city_country.json"
data.write_json(full_dataset_path)

In [None]:
# Downsample the full dataset to 5500 samples built from 1400 cities:
# the 700 most populous ones plus 700 drawn at random from the remainder.

# Ties in population are broken by name so the cut-off at 700 does not depend
# on the insertion order of city_population.
top_cities = sorted(city_population, key=lambda name: (-city_population[name], name))[:700]

# Sort the remaining cities and draw them from a seeded generator. The previous
# unseeded np.random.choice over list(set) picked different cities on every run.
remaining_cities = sorted(set(db.keys) - set(top_cities))
rng = np.random.default_rng(42)
# .tolist() turns the NumPy string scalars back into plain Python str.
other_cities = rng.choice(remaining_cities, 700, replace=False).tolist()

cities = top_cities + other_cities
subsample = db.generate_subsample(n=5500, seed=42, objects=cities).with_columns(
                 pl.col("correct_object_2").list.join(", ").alias("correct_object_2"))
subsample.write_csv(f"{parent_dir}/datasets/_city_country_subsample.csv")
## THIS PART OF CODE MIGHT OUTPUT DIFFERENT RESULT (DUE TO THE ISSUE WITH THE POLARS LIBRARY) -- THE DATASET WE USED IS PROVIDED IN THE REPOSITORY

In [None]:
# Number of subsample rows for each (correct, negated) combination.
subsample.group_by(['correct', 'negated']).len()

# Synthetic Cities and Countries
Here, we generate synthetic names for countries and cities. 
Generated names are stored in `datasets/generators/synthetic/*_raw.txt`.

In [30]:
def validate_city_name(name):
    """Return True when `name` is found in the GeoNames cache.

    Both `gc.search_cities` and `gc.get_cities_by_name` are queried; the name
    counts as found as soon as either of them returns at least one result.
    """
    search_hits = gc.search_cities(name)
    exact_hits = gc.get_cities_by_name(name)
    return len(search_hits) > 0 or len(exact_hits) > 0

def validate_country_name(name):
    """Return True when `name` is one of the values held in `db.values`."""
    return name in db.values

In [32]:
# Fix the seeds of polars and of the stdlib RNG.
pl.set_random_seed(42)
random.seed(42)

# Sorted list of the distinct `object_1` values of the full dataset.
city_names = data['object_1'].unique().to_list()
city_names.sort()

# Sorted list of the values exposed by the generator object.
country_names = list(db.values)
country_names.sort()

In [None]:
from namemaker import NameSet, validate_town
import namemaker

# Seed both the stdlib RNG and the RNG object handed out by namemaker.
seed = 'udaxihhexdvxrcsnbacghqtargwuwr'
random.seed(seed)
namemaker_rng = namemaker.get_rng()
namemaker_rng.seed(seed)

# Generate 400 candidate city names from a name set built on the real ones.
city_NS = NameSet(names = city_names)
cities_synth = [city_NS.make_name(add_to_history=False, validation_func=validate_town) for _ in range(400)]
# Drop duplicates while keeping generation order. list(set(...)) would order
# the names by string hash, which differs between interpreter runs and made
# the output file irreproducible despite the seeding above.
cities_synth = list(dict.fromkeys(cities_synth))

# Validate: keep only candidates that are NOT found in GeoNames.
cities_validated = [item for item in cities_synth if not validate_city_name(item)]

# Written next to countries_raw.txt, in the directory the *_checked.csv files
# are later read from (the path previously pointed at datasets/synthetic/).
# City names may contain non-ASCII characters, hence the explicit encoding.
with open(f"{parent_dir}/datasets/generators/synthetic/cities_raw.txt", 'w', encoding='utf-8') as f:
    f.write("\n".join(map(str, cities_validated)))

In [None]:
from namemaker import NameSet
import namemaker

# Seed both the stdlib RNG and the RNG object handed out by namemaker.
seed = 'udaxihhexdvxrcsnbacghqtargwuwr'
random.seed(seed)
namemaker_rng = namemaker.get_rng()
namemaker_rng.seed(seed)

# Generate 250 candidate country names from a name set built on the real ones.
country_NS = NameSet(names = country_names, order=2)
country_synth = [country_NS.make_name(add_to_history=False, n_candidates=5) for _ in range(250)]
# Drop duplicates while keeping generation order. With list(set(...)) the
# iteration order depended on string hashing, so the random draws below were
# paired with different names on every run even though the RNG is seeded.
country_synth = list(dict.fromkeys(country_synth))

# Templates used to decorate a share of the generated names.
COUNTRY_TEMPLATES = ['the {name} Islands', 'the Republic of {name}', 'the {name} Kingdom', 'West {name}', 'East {name}', 'North {name}', 'South {name}', '{name}land']

# Validate: keep only candidates that are not among the known country values,
# and wrap those whose random draw exceeds 0.75 in a randomly chosen template.
countries_validated = []
for item in country_synth:
    if validate_country_name(item):
        continue
    if random.random() > 0.75:
        template = random.choice(COUNTRY_TEMPLATES)
        item = template.format(name=item)
    countries_validated.append(item)

# Explicit encoding: generated names may contain non-ASCII characters.
with open(f"{parent_dir}/datasets/generators/synthetic/countries_raw.txt", 'w', encoding='utf-8') as f:
    f.write("\n".join(map(str, countries_validated)))

## Create Dataset of Unverifiable Statements
Here, we load the manually checked name lists (i.e., the filtered raw files) and keep only the names marked with `Keep == 1`.

In [None]:
def _kept_names(csv_path):
    """Return the `Name` entries of the rows whose `Keep` column equals 1."""
    table = pd.read_csv(csv_path)
    return table.loc[table['Keep'] == 1, 'Name'].tolist()


synth_cities = _kept_names(f"{parent_dir}/datasets/generators/synthetic/cities_checked.csv")
synth_countries = _kept_names(f"{parent_dir}/datasets/generators/synthetic/countries_checked.csv")

# Re-seed so the city -> countries assignment is the same on every run, then
# give each synthetic city two distinct synthetic countries.
random.seed(seed)
synth_dict = {city: random.sample(synth_countries, 2) for city in synth_cities}

In [None]:
# Generator object for the synthetic (unverifiable) city/country pairs.
synth_db = CityCountry(synth_dict, category='cities', is_fake=True) #is_fake -> is_unverifiable

# Full synthetic dataset, written out as JSON.
synth_data = synth_db.generate_full_dataset()
synth_json_path = f"{parent_dir}/datasets/source/city_country_synthetic.json"
synth_data.write_json(synth_json_path)

# Subsample of 2000 rows; the list column `correct_object_2` is flattened to a
# comma-separated string before the CSV export.
joined_countries = pl.col("correct_object_2").list.join(", ").alias("correct_object_2")
synth_subsample = synth_db.generate_subsample(n=2000, seed=42).with_columns(joined_countries)
synth_csv_path = f"{parent_dir}/datasets/city_locations_synthetic.csv"
synth_subsample.write_csv(synth_csv_path)

# Number of rows for each (correct, negated) combination.
synth_subsample.group_by(['correct', 'negated']).len()