# Download a subset of Global Streetscapes data

This notebook demonstrates how to filter the Global Streetscapes dataset to find the desired subset of data.
As an example, we show how to filter for daytime images from Singapore, and how to prepare the input CSV file that `download_jpegs.py` uses to download the required JPEGs.

In [18]:
import pandas as pd
import country_converter as coco

## Load file with available points

In [None]:
# City information is stored in `simplemaps.csv`, which can be downloaded from
# https://huggingface.co/datasets/NUS-UAL/global-streetscapes/resolve/main/data/simplemaps.csv?download=true
# Update the path below to wherever the desired csv file was saved.
df_all = pd.read_csv("../data/simplemaps.csv")

  df_all = pd.read_csv(


## Filter based on city

In [3]:
# Single-city alternative (disabled): keep only the rows of one city.
#df_city = df_all[df_all["city"] == "Singapore"]

# Active selection: a reproducible random sample of 10,000 rows (fixed seed).
df_city = df_all.sample(n=10_000, random_state=42)

# Show which countries are present in the selection. With the single-city filter
# this should be a single value (otherwise the city name is not unique); with the
# random sample, many countries are expected.
df_city["country"].unique()

array(['Japan', 'United States', 'Taiwan', 'Guyana', 'Greece', 'Estonia',
       'Brazil', 'Malta', 'Hungary', 'Guatemala', 'Latvia', 'Philippines',
       'Bolivia', 'Bosnia And Herzegovina', 'Algeria', 'Germany',
       'Malaysia', 'Switzerland', 'France', 'Bangladesh', 'Moldova',
       'Russia', 'Andorra', 'Chile', 'Netherlands', 'Finland', 'Thailand',
       'Bulgaria', 'Libya', 'Uganda', 'Luxembourg', 'French Guiana',
       'Canada', 'Tajikistan', 'Poland', 'Mayotte', 'Mali', 'Serbia',
       'India', 'Austria', 'Colombia', 'Tanzania', 'Mexico', 'Lithuania',
       'Belgium', 'Peru', 'New Zealand', 'Indonesia', 'United Kingdom',
       'Lesotho', 'Czechia', 'Curaçao', 'Portugal', 'Guinea', 'Australia',
       'Kosovo', 'Liberia', 'Hong Kong', 'Qatar', 'Costa Rica', 'Croatia',
       'Denmark', 'Myanmar', 'Ecuador', 'South Africa', 'Azerbaijan',
       'Mauritius', 'Bahrain', 'Israel', 'Italy', 'Morocco', 'Ireland',
       'Sweden', 'Albania', 'Brunei', 'United Arab Emirates', 'R

## Filter based on contextual information: lighting condition

In [None]:
# Read the contextual information table from disk.
df_contextual = pd.read_csv(filepath_or_buffer="../data/contextual.csv")

  df_contextual = pd.read_csv("../data/contextual.csv")


In [5]:
# Attach the contextual columns to the selected rows. Rows are matched on the
# three shared key columns; the (default) inner join keeps only rows present in
# both tables.
df_city_merged = pd.merge(
    df_city,
    df_contextual,
    how="inner",
    on=["uuid", "source", "orig_id"],
)
# List the distinct lighting conditions found after the merge.
df_city_merged["lighting_condition"].unique()

array(['day', 'dusk/dawn', 'night', nan], dtype=object)

In [6]:
# Optional daytime filter, currently disabled: uncomment the two lines below to
# keep only the rows whose lighting condition is `day` and re-check the values.
#df_city_merged = df_city_merged[df_city_merged["lighting_condition"] == "day"]
#df_city_merged["lighting_condition"].unique()

In [7]:
# (number of rows, number of columns) of the merged table
df_city_merged.shape

(10000, 23)

In [20]:
# List the column names of the merged table.
# NOTE(review): `count_per_country` and `iso-3` are created by the next code cell,
# so they only show up here if that cell has already been executed.
df_city_merged.columns

Index(['uuid', 'source', 'orig_id', 'city', 'city_ascii', 'city_id',
       'city_lat', 'city_lon', 'country', 'iso2', 'iso3', 'admin_name',
       'capital', 'population', 'continent', 'glare', 'lighting_condition',
       'pano_status', 'platform', 'quality', 'reflection', 'view_direction',
       'weather', 'count_per_country', 'iso-3'],
      dtype='object')

In [None]:
# Add feature "count_per_country": the number of rows that share each row's country.
# The "country" column is selected before `transform` so the result is always a
# single Series aligned with the frame's index, rather than depending on how a
# frame-wide `transform("size")` is handled for a multi-column DataFrame.
df_city_merged["count_per_country"] = (
    df_city_merged.groupby("country")["country"].transform("size")
)

# Add ISO-3 column, for visualization with plotly.express
cc = coco.CountryConverter()
df_city_merged["iso-3"] = cc.pandas_convert(series=df_city_merged["country"], to="ISO3")

## Save to csv

In [21]:
# Keep the identifier columns (uuid, source, orig_id) together with the city
# coordinates, the country, and the two derived columns.
df_to_download = df_city_merged.loc[
    :,
    [
        "uuid",
        "source",
        "orig_id",
        "city_lat",
        "city_lon",
        "country",
        "count_per_country",
        "iso-3",
    ],
]
# Write the selection to disk (with pandas defaults the index is written as well).
df_to_download.to_csv("../data/imgs/sampled.csv")