In [36]:
import pandas as pd
import numpy as np
from io import StringIO
import rasterio
from tqdm import tqdm

In [3]:
df_urls = pd.read_csv('../resources/mp16_urls.csv', names=['IMG_ID', 'URL'])
df_filtered = pd.read_csv('../resources/MP16_Pro_filtered.csv')
df_places = pd.read_csv('../resources/mp16_places365.csv')

In [4]:
df_urls.head()

Unnamed: 0,IMG_ID,URL
0,92/17/5276763594.jpg,http://farm6.staticflickr.com/5042/5276763594_...
1,0d/ce/6392770405.jpg,http://farm8.staticflickr.com/7172/6392770405_...
2,2a/88/5268406683.jpg,http://farm6.staticflickr.com/5045/5268406683_...
3,82/be/2515710583.jpg,http://farm3.staticflickr.com/2389/2515710583_...
4,03/05/9498368699.jpg,http://farm4.staticflickr.com/3800/9498368699_...


In [5]:
df_filtered.head()

Unnamed: 0,IMG_ID,AUTHOR,LAT,LON,S3_Label,S16_Label,S365_Label,Prob_indoor,Prob_natural,Prob_urban,neighbourhood,city,county,state,region,country,country_code,continent
0,92_17_5276763594.jpg,42441750@N03,38.685568,-109.532951,1.0,7.0,289.0,0.000173984,0.897409,0.102417,,,Grand County,Utah,,United States,us,
1,0d_ce_6392770405.jpg,68149505@N00,34.933793,103.692741,0.0,1.0,122.0,0.9968868,0.000578,0.002535,,Lianlu,Kangle County,Gansu,Linxia,China,cn,
2,2a_88_5268406683.jpg,84867026@N00,39.983433,-75.243301,0.0,0.0,128.0,0.7201538,0.034871,0.244975,Overbrook,Philadelphia,Philadelphia County,Pennsylvania,,United States,us,
3,82_be_2515710583.jpg,75292316@N00,39.306094,-84.379291,1.0,6.0,145.0,9.05069e-05,0.516982,0.482927,,,Butler County,Ohio,,United States,us,
4,03_05_9498368699.jpg,61068860@N00,9.186625,123.581597,1.0,8.0,36.0,9.902391e-07,0.999983,1.6e-05,,Siquijor,,Siquijor,Central Visayas,Philippines,ph,


In [6]:
df_places.head()

Unnamed: 0,IMG_ID,AUTHOR,LAT,LON,S3_Label,S16_Label,S365_Label,Prob_indoor,Prob_natural,Prob_urban
0,92/17/5276763594.jpg,42441750@N03,38.685568,-109.532951,1,7,289,0.000173984,0.897409,0.102417
1,0d/ce/6392770405.jpg,68149505@N00,34.933793,103.692741,0,1,122,0.9968868,0.000578,0.002535
2,2a/88/5268406683.jpg,84867026@N00,39.983433,-75.243301,0,0,128,0.7201538,0.034871,0.244975
3,82/be/2515710583.jpg,75292316@N00,39.306094,-84.379291,1,6,145,9.05069e-05,0.516982,0.482927
4,03/05/9498368699.jpg,61068860@N00,9.186625,123.581597,1,8,36,9.902391e-07,0.999983,1.6e-05


In [7]:
df_urls.rename(columns={'IMG_ID_PATH': 'IMG_ID'}, inplace=True)
df_filtered.rename(columns={'IMG_ID_UNDERSCORE': 'IMG_ID'}, inplace=True)

df_urls['Merge_Key'] = df_urls['IMG_ID'].str.replace('/', '_')

df_combined = pd.merge(
    df_filtered, 
    df_urls[['URL', 'Merge_Key']], 
    left_on='IMG_ID', 
    right_on='Merge_Key', 
    how='inner'
)

df_combined.drop(columns=['Merge_Key'], inplace=True)

In [8]:
df_combined.head()

Unnamed: 0,IMG_ID,AUTHOR,LAT,LON,S3_Label,S16_Label,S365_Label,Prob_indoor,Prob_natural,Prob_urban,neighbourhood,city,county,state,region,country,country_code,continent,URL
0,92_17_5276763594.jpg,42441750@N03,38.685568,-109.532951,1.0,7.0,289.0,0.000173984,0.897409,0.102417,,,Grand County,Utah,,United States,us,,http://farm6.staticflickr.com/5042/5276763594_...
1,0d_ce_6392770405.jpg,68149505@N00,34.933793,103.692741,0.0,1.0,122.0,0.9968868,0.000578,0.002535,,Lianlu,Kangle County,Gansu,Linxia,China,cn,,http://farm8.staticflickr.com/7172/6392770405_...
2,2a_88_5268406683.jpg,84867026@N00,39.983433,-75.243301,0.0,0.0,128.0,0.7201538,0.034871,0.244975,Overbrook,Philadelphia,Philadelphia County,Pennsylvania,,United States,us,,http://farm6.staticflickr.com/5045/5268406683_...
3,82_be_2515710583.jpg,75292316@N00,39.306094,-84.379291,1.0,6.0,145.0,9.05069e-05,0.516982,0.482927,,,Butler County,Ohio,,United States,us,,http://farm3.staticflickr.com/2389/2515710583_...
4,03_05_9498368699.jpg,61068860@N00,9.186625,123.581597,1.0,8.0,36.0,9.902391e-07,0.999983,1.6e-05,,Siquijor,,Siquijor,Central Visayas,Philippines,ph,,http://farm4.staticflickr.com/3800/9498368699_...


In [30]:
with rasterio.open("../resources/koppen_geiger.tif") as src:
    read = src.read(1)

In [31]:
df_kg = pd.read_csv('../resources/kg_leg.csv', quotechar='"', skipinitialspace=True)

In [32]:
def get_climate(lat, lon):
    row, col = src.index(lon, lat)
    value = read[row, col]
    if value == 0:
        return "other"
    climate = df_kg["Climate"][value-1]
    return climate

In [33]:
def create_location_caption(row):
    caption = "A geo-tagged image"
    location_parts = []
    
    # Location
    if pd.notna(row['city']):
        location_parts.append(row['city'])
    if pd.notna(row['state']):
        location_parts.append(row['state'])
    if pd.notna(row['country']):
        location_parts.append(row['country'])

    if location_parts:
        caption += f" taken in {', '.join(location_parts)}."
    else:
        caption += "."

    if (row['Prob_indoor'] > 0.8):
        caption += " This image was taken indoors."
    if (row['Prob_natural'] > 0.8):
        caption += " This image was taken in nature."
    if (row['Prob_indoor'] > 0.8):
        caption += " This image was taken in a urban setting."
        
    return caption

In [34]:
def create_climate_caption(row):
    caption = "A geo-tagged image"
    
    # Climate
    if pd.notna(row['LAT']) and pd.notna(row['LON']):
        caption += f" taken in a {get_climate(row['LAT'], row['LON'])} climate."
    
    return caption

In [35]:
def create_traffic_caption(row):
    caption = "A geo-tagged image"

    left_driving_countries = [
        "Antigua and Barbuda",
        "Australia",
        "Bahamas",
        "Bangladesh",
        "Barbados",
        "Bhutan",
        "Botswana",
        "Brunei",
        "Cyprus",
        "Dominica",
        "Eswatini",
        "Guyana",
        "India",
        "Indonesia",
        "Ireland",
        "Jamaica",
        "Japan",
        "Kenya",
        "Lesotho",
        "Malawi",
        "Malaysia",
        "Maldives",
        "Malta",
        "Mauritius",
        "Mozambique",
        "Namibia",
        "Nepal",
        "New Zealand",
        "Pakistan",
        "Papua New Guinea",
        "Saint Kitts and Nevis",
        "Saint Lucia",
        "Saint Vincent and the Grenadines",
        "Samoa",
        "Seychelles",
        "Singapore",
        "Solomon Islands",
        "South Africa",
        "Sri Lanka",
        "Tanzania",
        "Thailand",
        "Uganda",
        "United Kingdom",
        "Zambia",
        "Zimbabwe"
    ]

    # Driving Side
    if row['country'] in left_driving_countries:
        caption += "taken in a left side driving country."
    else:
        caption += " taken in a right side driving country."

    return caption


In [37]:
loc_captions = []
cli_captions = []
tra_captions = []

for index, row in tqdm(df_combined.iterrows(), total=len(df_combined), desc="Building captions"):
    loc_caption = create_location_caption(row)
    cli_caption = create_climate_caption(row)
    tra_caption = create_traffic_caption(row)
    loc_captions.append(loc_caption)
    cli_captions.append(cli_caption)
    tra_captions.append(tra_caption)

df_combined['loc_caption'] = loc_captions
df_combined['cli_caption'] = cli_captions
df_combined['tra_caption'] = tra_captions

Building captions: 100%|██████████| 4122119/4122119 [02:58<00:00, 23113.04it/s]


In [41]:
df_combined.head()["loc_caption"]

0    A geo-tagged image taken in Utah, United State...
1    A geo-tagged image taken in Lianlu, Gansu, Chi...
2    A geo-tagged image taken in Philadelphia, Penn...
3     A geo-tagged image taken in Ohio, United States.
4    A geo-tagged image taken in Siquijor, Siquijor...
Name: loc_caption, dtype: object

In [42]:
df_combined.to_csv('../resources/mp16_combined.csv', index=False)

In [58]:
random_row = df_combined.sample(n=1).iloc[0]

loc_caption = random_row['loc_caption']
cli_caption = random_row['cli_caption']
tra_caption = random_row['tra_caption']

image_url = random_row['URL']

print("Location:", loc_caption)
print("Driving Side:", cli_caption)
print("Traffic Info:", tra_caption)


from IPython.display import Image, display
display(Image(url=image_url))

Location: A geo-tagged image taken in City of London, England, United Kingdom.
Driving Side: A geo-tagged image taken in a Temperate, no dry season, warm summer climate.
Traffic Info: A geo-tagged imagetaken in a left side driving country.
