In [3]:
import geopandas as gpd
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import asyncio
import aiohttp
from tqdm.asyncio import tqdm_asyncio
from shapely.geometry import Point

In [2]:
df_train = pd.read_csv('../resources/mp16_combined.csv')

In [6]:
async def _check_one(session, url, timeout=8):
    if not isinstance(url, str) or url.strip() == "":
        return False
    try:
        async with session.head(url, allow_redirects=True, timeout=timeout) as r:
            if r.status == 200:
                return True
            if r.status in (405, 501) or r.status >= 400:
                # fallback to small GET
                async with session.get(url, timeout=timeout) as r2:
                    return 200 <= r2.status < 400
            return False
    except Exception:
        try:
            async with session.get(url, timeout=timeout) as r3:
                return 200 <= r3.status < 400
        except Exception:
            return False

In [13]:
async def filter_alive_async(urls, concurrency, timeout=8):
    connector = aiohttp.TCPConnector(limit=concurrency, force_close=False)
    timeout_obj = aiohttp.ClientTimeout(total=None, sock_connect=timeout, sock_read=timeout)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout_obj) as session:
        sem = asyncio.Semaphore(concurrency)
        async def guarded(u):
            async with sem:
                return await _check_one(session, u, timeout=timeout)
        tasks = [guarded(u) for u in urls]
        # tqdm over asyncio tasks
        results = []
        for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Checking URLs"):
            res = await coro
            results.append(res)
        return results

In [15]:
urls = df_train['URL'].fillna("").astype(str).tolist()
alive_mask = await filter_alive_async(urls, concurrency=200, timeout=8)

Checking URLs: 100%|██████████| 4122119/4122119 [2:54:17<00:00, 394.19it/s]  


In [16]:
df_clean = df_train[pd.Series(alive_mask, index=df_train.index)].reset_index(drop=True)
df_dead = df_train[~pd.Series(alive_mask, index=df_train.index)].reset_index(drop=True)
print(f"kept {len(df_clean)} rows, removed {len(df_dead)} rows")

kept 3784539 rows, removed 337580 rows


In [17]:
gdf = gpd.read_file('../resources/gadm_split_2.gpkg')

In [18]:
pts = gpd.GeoDataFrame(
    df_clean,
    geometry=[Point(xy) for xy in zip(df_clean['LON'], df_clean['LAT'])],
    crs="EPSG:4326"
)

joined = gpd.sjoin(pts, gdf[['geometry']], how='inner', predicate='within')

total = len(df_clean)
df_clean = df_clean.loc[joined.index].reset_index(drop=True)
kept = len(df_clean)
removed = total-kept
print(f"Rows kept inside polygons: {kept}; removed (outside polygons): {removed}")

Rows kept inside polygons: 3635097; removed (outside polygons): 149442


In [19]:
random_row = df_clean[df_clean['Prob_indoor'] > 0.95].sample(n=1).iloc[0]

caption = random_row['caption']
image_url = random_row['URL']

print("Caption:", caption)

from IPython.display import Image, display
display(Image(url=image_url))

Caption: A geo-tagged image taken in Shinagawa, Japan. This image was taken indoors. This image was taken in a urban setting. The climate is Temperate, no dry season, hot summer.


In [20]:
df_clean = df_clean[df_clean['Prob_indoor'] < 0.95].reset_index(drop=True)

In [21]:
df_clean.to_csv('../resources/mp16_combined_clean.csv', index=False)

In [22]:
df_clean

Unnamed: 0,IMG_ID,AUTHOR,LAT,LON,S3_Label,S16_Label,S365_Label,Prob_indoor,Prob_natural,Prob_urban,neighbourhood,city,county,state,region,country,country_code,continent,URL,caption
0,72_be_2780685347.jpg,29729363@N04,52.603256,-1.329860,2.0,12.0,276.0,0.000009,0.000028,0.999963,Peckleton,Hinckley and Bosworth,Leicestershire,England,,United Kingdom,gb,,http://farm4.staticflickr.com/3037/2780685347_...,A geo-tagged image taken in Hinckley and Boswo...
1,6d_10_260656099.jpg,65354895@N00,40.735291,13.888778,2.0,11.0,200.0,0.001493,0.161261,0.837246,,Forio,Napoli,Campania,,Italy,it,,http://farm1.staticflickr.com/83/260656099_fa4...,"A geo-tagged image taken in Forio, Campania, I..."
2,48_59_253000979.jpg,48085290@N00,40.758545,-73.978114,2.0,15.0,112.0,0.006207,0.003282,0.990512,Manhattan,New York,New York County,New York,,United States,us,,http://farm1.staticflickr.com/100/253000979_a7...,"A geo-tagged image taken in New York, New York..."
3,54_44_3224919760.jpg,99735090@N00,-43.149594,147.848854,2.0,11.0,252.0,0.000248,0.061956,0.937797,,Tasman,,Tasmania,,Australia,au,,http://farm4.staticflickr.com/3367/3224919760_...,"A geo-tagged image taken in Tasman, Tasmania, ..."
4,da_4c_4256229752.jpg,14469908@N00,37.764090,-122.466562,2.0,15.0,301.0,0.360779,0.000118,0.639103,Inner Sunset,San Francisco,,California,,United States,us,,http://farm5.staticflickr.com/4065/4256229752_...,"A geo-tagged image taken in San Francisco, Cal..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2934431,f9_f9_1469172988.jpg,13738448@N03,-3.724593,-38.489967,2.0,12.0,351.0,0.039593,0.007851,0.952556,Meireles,Fortaleza,,Ceará,Northeast Region,Brazil,br,,http://farm2.staticflickr.com/1404/1469172988_...,"A geo-tagged image taken in Fortaleza, Ceará, ..."
2934432,bc_7f_572224526.jpg,83738663@N00,33.761640,-84.385369,2.0,15.0,307.0,0.455680,0.038256,0.506064,Old Fourth Ward,Atlanta,Fulton County,Georgia,,United States,us,,http://farm2.staticflickr.com/1376/572224526_c...,"A geo-tagged image taken in Atlanta, Georgia, ..."
2934433,62_ce_6176671469.jpg,13836188@N04,36.960262,-76.328484,2.0,10.0,207.0,0.000306,0.000160,0.999534,,Norfolk,,Virginia,,United States,us,,http://farm7.staticflickr.com/6160/6176671469_...,"A geo-tagged image taken in Norfolk, Virginia,..."
2934434,2b_22_279705584.jpg,38324365@N00,40.256374,-111.665554,0.0,2.0,303.0,0.949491,0.022948,0.027560,Carterville,Provo,Utah County,Utah,,United States,us,,http://farm1.staticflickr.com/85/279705584_500...,"A geo-tagged image taken in Provo, Utah, Unite..."
