In [23]:
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed

import aiohttp
import geopandas as gpd
import pandas as pd
import requests
from IPython.display import Image, display
from shapely.geometry import Point
from tqdm.asyncio import tqdm_asyncio

In [24]:
# Read the combined training CSV into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer='../resources/mp16_combined.csv')

In [26]:
async def _check_one(session, url, timeout=8):
    if not isinstance(url, str) or url.strip() == "":
        return False
    try:
        async with session.head(url, allow_redirects=True, timeout=timeout) as r:
            if r.status == 200:
                return True
            if r.status in (405, 501) or r.status >= 400:
                # fallback to small GET
                async with session.get(url, timeout=timeout) as r2:
                    return 200 <= r2.status < 400
            return False
    except Exception:
        try:
            async with session.get(url, timeout=timeout) as r3:
                return 200 <= r3.status < 400
        except Exception:
            return False

In [27]:
async def filter_alive_async(urls, concurrency, timeout=8):
    """Check every URL concurrently and return one liveness flag per URL.

    Args:
        urls: Iterable of candidate URLs.
        concurrency: Upper bound on simultaneous checks; used both as the
            connector's connection limit and as the semaphore size.
        timeout: Seconds used for the session's socket connect/read timeouts
            and forwarded to ``_check_one`` for each request.

    Returns:
        list[bool]: ``result[i]`` is the verdict for the i-th URL, so the list
        can be used directly as a positional mask over the input.
    """
    connector = aiohttp.TCPConnector(limit=concurrency, force_close=False)
    timeout_obj = aiohttp.ClientTimeout(total=None, sock_connect=timeout, sock_read=timeout)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout_obj) as session:
        sem = asyncio.Semaphore(concurrency)

        async def guarded(position, u):
            # Return the input position with the verdict so the caller can
            # restore input order regardless of completion order.
            async with sem:
                return position, await _check_one(session, u, timeout=timeout)

        tasks = [guarded(i, u) for i, u in enumerate(urls)]
        results = [False] * len(tasks)
        # as_completed yields in completion order (good for a live progress
        # bar), so each verdict is written back to its original slot rather
        # than appended.
        for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Checking URLs"):
            position, alive = await coro
            results[position] = alive
        return results

In [28]:
urls = df_train['URL'].fillna("").astype(str).tolist()
alive_mask = await filter_alive_async(urls, concurrency=200, timeout=8)

Checking URLs:   1%|          | 25659/4122119 [04:58<13:12:56, 86.10it/s] 


CancelledError: 

In [16]:
# Wrap the liveness flags in a Series aligned to df_train's index (built once),
# then split the table into reachable and unreachable rows.
is_alive = pd.Series(alive_mask, index=df_train.index)
df_clean = df_train.loc[is_alive].reset_index(drop=True)
df_dead = df_train.loc[~is_alive].reset_index(drop=True)
print(f"kept {len(df_clean)} rows, removed {len(df_dead)} rows")

kept 3784539 rows, removed 337580 rows


In [17]:
# Load the polygon layer used below for the point-in-polygon filter.
gdf = gpd.read_file(filename='../resources/gadm_split_2.gpkg')

In [18]:
# Turn each row's LON/LAT pair into a point geometry. points_from_xy builds the
# whole geometry array in one vectorised call instead of one Python-level
# Point(...) per row.
pts = gpd.GeoDataFrame(
    df_clean,
    geometry=gpd.points_from_xy(df_clean['LON'], df_clean['LAT']),
    crs="EPSG:4326"
)

# Inner join keeps only points lying within some polygon; the result is
# indexed by the left (point) index.
joined = gpd.sjoin(pts, gdf[['geometry']], how='inner', predicate='within')

total = len(df_clean)
# A point that falls within several polygons appears several times in
# `joined`. Selecting by membership (rather than .loc[joined.index]) keeps each
# source row at most once and preserves the original row order.
inside_any_polygon = df_clean.index.isin(joined.index)
df_clean = df_clean.loc[inside_any_polygon].reset_index(drop=True)
kept = len(df_clean)
removed = total - kept
print(f"Rows kept inside polygons: {kept}; removed (outside polygons): {removed}")

Rows kept inside polygons: 3635097; removed (outside polygons): 149442


In [19]:
# Spot check: show the caption and image of one randomly chosen row whose
# Prob_indoor exceeds 0.95 (the rows the next cell removes).
indoor_rows = df_clean[df_clean['Prob_indoor'] > 0.95]

if indoor_rows.empty:
    # .sample(n=1) raises ValueError on an empty frame, e.g. when this cell is
    # re-run after the indoor rows have already been filtered out.
    print("No rows with Prob_indoor > 0.95 to preview.")
else:
    random_row = indoor_rows.sample(n=1).iloc[0]

    caption = random_row['caption']
    image_url = random_row['URL']

    print("Caption:", caption)

    # Image/display come from the notebook's top-level import cell.
    display(Image(url=image_url))

Caption: A geo-tagged image taken in Shinagawa, Japan. This image was taken indoors. This image was taken in a urban setting. The climate is Temperate, no dry season, hot summer.


In [20]:
# Keep only rows whose Prob_indoor is strictly below 0.95.
# NOTE(review): the preview cell selects `> 0.95`, so rows equal to exactly
# 0.95 (and rows with a missing Prob_indoor, which compare False) are dropped
# here as well — confirm that is intended.
below_indoor_cutoff = df_clean['Prob_indoor'].lt(0.95)
df_clean = df_clean.loc[below_indoor_cutoff].reset_index(drop=True)

In [21]:
# Write the filtered table to disk, omitting the positional index column.
df_clean.to_csv(path_or_buf='../resources/mp16_combined_clean.csv', index=False)