In [26]:
import geopandas as gpd
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from shapely.geometry import Point

In [4]:
# Load the combined MP16 metadata table (one row per image).
train_csv_path = '../resources/mp16_combined.csv'
df_train = pd.read_csv(train_csv_path)

In [5]:
# Preview the loaded table; the notebook renders a truncated head/tail view.
df_train

Unnamed: 0,IMG_ID,AUTHOR,LAT,LON,S3_Label,S16_Label,S365_Label,Prob_indoor,Prob_natural,Prob_urban,neighbourhood,city,county,state,region,country,country_code,continent,URL,caption
0,92_17_5276763594.jpg,42441750@N03,38.685568,-109.532951,1.0,7.0,289.0,1.739840e-04,0.897409,0.102417,,,Grand County,Utah,,United States,us,,http://farm6.staticflickr.com/5042/5276763594_...,"A geo-tagged image taken in Utah, United State..."
1,0d_ce_6392770405.jpg,68149505@N00,34.933793,103.692741,0.0,1.0,122.0,9.968868e-01,0.000578,0.002535,,Lianlu,Kangle County,Gansu,Linxia,China,cn,,http://farm8.staticflickr.com/7172/6392770405_...,"A geo-tagged image taken in Lianlu, Gansu, Chi..."
2,2a_88_5268406683.jpg,84867026@N00,39.983433,-75.243301,0.0,0.0,128.0,7.201538e-01,0.034871,0.244975,Overbrook,Philadelphia,Philadelphia County,Pennsylvania,,United States,us,,http://farm6.staticflickr.com/5045/5268406683_...,"A geo-tagged image taken in Philadelphia, Penn..."
3,82_be_2515710583.jpg,75292316@N00,39.306094,-84.379291,1.0,6.0,145.0,9.050690e-05,0.516982,0.482927,,,Butler County,Ohio,,United States,us,,http://farm3.staticflickr.com/2389/2515710583_...,"A geo-tagged image taken in Ohio, United State..."
4,03_05_9498368699.jpg,61068860@N00,9.186625,123.581597,1.0,8.0,36.0,9.902391e-07,0.999983,0.000016,,Siquijor,,Siquijor,Central Visayas,Philippines,ph,,http://farm4.staticflickr.com/3800/9498368699_...,"A geo-tagged image taken in Siquijor, Siquijor..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4122114,62_ce_6176671469.jpg,13836188@N04,36.960262,-76.328484,2.0,10.0,207.0,3.058661e-04,0.000160,0.999534,,Norfolk,,Virginia,,United States,us,,http://farm7.staticflickr.com/6160/6176671469_...,"A geo-tagged image taken in Norfolk, Virginia,..."
4122115,d3_6f_8552562574.jpg,55715146@N08,36.097019,-80.243766,0.0,5.0,14.0,9.901024e-01,0.000460,0.009438,,Winston-Salem,Forsyth County,North Carolina,,United States,us,,http://farm9.staticflickr.com/8093/8552562574_...,"A geo-tagged image taken in Winston-Salem, Nor..."
4122116,2b_22_279705584.jpg,38324365@N00,40.256374,-111.665554,0.0,2.0,303.0,9.494914e-01,0.022948,0.027560,Carterville,Provo,Utah County,Utah,,United States,us,,http://farm1.staticflickr.com/85/279705584_500...,"A geo-tagged image taken in Provo, Utah, Unite..."
4122117,d9_dd_4097000875.jpg,92292245@N00,32.134131,-101.789231,2.0,11.0,86.0,2.011069e-02,0.105259,0.874630,,Stanton,Martin County,Texas,,United States,us,,http://farm3.staticflickr.com/2715/4097000875_...,"A geo-tagged image taken in Stanton, Texas, Un..."


In [12]:
def _get_is_ok(url, session, timeout):
    """Issue a streamed GET and report whether its status is in 200-399."""
    r = session.get(url, stream=True, timeout=timeout)
    try:
        # stream=True avoids downloading the body; only the status is needed.
        return 200 <= r.status_code < 400
    finally:
        r.close()


def is_url_alive(url, session, timeout=8):
    """Return True if ``url`` answers with a 2xx/3xx status.

    A HEAD request is tried first. If it fails for any reason, or answers
    with a status outside 200-399 (some servers reject HEAD), a single
    streamed GET is used as the fallback. HEAD and GET apply the same
    success range, so the verdict does not depend on which verb answered.

    Parameters
    ----------
    url : object
        Candidate URL. Anything that is not a non-blank string is
        reported as dead without touching the network.
    session : object
        Object exposing ``head`` and ``get`` in the style of
        ``requests.Session``.
    timeout : float, default 8
        Timeout forwarded to each request.

    Returns
    -------
    bool
        True when the URL is reachable, False otherwise. Never raises for
        request failures: this is a best-effort probe.
    """
    if not isinstance(url, str) or not url.strip():
        return False

    try:
        head = session.head(url, allow_redirects=True, timeout=timeout)
        if 200 <= head.status_code < 400:
            return True
    except Exception:
        # HEAD failed outright; fall through to the GET attempt below.
        pass

    try:
        return _get_is_ok(url, session, timeout)
    except Exception:
        # Deliberate best-effort: any failure of the fallback means "dead".
        return False

In [13]:
def filter_alive(df, url_col='URL', max_workers=32):
    """Probe every URL in ``df[url_col]`` concurrently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the URLs to check.
    url_col : str, default 'URL'
        Name of the column containing the URLs. Missing values are turned
        into empty strings, which ``is_url_alive`` reports as dead.
    max_workers : int, default 32
        Number of worker threads, also used as the size of the HTTP
        connection pool.

    Returns
    -------
    list of bool
        One flag per row, in the positional order of ``df``; True when the
        URL responded successfully.
    """
    urls = df[url_col].fillna("").astype(str).tolist()
    alive_mask = [False] * len(urls)
    with requests.Session() as session:
        # The default adapter pools only 10 connections; with more worker
        # threads than that, surplus connections are discarded instead of
        # being reused. Size the pool to match the thread count.
        adapter = requests.adapters.HTTPAdapter(
            pool_connections=max_workers,
            pool_maxsize=max_workers,
        )
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        with ThreadPoolExecutor(max_workers=max_workers) as exe:
            # Map each future back to its row position so results can be
            # written in place regardless of completion order.
            futures = {exe.submit(is_url_alive, url, session): i for i, url in enumerate(urls)}
            for fut in tqdm(as_completed(futures), total=len(futures), desc="Checking URLs"):
                i = futures[fut]
                try:
                    alive_mask[i] = fut.result()
                except Exception:
                    # Best-effort: an unexpected worker failure counts as dead.
                    alive_mask[i] = False
    return alive_mask

In [14]:
# Check every URL of the training table using 24 worker threads; the result
# is a list of booleans in the row order of df_train.
alive_mask = filter_alive(df_train, url_col='URL', max_workers=24)

Checking URLs:   0%|          | 0/4122119 [00:00<?, ?it/s]

In [18]:
# Align the per-row flags with df_train's index once, then split the table
# into reachable and unreachable rows.
is_alive = pd.Series(alive_mask, index=df_train.index)
df_clean = df_train[is_alive].reset_index(drop=True)
df_dead = df_train[~is_alive].reset_index(drop=True)
print(f"kept {len(df_clean)} rows, removed {len(df_dead)} rows")

kept 3800984 rows, removed 321135 rows


In [25]:
# Load the polygon layer used below to keep only points inside a polygon.
gadm_path = '../resources/gadm_split_2.gpkg'
gdf = gpd.read_file(gadm_path)

In [29]:
# Attach a point geometry (built from LON/LAT, declared as EPSG:4326) to
# every remaining row. points_from_xy builds the geometries in one
# vectorized call instead of a Python-level loop over every row.
# NOTE(review): assumes gdf is also in EPSG:4326 - confirm.
pts = gpd.GeoDataFrame(
    df_clean,
    geometry=gpd.points_from_xy(df_clean['LON'], df_clean['LAT']),
    crs="EPSG:4326"
)

# Inner join: only points lying within some polygon of gdf survive.
joined = gpd.sjoin(pts, gdf[['geometry']], how='inner', predicate='within')

total = len(df_clean)
# A point that falls within several polygons appears once per match in
# `joined`; selecting by membership keeps each original row at most once
# and preserves the original row order.
inside_mask = df_clean.index.isin(joined.index)
df_clean = df_clean.loc[inside_mask].reset_index(drop=True)
kept = len(df_clean)
removed = total - kept
print(f"Rows kept inside polygons: {kept}; removed (outside polygons): {removed}")

Rows kept inside polygons: 3651294; removed (outside polygons): 149690


In [55]:
from IPython.display import Image, display

# Spot-check: draw one random row whose indoor probability exceeds 0.95,
# then show its caption and the image behind its URL.
indoor_rows = df_clean[df_clean['Prob_indoor'] > 0.95]
random_row = indoor_rows.sample(n=1).iloc[0]

caption, image_url = random_row['caption'], random_row['URL']

print("Caption:", caption)
display(Image(url=image_url))

Caption: A geo-tagged image taken in Ribble Valley, England, United Kingdom. This image was taken indoors. This image was taken in a urban setting. The climate is Temperate, no dry season, warm summer.


In [None]:
# Keep only rows whose indoor probability is below 0.95.
# NOTE(review): comparisons against NaN are False, so rows with a missing
# Prob_indoor are dropped too - confirm that is intended.
below_indoor_cutoff = df_clean['Prob_indoor'].lt(0.95)
df_clean = df_clean.loc[below_indoor_cutoff].reset_index(drop=True)

In [54]:
# Persist the cleaned table without writing the index as a column.
clean_csv_path = '../resources/mp16_combined_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)