# This notebook collects the Nepal landmark images

In [None]:
import pandas as pd

# Step 1: read the landmark IDs that were previously filtered down to Nepal
nepal_ids_df = pd.read_csv('nepal_monuments.csv')
nepal_list = nepal_ids_df['landmark_id'].tolist()

# Step 2: read the main training index, restricted to the columns we need
# (limiting the columns with `usecols` keeps memory usage down)
wanted_columns = ['id', 'url', 'landmark_id']
df_train = pd.read_csv('../datasets/train.csv', usecols=wanted_columns)

# Step 3: keep only the rows whose landmark is in the Nepal list
is_nepal = df_train['landmark_id'].isin(nepal_list)
nepal_images = df_train[is_nepal]

# Step 4: write the download manifest that the download cells read back
nepal_images.to_csv('nepal_images_to_download.csv', index=False)
image_count = len(nepal_images)
print(f"Ready to download {image_count} images!")

Ready to download 2774 images!


In [None]:
import os
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # This adds a progress bar

# Read back the manifest of images to fetch
manifest_path = 'nepal_images_to_download.csv'
df = pd.read_csv(manifest_path)

# One shared Session is used for every request (intended to make downloading
# faster and more reliable); it sends a browser-like User-Agent header.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
session = requests.Session()
session.headers.update(browser_headers)

def download_image(row):
    """Download one image to ``nepal_dataset/<landmark_id>/<id>.jpg``.

    Parameters
    ----------
    row : mapping
        Must provide ``'id'``, ``'url'`` and ``'landmark_id'`` entries
        (for example one row of the download manifest DataFrame).

    Returns
    -------
    None
        The result is the file on disk. A file that already exists is left
        untouched, so the cell can be re-run to resume a partial download.

    Notes
    -----
    Failures never raise: a non-200 response or any exception is reported
    through ``tqdm.write`` (which prints without corrupting the progress
    bar) and the image is skipped.
    """
    image_id = row['id']
    url = row['url']
    l_id = str(row['landmark_id'])

    folder_path = os.path.join('nepal_dataset', l_id)
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, f"{image_id}.jpg")

    # Already downloaded on a previous run
    if os.path.exists(file_path):
        return

    # Write to a sibling temp file and rename only once it is complete, so an
    # interrupted write can never leave a truncated .jpg that the existence
    # check above would then skip forever.
    partial_path = file_path + '.part'

    try:
        # We add a timeout and allow redirects
        response = session.get(url, timeout=15, allow_redirects=True)
        if response.status_code == 200:
            with open(partial_path, 'wb') as f:
                f.write(response.content)
            os.replace(partial_path, file_path)
        else:
            # This will help to see WHY it's failing
            tqdm.write(f" Failed: {image_id} (Status: {response.status_code})")
    except Exception as e:
        # Stay best-effort (one bad URL must not abort the batch), but say
        # what went wrong instead of hiding timeouts and connection errors.
        tqdm.write(f" Failed: {image_id} ({type(e).__name__}: {e})")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # no temp file was created, nothing to clean up

# Fan the downloads out over a thread pool and track them with a progress bar
rows = [row for _, row in df.iterrows()]
with ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(download_image, rows)
    progress = tqdm(results, total=len(df), desc="Downloading Nepal Monuments")
    # Draining the iterator is what advances the bar, one tick per finished row
    for _result in progress:
        pass

print("\nDownload process finished! Check your 'nepal_dataset' folder now.")

Downloading Nepal Monuments:   4%|▍         | 112/2774 [02:40<33:25,  1.33it/s] 

 Failed: 58a868a053ed29aa (Status: 404)


Downloading Nepal Monuments:   8%|▊         | 219/2774 [03:38<31:18,  1.36it/s]

 Failed: 23548a412c816eef (Status: 404)


Downloading Nepal Monuments:  10%|█         | 281/2774 [04:22<21:05,  1.97it/s]

 Failed: aa2d6d823cb40bff (Status: 404)


Downloading Nepal Monuments:  16%|█▌        | 436/2774 [04:58<08:25,  4.62it/s]

 Failed: 0e811d85e89162cf (Status: 404)


Downloading Nepal Monuments:  22%|██▏       | 606/2774 [05:33<08:06,  4.46it/s]

 Failed: 9ee29dc7a4b62943 (Status: 404)


Downloading Nepal Monuments:  23%|██▎       | 638/2774 [05:38<07:26,  4.78it/s]

 Failed: 59dcf64dee82edb8 (Status: 404)


Downloading Nepal Monuments:  24%|██▍       | 659/2774 [05:43<06:59,  5.05it/s]

 Failed: 57ecaf41a8e1a4e6 (Status: 404)


Downloading Nepal Monuments:  27%|██▋       | 759/2774 [06:05<08:19,  4.03it/s]

 Failed: 9b633bee4769339f (Status: 404)


Downloading Nepal Monuments:  31%|███       | 849/2774 [06:25<07:08,  4.49it/s]

 Failed: e50a77ad5c853333 (Status: 404)


Downloading Nepal Monuments:  38%|███▊      | 1051/2774 [07:31<14:41,  1.95it/s]

 Failed: 024faa7962a5011e (Status: 404)


Downloading Nepal Monuments:  40%|████      | 1120/2774 [08:13<20:59,  1.31it/s]

 Failed: e5490d8304933ab3 (Status: 404)


Downloading Nepal Monuments:  48%|████▊     | 1320/2774 [11:02<2:10:21,  5.38s/it]

 Failed: 1436617c5ef185cb (Status: 404)


Downloading Nepal Monuments:  48%|████▊     | 1345/2774 [11:13<37:16,  1.56s/it]  

 Failed: ca1491480bcd23b1 (Status: 404)


Downloading Nepal Monuments:  58%|█████▊    | 1606/2774 [15:02<19:46,  1.02s/it]  

 Failed: 41568626ae768dc6 (Status: 404)


Downloading Nepal Monuments:  62%|██████▏   | 1710/2774 [16:03<17:28,  1.02it/s]

 Failed: 1f61b44c8b686bc0 (Status: 404)


Downloading Nepal Monuments:  65%|██████▍   | 1796/2774 [16:53<05:11,  3.14it/s]

 Failed: cedec7b7fcd4420e (Status: 404)


Downloading Nepal Monuments:  65%|██████▍   | 1802/2774 [16:56<05:10,  3.13it/s]

 Failed: 7de091fef909d139 (Status: 404)


Downloading Nepal Monuments:  70%|██████▉   | 1939/2774 [17:35<05:51,  2.37it/s]

 Failed: 130611d3ab538926 (Status: 404)


Downloading Nepal Monuments:  75%|███████▍  | 2067/2774 [18:13<02:47,  4.21it/s]

 Failed: 969dc6b68caf709d (Status: 404)


Downloading Nepal Monuments:  77%|███████▋  | 2123/2774 [18:24<05:28,  1.98it/s]

 Failed: c6dae8e80b91f46d (Status: 404)
 Failed: ae44e114ac2bc927 (Status: 404)


Downloading Nepal Monuments:  84%|████████▍ | 2334/2774 [19:41<01:01,  7.20it/s]

 Failed: 6f107103599ff646 (Status: 404)


Downloading Nepal Monuments:  85%|████████▌ | 2360/2774 [19:59<03:27,  1.99it/s]

 Failed: b345dab4b13a59a8 (Status: 404)


Downloading Nepal Monuments:  88%|████████▊ | 2430/2774 [20:01<01:03,  5.40it/s]

 Failed: e62068d89c6c3dbd (Status: 404)


Downloading Nepal Monuments: 100%|██████████| 2774/2774 [22:06<00:00,  2.09it/s]


Download process finished! Check your 'nepal_dataset' folder now.





In [None]:
import os
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Read back the manifest of images to download
manifest_path = 'nepal_images_to_download.csv'
df = pd.read_csv(manifest_path)

def download_image(row):
    """Download one image to ``nepal_dataset/<landmark_id>/<id>.jpg``.

    Parameters
    ----------
    row : mapping
        Must provide ``'id'``, ``'url'`` and ``'landmark_id'`` entries
        (for example one row of the download manifest DataFrame).

    Returns
    -------
    None
        The result is the file on disk. A file that already exists is left
        untouched, so the cell can be re-run to resume a partial download.

    Notes
    -----
    Failures never raise: a non-200 response or any exception is printed
    and the image is skipped.
    """
    image_id = row['id']
    url = row['url']
    l_id = str(row['landmark_id'])

    # Create a folder for each landmark. exist_ok=True already covers the
    # "folder is there" case (including another worker thread creating it
    # first), so no separate existence check is needed.
    folder_path = os.path.join('nepal_dataset', l_id)
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, f"{image_id}.jpg")

    # Skip if already downloaded
    if os.path.exists(file_path):
        return

    # Write to a sibling temp file and rename only once it is complete, so an
    # interrupted write can never leave a truncated .jpg that the skip check
    # above would then treat as finished.
    partial_path = file_path + '.part'

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            with open(partial_path, 'wb') as f:
                f.write(response.content)
            os.replace(partial_path, file_path)
        else:
            # Report dead links (404 etc.) instead of dropping them silently
            print(f"Failed downloading {image_id}: HTTP status {response.status_code}")
    except Exception as e:
        print(f"Error downloading {image_id}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # no temp file was created, nothing to clean up

# ThreadPool to speed up the process (adjust max_workers based on your internet)
with ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved from the returned iterator. Drain it so an unexpected failure
    # (e.g. a missing column in the manifest) surfaces here instead of being
    # lost while "Download complete!" is still printed.
    list(executor.map(download_image, [row for _, row in df.iterrows()]))

print("Download complete!")