<a href="https://colab.research.google.com/github/anton-akulenko/DA-test/blob/main/images_level1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install validators

In [2]:
import re
import aiohttp
import asyncio
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import validators

In [19]:
from google.colab import drive

# Mount Google Drive; the Excel reports produced below are written under it.
drive.mount('/content/drive')
# Output directory on the mounted Drive. Keep the trailing slash: file names
# are appended to this value by plain string concatenation.
all_data_folder = "/content/drive/MyDrive/tests_colab/DA/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def convert_google_sheet_url(url):
    """Rewrite a Google Sheets browser URL into its CSV export URL.

    Args:
        url: text containing a ``https://docs.google.com/spreadsheets/d/<id>``
            link, optionally followed by ``/edit...``.

    Returns:
        ``url`` with the link replaced by
        ``https://docs.google.com/spreadsheets/d/<id>/export?[gid=<gid>&]format=csv``.
        The ``gid`` parameter is included only when the link names a sheet.
        Text that does not match the pattern is returned unchanged.
    """
    # Group 1 is the document id; group 2 is everything from "/edit" to the
    # end of the line (query string and fragment included).
    pattern = r'https://docs\.google\.com/spreadsheets/d/([a-zA-Z0-9_-]+)(/edit.*)?'
    # The sheet id can appear in the fragment ("/edit#gid=1"), in the query
    # string ("/edit?gid=1#gid=1"), or after other parameters ("...&gid=1").
    gid_pattern = r'[#?&]gid=(\d+)'

    def replacement(m):
        tail = m.group(2) or ''
        gid_match = re.search(gid_pattern, tail)
        gid_part = f'gid={gid_match.group(1)}&' if gid_match else ''
        return f'https://docs.google.com/spreadsheets/d/{m.group(1)}/export?{gid_part}format=csv'

    new_url = re.sub(pattern, replacement, url)
    return new_url

In [6]:
# Browser ("edit") link of the source spreadsheet; the gid in the fragment
# selects which sheet of the document is used.
url = 'https://docs.google.com/spreadsheets/d/1QX2IhFyYmGDFMvovw2WFz3wAT4piAZ_8hi5Lzp7LjV0/edit#gid=1902149593'
# Same sheet expressed as a CSV export link that pandas can read directly.
new_url = convert_google_sheet_url(url)

In [7]:
# Download the sheet through its CSV export link and parse it into a DataFrame.
df = pd.read_csv(new_url)

In [8]:
# df1 is the frame the download step reads its URLs from. It is bound to the
# full sheet here; note this is an alias of df, not a copy.
# For a quick dry run on a subset, bind a slice instead, e.g. df.loc[:2000].
df1 = df

In [15]:
# Sanity check: (rows, columns) of the frame that will be processed.
df1.shape

(46888, 2)

In [21]:
# Seconds that main() passes to get_image_size() as its `delay` argument.
DELAY = 0.0
# Number of URLs main() puts in each batch handed to get_image_size().
BATCH_SIZE = 200

async def fetch(session, url):
    """Return the raw response body for ``url``.

    Args:
        session: object whose ``get(url)`` is used as an async context manager
            yielding a response with an awaitable ``read()``.
        url: address to download.

    Returns:
        The value of ``await response.read()`` when ``validators.url(url)`` is
        truthy; otherwise the literal string ``"url not valid"`` and no
        request is made. The HTTP status is not inspected.
    """
    if validators.url(url):
        async with session.get(url) as resp:
            payload = await resp.read()
            return payload
    return "url not valid"

async def get_image_size(session, urls, delay=0):
    """Download a batch of images concurrently and report their pixel sizes.

    Args:
        session: passed through to ``fetch`` for every URL (``main`` supplies
            an ``aiohttp.ClientSession``).
        urls: URLs to probe. ``sizes`` gets exactly one entry per URL, in the
            same order.
        delay: seconds passed to ``asyncio.sleep`` once per URL while the
            results are being processed. All downloads of the batch have
            already finished at that point, so this only spaces out
            successive batches; it does not throttle individual requests.

    Returns:
        ``(sizes, error_links)``. ``sizes[i]`` is ``"<width>x<height>"`` when
        ``urls[i]`` could be decoded as an image, otherwise the exception
        describing the failure. ``error_links`` holds one ``[url, exception]``
        pair per failed URL.
    """
    tasks = [fetch(session, url) for url in urls]
    # return_exceptions=True keeps a single failed download (timeout, DNS
    # error, dropped connection, ...) from aborting the whole batch: the
    # exception is delivered in place of that URL's payload instead.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    sizes = []
    error_links = []
    for url, data in zip(urls, results):
        error = None
        if isinstance(data, BaseException):
            # The download itself failed.
            error = data
        elif isinstance(data, str):
            # fetch() reports a rejected URL as a plain-text message rather
            # than bytes; keep that message as the error description.
            error = ValueError(data)
        else:
            try:
                # The context manager releases the image's resources as soon
                # as the dimensions have been read.
                with Image.open(BytesIO(data)) as image:
                    width, height = image.size
            except Exception as e:
                error = e
        if error is None:
            sizes.append(f"{width}x{height}")
        else:
            error_links.append([url, error])
            sizes.append(error)
        await asyncio.sleep(delay)
    return sizes, error_links

async def main():
    """Measure every image listed in ``df1['image_url']`` and export the results.

    Reads the notebook globals ``df1``, ``BATCH_SIZE``, ``DELAY`` and
    ``all_data_folder``. The URLs are processed batch by batch through
    ``get_image_size`` over one shared ``aiohttp.ClientSession``. Two Excel
    files are then written into ``all_data_folder``:

    * ``all_images_sizes.xlsx`` - one row per input URL with its size (or the
      error recorded for it);
    * ``list_of_errors.xlsx`` - one row per failed URL with the error.
    """
    image_urls = df1['image_url'].tolist()
    step = BATCH_SIZE

    # Split the URL list into consecutive chunks of at most `step` items.
    image_urls_batches = []
    for start in range(0, len(image_urls), step):
        image_urls_batches.append(image_urls[start:start + step])

    all_sizes, total_errors = [], []
    async with aiohttp.ClientSession() as session:
        progress = tqdm(image_urls_batches, desc="Processing batches", ncols=100)
        for chunk in progress:
            chunk_sizes, chunk_errors = await get_image_size(session, chunk, delay=DELAY)
            all_sizes += chunk_sizes
            total_errors += chunk_errors

        # Build both report frames before anything is written to disk.
        df_result = pd.DataFrame({'URL': image_urls, 'SIZE (pixels)': all_sizes})
        df_err = pd.DataFrame({
            'original URL': [str(bad_url) for bad_url, _ in total_errors],
            "Error descr": [reason for _, reason in total_errors],
        })

        df_result.to_excel(all_data_folder + "all_images_sizes.xlsx", index=False)
        print("DataFrame saved to all_images_sizes.xlsx")

        df_err.to_excel(all_data_folder + "list_of_errors.xlsx", index=False)
        print("Errors saved to list_of_errors.xlsx")
        print(f"Total error urls: {len(total_errors)}")

# Top-level await: relies on the notebook kernel providing a running event loop.
await main()

Processing batches: 100%|█████████████████████████████████████████| 235/235 [02:16<00:00,  1.72it/s]


DataFrame saved to all_images_sizes.xlsx
Errors saved to list_of_errors.xlsx
Total error urls: 998


**Clean DF**

In [23]:
# Reload both reports from Drive so this cell can be run without repeating
# the download step.
img_df = pd.read_excel(all_data_folder + "all_images_sizes.xlsx")
error_df = pd.read_excel(all_data_folder + "list_of_errors.xlsx")

# Keep only the rows whose URL never appears in the error report.
error_urls = error_df['original URL'].tolist()
is_failed = img_df['URL'].isin(error_urls)
cleaned_df = img_df.loc[~is_failed]

cleaned_df.to_excel(all_data_folder + "CLEAN_images_sizes_no_errors.xlsx", index=False)
