In [40]:
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
import asyncio
import aiohttp
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

#### Read data into DataFrame

In [2]:
# CSV export (gviz endpoint) of the Google Sheet that holds the image URLs.
url = "https://docs.google.com/spreadsheets/d/1PkqfLtDlHR9URTgr26fy53pCKFJxXl3dya3hiyirPPU/gviz/tq?tqx=out:csv"

# Row 0 of the export supplies the column names.
url_img = pd.read_csv(url, header=0)


In [3]:
# Quick sanity check of the load: row count, column dtypes and non-null counts.
url_img.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46888 entries, 0 to 46887
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   image_url  46868 non-null  object 
 1   SIZE       0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 732.8+ KB


#### Some preparations

In [4]:
# Base address that every relative image path is appended to when requesting.
PRE_URL = "https://data.sanitino.eu/"

# Keep only what follows the third '/' of each URL, i.e. the part after the
# scheme and host. Every row is converted with str(), so a missing URL (NaN)
# becomes the literal string 'nan'; this keeps the list aligned row-for-row
# with url_img.
adress_list = [
    str(raw_url).split('/', 3)[-1]
    for raw_url in url_img['image_url']
]

#### Helper function for testing the connection with a single URL

In [5]:
def get_size_test(url):
    """Fetch one image synchronously and report its dimensions.

    Diagnostic helper for checking that a single URL can be reached and
    decoded.

    Args:
        url: Absolute URL of the image to download.

    Returns:
        A string of the form 'WIDTHxHEIGHT' (e.g. '1080x1614'), or ``np.nan``
        when the response body cannot be opened as an image.

    Raises:
        requests.RequestException: If the HTTP request itself fails or times
            out. This is deliberately not swallowed, so connection problems
            stay visible when testing.
    """
    # Imported locally: only this helper needs `requests`, and the notebook's
    # import cell does not provide it.
    import requests

    headers = {
        "accept-encoding": "gzip, deflate, br",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }
    # Send the browser-like headers (previously built but never passed) and
    # bound the wait so a stalled server cannot hang the cell forever.
    response = requests.get(url=url, headers=headers, timeout=30)
    try:
        # The context manager closes the image once its size has been read.
        with Image.open(BytesIO(response.content)) as img:
            width, height = img.size
    except Exception:
        # Best-effort decoding: any image error yields NaN. Catching
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate.
        return np.nan
    return str(width) + 'x' + str(height)

#### Main async functions and the `size_arr` list that collects the results

In [25]:
# Module-level accumulator; gather_req() appends one entry per requested URL.
size_arr = []

async def get_size(cont):
    """Return the pixel dimensions of an encoded image as 'WIDTHxHEIGHT'.

    Declared ``async`` so callers can ``await`` it; the body contains no
    ``await`` and therefore runs synchronously.

    Args:
        cont: Raw bytes of an image file (e.g. an HTTP response body).

    Returns:
        A string such as '1080x1614', or ``np.nan`` when the bytes cannot be
        opened as an image.
    """
    try:
        # The context manager closes the image once its size has been read.
        with Image.open(BytesIO(cont)) as img:
            width, height = img.size
    except Exception:
        # Best-effort: any decoding problem yields NaN. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return np.nan
    return str(width) + 'x' + str(height)
    
    
    
async def get_req(url, session, semaphore):
    """Download one URL under a concurrency limit and return its image size.

    Args:
        url: Absolute URL to request.
        session: ``aiohttp.ClientSession`` used to issue the GET request.
        semaphore: ``asyncio.Semaphore`` bounding the number of requests that
            run at the same time.

    Returns:
        Whatever ``get_size`` returns for the response body (a
        'WIDTHxHEIGHT' string or ``np.nan``), or the integer ``404`` when the
        request fails at the client/transport level. ``404`` here is a
        sentinel chosen by this notebook, not a status code read from a
        response.
    """
    # ``async with`` guarantees the permit is returned on every exit path;
    # a manual acquire()/release() pair leaks the permit when an unexpected
    # exception is raised in between.
    async with semaphore:
        try:
            # Using the request as a context manager releases the connection
            # even if reading the body fails part-way.
            async with session.request(method="GET", url=url, allow_redirects=True) as resp:
                cont = await resp.read()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # ClientError is the base class of ClientConnectorError, so the
            # original case is still covered. Disconnects, payload errors and
            # timeouts now also map to the sentinel instead of aborting the
            # whole asyncio.gather() batch.
            return 404
        return await get_size(cont)

async def gather_req(urls):
    """Request every path in ``urls`` concurrently and record the outcomes.

    Each path is prefixed with ``PRE_URL`` and fetched through ``get_req``
    using one shared session and a semaphore of 100 permits.

    Args:
        urls: Iterable of paths relative to ``PRE_URL``.

    Returns:
        None. The results are appended to the module-level ``size_arr`` in
        the same order as ``urls`` (``asyncio.gather`` preserves input order).

    Note:
        Results are appended, never replaced: calling this twice leaves two
        batches in ``size_arr``.
    """
    request_headers = {
        "accept-encoding": "gzip, deflate, br",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }
    limiter = asyncio.Semaphore(100)
    # NOTE(review): ssl=False turns off TLS certificate verification in
    # aiohttp; confirm this is intended for this host.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(headers=request_headers, connector=connector) as session:
        pending = [
            get_req(url=PRE_URL + path, session=session, semaphore=limiter)
            for path in urls
        ]
        outcomes = await asyncio.gather(*pending)

    size_arr.extend(outcomes)

In [26]:
await gather_req(adress_list)

In [27]:
# Write the collected results to 'size_data.npy' in the working directory.
np.save(file='size_data.npy', arr=size_arr)

In [28]:
# Store the collected results in the SIZE column. size_arr must contain
# exactly one entry per row of url_img, in row order.
url_img['SIZE'] = size_arr

In [29]:
# Display the frame to eyeball the filled-in SIZE column.
url_img

Unnamed: 0,image_url,SIZE
0,https://data.sanitino.eu/PRODUCT-33916/8607663...,1080x1614
1,https://data.sanitino.eu/PRODUCT-62434/f7aa3c2...,1080x1080
2,https://data.sanitino.eu/PRODUCT-33915/8607663...,1080x1614
3,https://data.sanitino.eu/PRODUCT-62426/506d67b...,1080x1080
4,https://data.sanitino.eu/PRODUCT-33893/ccd99b4...,1080x1614
...,...,...
46883,https://data.sanitino.eu/PRODUCT-113844/9786df...,2301x1080
46884,https://data.sanitino.eu/PRODUCT-20240/4a457da...,1621x1080
46885,https://data.sanitino.eu/PRODUCT-47765/58a1a3d...,1080x1218
46886,https://data.sanitino.eu/PRODUCT-56372/764c369...,1080x1080


#### Write the resulting DataFrame to the 'result_script' worksheet

In [44]:
# OAuth scopes requested for the service account.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# The JSON key file is given as a relative path, so it is resolved against
# the current working directory.
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'project-test-newage-375007fc5920.json',
    scope,
)

# Authorized gspread client built from the same credentials.
gc = gspread.authorize(credentials)

In [45]:
# Target spreadsheet (the same ID that appears in the source CSV URL) and the
# worksheet that receives the results.
spreadsheet_key = '1PkqfLtDlHR9URTgr26fy53pCKFJxXl3dya3hiyirPPU'
wks_name = 'result_script'

# row_names=False: do not upload the DataFrame index as an extra column.
d2g.upload(
    url_img,
    spreadsheet_key,
    wks_name,
    credentials=credentials,
    row_names=False,
)

<Worksheet 'result_script' id:1724244167>