In [17]:
!pip install asyncio_pool

Collecting asyncio_pool
  Downloading asyncio_pool-0.6.0-py3-none-any.whl (8.5 kB)
Installing collected packages: asyncio-pool
Successfully installed asyncio-pool-0.6.0


In [5]:
!pip install aiofiles

Collecting aiofiles
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-23.1.0


In [3]:
!pip install asyncio_pool



In [4]:
import requests
import os
from pathlib import Path
import tqdm


import asyncio
import aiohttp
import aiofiles
from asyncio_pool import AioPool
from tqdm.asyncio import tqdm_asyncio

In [5]:
class SPBU_scrapper():
    WORKS_LIMIT = 2000
        
    def check_downloaded(self):
        self.works_to_download = [f"work_{i}.pdf" for i in range(1, self.WORKS_LIMIT)]        
        
        for path in os.listdir(f"{self.data_dir}/spbu/pdf"):
            if os.path.isfile(os.path.join(f"{self.data_dir}/spbu/pdf", path)):
                self.works_to_download.remove(path)

    def __init__(self, data_dir="data"):
        self.data_dir = data_dir
        path = Path(f"{data_dir}/spbu/pdf").mkdir(parents=True, exist_ok=True)
        
        self.check_downloaded()
        
        print(f"Found {self.WORKS_LIMIT - len(self.works_to_download)} pdfs")
        print()
    
    async def parse(self, is_async=True, pool_size=20):

        error_works = []

        async def fetch_file(work):
            work_id = work[5:-4]
            url = f"https://se.math.spbu.ru/thesis_download?thesis_id={work_id}"
            
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    if resp.status == 200:
                        data = await resp.read()
                        print(f"Gotcha {work_id}")
                        async with aiofiles.open(
                            os.path.join(self.data_dir, 'spbu/pdf', work), "wb"
                        ) as outfile:
                            await outfile.write(data)
        
        if is_async:
            pool = AioPool(size=pool_size)
            await pool.map(fetch_file, self.works_to_download)
        else:
            for work in tqdm.tqdm(self.works_to_download):
                await fetch_file(work)
            
        self.check_downloaded()

In [7]:
scrapper = SPBU_scrapper()
await scrapper.parse()

Found 278 pdfs

Gotcha 4
Gotcha 21
Gotcha 12
Gotcha 13
Gotcha 18
Gotcha 35
Gotcha 46
Gotcha 38
Gotcha 44
Gotcha 47
Gotcha 69
Gotcha 74
Gotcha 70
Gotcha 59
Gotcha 98
Gotcha 117
Gotcha 97
Gotcha 99
Gotcha 149
Gotcha 153
Gotcha 155
Gotcha 192
Gotcha 175
Gotcha 151
Gotcha 152
Gotcha 184
Gotcha 241
Gotcha 238
Gotcha 248
Gotcha 239
Gotcha 250
Gotcha 234
Gotcha 199
Gotcha 244
Gotcha 242
Gotcha 223
Gotcha 260
Gotcha 271
Gotcha 281
Gotcha 284
Gotcha 286
Gotcha 294
Gotcha 323
Gotcha 328
Gotcha 332
Gotcha 331
Gotcha 333
Gotcha 326
Gotcha 349
Gotcha 352
Gotcha 357
Gotcha 379
Gotcha 364
Gotcha 375
Gotcha 378
Gotcha 354
Gotcha 396
Gotcha 403
Gotcha 409
Gotcha 406
Gotcha 380
Gotcha 412
Gotcha 436
Gotcha 433
Gotcha 435
Gotcha 438
Gotcha 466
Gotcha 475
Gotcha 473
Gotcha 469
Gotcha 487
Gotcha 494
Gotcha 480
Gotcha 557
Gotcha 567
Gotcha 564
Gotcha 589
Gotcha 588
Gotcha 592
Gotcha 586
Gotcha 597
Gotcha 663
Gotcha 692
Gotcha 689
Gotcha 683
Gotcha 703
Gotcha 687
Gotcha 716
Gotcha 713
Gotcha 711
Gotcha 705
G