In [17]:
!pip install asyncio_pool

Collecting asyncio_pool
  Downloading asyncio_pool-0.6.0-py3-none-any.whl (8.5 kB)
Installing collected packages: asyncio-pool
Successfully installed asyncio-pool-0.6.0


In [5]:
!pip install aiofiles

Collecting aiofiles
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-23.1.0


In [1]:
import requests
import os
from pathlib import Path
import tqdm


import asyncio
import aiohttp
import aiofiles
from asyncio_pool import AioPool
from tqdm.asyncio import tqdm_asyncio

In [2]:
class SPBU_scrapper():
    WORKS_LIMIT = 2000
        
    def check_downloaded(self):
        self.works_to_download = [f"work_{i}.pdf" for i in range(1, self.WORKS_LIMIT)]        
        
        for path in os.listdir(f"{self.data_dir}/spbu/pdf"):
            if os.path.isfile(os.path.join(f"{self.data_dir}/spbu/pdf", path)):
                self.works_to_download.remove(path)

    def __init__(self, data_dir="data"):
        self.data_dir = data_dir
        path = Path(f"{data_dir}/spbu/pdf").mkdir(parents=True, exist_ok=True)
        
        self.check_downloaded()
        
        print(f"Found {self.WORKS_LIMIT - len(self.works_to_download)} pdfs")
        print()
    
    async def parse(self, is_async=True, pool_size=20):

        error_works = []

        async def fetch_file(work):
            work_id = work[5:-4]
            url = f"https://se.math.spbu.ru/thesis_download?thesis_id={work_id}"
            
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    if resp.status == 200:
                        data = await resp.read()
                        print(f"Gotcha {work_id}")
                        async with aiofiles.open(
                            os.path.join(self.data_dir, 'spbu/pdf', work), "wb"
                        ) as outfile:
                            await outfile.write(data)
        
        if is_async:
            pool = AioPool(size=pool_size)
            await pool.map(fetch_file, self.works_to_download)
        else:
            for work in tqdm.tqdm(self.works_to_download):
                await fetch_file(work)
            
        self.check_downloaded()

In [3]:
scrapper = SPBU_scrapper()
await scrapper.parse()

Found 129 pdfs

Gotcha 24
Gotcha 30
Gotcha 27
Gotcha 47
Gotcha 35
Gotcha 48
Gotcha 51
Gotcha 52
Gotcha 34
Gotcha 53
Gotcha 69
Gotcha 57
Gotcha 68
Gotcha 63
Gotcha 85
Gotcha 65
Gotcha 83
Gotcha 88
Gotcha 91
Gotcha 110
Gotcha 108
Gotcha 87
Gotcha 111
Gotcha 114
Gotcha 132
Gotcha 135
Gotcha 127
Gotcha 134
Gotcha 138
Gotcha 155
Gotcha 163
Gotcha 157
Gotcha 167
Gotcha 144
Gotcha 193
Gotcha 190
Gotcha 192
Gotcha 196
Gotcha 217
Gotcha 219
Gotcha 223
Gotcha 205
Gotcha 235
Gotcha 233
Gotcha 236
Gotcha 238
Gotcha 224
Gotcha 208
Gotcha 228
Gotcha 246
Gotcha 241
Gotcha 253
Gotcha 267
Gotcha 269
Gotcha 263
Gotcha 251
Gotcha 271
Gotcha 247
Gotcha 284
Gotcha 286
Gotcha 276
Gotcha 303
Gotcha 304
Gotcha 306
Gotcha 305
Gotcha 289
Gotcha 288
Gotcha 275
Gotcha 344
Gotcha 313
Gotcha 347
Gotcha 346
Gotcha 327
Gotcha 349
Gotcha 370
Gotcha 372
Gotcha 371
Gotcha 368
Gotcha 393
Gotcha 397
Gotcha 395
Gotcha 398
Gotcha 427
Gotcha 392
Gotcha 429
Gotcha 422
Gotcha 425
Gotcha 458
Gotcha 457
Gotcha 431
Gotcha 460
Got

## TODO
- составить список пдф парсеров
- протестить весь список
- подумать как можно обьединять результаты(сделать из двух какашек золото)