In [11]:
import asyncio
import json
from dataclasses import dataclass
from itertools import product, islice

import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
# Endpoint every salary request is sent to.
BASE_URL: str = 'https://gorodrabot.ru/salary'

# Query-parameter grid: parameter name -> list of values to request.
# The file is read inside a context manager so the handle is closed right
# after loading instead of being left open until garbage collection.
with open('data/kwattrs.json', encoding='utf-8') as kwattrs_file:
    KWPARAMS: dict[str, list[str | int]] = json.load(kwattrs_file)

# Query-parameter name -> name of the matching Salary field.
PARAMS_LABELS: dict[str, str] = {
    'y': 'year',
    'mnt': 'month',
    'l': 'region',
}

# Keys of the salary figures, in the order they are zipped with parsed values.
SALARIES_NAMES: tuple[str, ...] = ('median', 'mean', 'modal')

In [3]:
@dataclass(frozen=True, kw_only=True)
class Salary:
    parsed_from: str
    year: int
    month: str
    region: str
    median: int | None
    mean: int | None
    modal: int | None

    @classmethod
    def from_labels(cls, **kwargs):
        kwargs = {PARAMS_LABELS.get(key, key): value for key, value in kwargs.items()}
        return cls(**kwargs)

In [4]:
@dataclass(frozen=True)
class Proxy:
    """Immutable description of a single proxy server entry."""

    ip: str
    port: str
    code: str
    country: str
    anonymity: str
    google: bool
    https: bool
    last_check: int

    @property
    def url(self):
        """Address of the proxy formatted as ``http://<ip>:<port>``."""
        return 'http://{}:{}'.format(self.ip, self.port)


class ProxyFactory:
    """Collects proxies from public listing pages and hands out working ones.

    Build instances with the async :meth:`create` constructor: it stores the
    filter settings and downloads the proxy lists. :meth:`get` then walks the
    downloaded list and returns the first proxy that answers a test request.
    """

    # Seconds per unit word used in the "last checked" column of the listings.
    UNITS: dict[str, int] = {
        'sec': 1,
        'secs': 1,
        'min': 60,
        'mins': 60,
        'hour': 3600,
        'hours': 3600,
        'day': 86400,
        'days': 86400,
    }

    # Listing pages that are scraped for proxies.
    URLS: list[str] = [
        'https://free-proxy-list.net/',
        'https://www.us-proxy.org/',
        'https://free-proxy-list.net/uk-proxy.html',
        'https://www.sslproxies.org/'
    ]

    @classmethod
    async def create(
        cls,
        *,
        code: list[str] | None = None,
        exclude_code: list[str] | None = None,
        anonymity: list[str] | None = None,
        google: bool | None = None,
        https: bool | None = None
    ):
        """Construct a proxy factory and download the proxy lists.

        Each filter is optional; ``None`` (or an empty list) disables it.

        Args:
            code: Keep only proxies whose code is in this list. Codes scraped
                from the listings are upper-cased before comparison.
            exclude_code: Drop proxies whose code is in this list.
            anonymity: Keep only proxies whose anonymity text is in this list.
                Scraped text is lower-cased before comparison.
            google: Keep only proxies whose ``google`` flag equals this value.
            https: Keep only proxies whose ``https`` flag equals this value.

        Returns:
            A factory whose ``proxies`` attribute is an iterator over the
            filtered proxies, ordered by ascending ``last_check``.
        """

        self = cls()

        self.code = set(code) if code else None
        self.exclude_code = set(exclude_code) if exclude_code else None
        self.anonymity = set(anonymity) if anonymity else None
        self.google = google
        self.https = https
        # A single shared iterator: every proxy is handed out at most once.
        self.proxies = iter(await self.get_proxies())

        return self

    async def get_proxies(self) -> list[Proxy]:
        """Download all listing pages and return the filtered, sorted proxies.

        All pages in ``URLS`` are fetched concurrently. The combined result is
        filtered with the factory settings and sorted by ascending
        ``last_check``.
        """

        async with aiohttp.ClientSession() as session:
            results = await asyncio.gather(
                *(self._fetch_proxies(session, url) for url in self.URLS)
            )

        proxies = self._filter_proxies(
            [proxy for result in results for proxy in result]
        )
        proxies.sort(key=lambda proxy: proxy.last_check)

        return proxies

    async def get(
        self,
        *,
        timeout: int = 2,
        check_url: str = BASE_URL
    ) -> Proxy:
        """Return the next proxy that successfully serves ``check_url``.

        Proxies are taken from the shared ``proxies`` iterator, so candidates
        consumed by earlier calls (working or not) are never tried again.

        Args:
            timeout: Total time budget in seconds for each test request.
            check_url: URL requested through the proxy to test it.

        Returns:
            The first proxy whose test request answers with status 200.

        Raises:
            ValueError: If the iterator is exhausted without a working proxy.
        """

        client_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            for proxy in self.proxies:
                try:
                    async with session.get(check_url, proxy=proxy.url) as response:
                        if response.status == 200:
                            return proxy
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    # A dead or too-slow proxy is expected here; it is
                    # deliberately ignored and the next candidate is tried.
                    continue

        raise ValueError('No working proxy found')

    async def _fetch_proxies(
        self,
        session: aiohttp.ClientSession,
        url: str
    ) -> list[Proxy]:
        """Download one listing page and parse its table rows into proxies.

        Rows that cannot be parsed are skipped so that a single odd row does
        not discard the whole page.
        """
        async with session.get(url) as response:
            doc = await response.text()

        soup = BeautifulSoup(doc, 'html.parser')

        proxies = []
        for row in soup.select('div.fpl-list tbody tr'):
            cells = [td.get_text(strip=True).lower() for td in row.select('td')[:8]]
            proxy = self._parse_row(cells)
            if proxy is not None:
                proxies.append(proxy)

        return proxies

    def _parse_row(self, cells: list[str]) -> Proxy | None:
        """Turn the lower-cased cell texts of one table row into a Proxy.

        Returns ``None`` when the row does not have exactly eight cells or
        when its "last checked" text cannot be understood.
        """
        if len(cells) != 8:
            return None

        ip, port, code, country, anonymity, google, https, last_check = cells
        age = self._parse_age(last_check)
        if age is None:
            return None

        return Proxy(
            ip,
            port,
            code.upper(),
            country,
            anonymity,
            google == 'yes',
            https == 'yes',
            age,
        )

    def _parse_age(self, text: str) -> int | None:
        """Convert text such as ``'1 hour 10 mins ago'`` into seconds.

        Consecutive ``<number> <unit>`` pairs at the start of the text are
        summed; parsing stops at the first token pair that is not a decimal
        number followed by a key of ``UNITS``. Returns ``None`` when not even
        one pair could be read.
        """
        tokens = text.split()
        seconds = 0
        matched = False
        for value, unit in zip(tokens[::2], tokens[1::2]):
            if not value.isdecimal() or unit not in self.UNITS:
                break
            seconds += int(value) * self.UNITS[unit]
            matched = True

        return seconds if matched else None

    def _filter_proxies(
        self,
        proxies: list[Proxy]
    ) -> list[Proxy]:
        """Return the proxies that satisfy every enabled filter setting.

        A setting of ``None`` disables the corresponding filter. The input
        order is preserved.
        """

        def keep(proxy: Proxy) -> bool:
            if self.code is not None and proxy.code not in self.code:
                return False
            if self.exclude_code is not None and proxy.code in self.exclude_code:
                return False
            if self.anonymity is not None and proxy.anonymity not in self.anonymity:
                return False
            if self.google is not None and proxy.google != self.google:
                return False
            if self.https is not None and proxy.https != self.https:
                return False
            return True

        return [proxy for proxy in proxies if keep(proxy)]

In [5]:
requests_limit: int | None = None
params_list: list = list(
    dict(zip(KWPARAMS.keys(), values))  #  Add keys
    for values in islice(product(*KWPARAMS.values()), requests_limit)  #  Product values
)
proxy_factory: ProxyFactory = await ProxyFactory.create(https=True)
simultaneity_limit: int = 15
batch_count: int = 3
batch_size: int = len(params_list) // batch_count

In [6]:
# Shared gate that parse() acquires around each request, capping how many
# requests run at once at `simultaneity_limit`.
semaphore = asyncio.Semaphore(value=simultaneity_limit)


def extract_salaries(doc: str) -> dict[str, int | None]:
    soup = BeautifulSoup(doc, 'html.parser')
    tags = soup.find_all('span', class_='statistics-list-section__item-title', limit=3)
    salaries = [int(tag.get_text(strip=True).replace(' ', '')) for tag in tags]
    match salaries:
        case [median, mean, modal]:
            salaries = (median, mean, modal)
        case [mean]:
            salaries = (None, mean, None)
        case []:
            salaries = (None, None, None)
    return dict(zip(SALARIES_NAMES, salaries))


async def parse(session: aiohttp.ClientSession, url: str, **kwargs) -> Salary:
    """Request one page and turn it into a Salary record.

    The request is made while holding the module-level ``semaphore``.

    Args:
        session: Client session used for the request.
        url: URL to request.
        **kwargs: Passed through to ``session.get``. The ``params`` entry, if
            present, is also used to fill the Salary fields.

    Returns:
        A Salary built from the request parameters, the parsed salary figures
        and the final URL of the response.
    """
    async with semaphore:
        async with session.get(url, **kwargs) as response:
            doc = await response.text()
            # Salary.parsed_from is declared as str, while response.url is a
            # URL object, so it is converted explicitly.
            parsed_from = str(response.url)

    salaries = extract_salaries(doc)
    # Fall back to an empty mapping so a call without `params` does not fail
    # on unpacking None.
    params = kwargs.get('params') or {}
    return Salary.from_labels(parsed_from=parsed_from, **params, **salaries)


async def parse_all() -> list[Salary]:
    """Request every entry of ``params_list`` and collect the Salary records.

    The requests are processed in consecutive batches of ``batch_size``. Each
    batch obtains a fresh proxy from ``proxy_factory`` and runs its requests
    concurrently; records are collected in completion order within a batch.

    Returns:
        All parsed Salary records.

    Raises:
        Any exception raised by a request or by ``proxy_factory.get()``. When
        a request fails, the unfinished requests of that batch are cancelled
        before the exception propagates.
    """
    salaries: list[Salary] = []
    # range() rejects a zero step, which batch_size can be when there are
    # fewer requests than batches.
    step = max(batch_size, 1)
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=simultaneity_limit),
        raise_for_status=True,
    ) as session:
        for start in tqdm(range(0, len(params_list), step), desc='Parsing'):
            batch = params_list[start : start + step]
            proxy = await proxy_factory.get()
            tasks = [
                asyncio.ensure_future(
                    parse(
                        session,
                        BASE_URL,
                        params=params,
                        proxy=proxy.url,
                    )
                )
                for params in batch
            ]
            try:
                results = [
                    await finished
                    for finished in tqdm(
                        asyncio.as_completed(tasks),
                        total=len(tasks),
                        # len(batch) keeps the label exact for a short last batch.
                        desc=f'Processing {start} - {start + len(batch)} urls',
                    )
                ]
            finally:
                # No-op on success. After a failure, stop the requests that
                # are still running so they do not outlive the session.
                unfinished = [task for task in tasks if not task.done()]
                for task in unfinished:
                    task.cancel()
                if unfinished:
                    await asyncio.gather(*unfinished, return_exceptions=True)
            salaries.extend(results)
    return salaries

In [8]:
# Run the full scrape. Top-level `await` works here because the notebook
# kernel executes cells inside a running event loop.
data = await parse_all()

Parsing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing 0 - 896 urls:   0%|          | 0/896 [00:00<?, ?it/s]

Processing 896 - 1792 urls:   0%|          | 0/896 [00:00<?, ?it/s]

Processing 1792 - 2688 urls:   0%|          | 0/896 [00:00<?, ?it/s]

In [12]:
# Tabulate the scraped records, one row per record.
df = pd.DataFrame(data=data)
# Persist the table without the positional index column.
df.to_csv('data/salaries.csv', index=False)

In [13]:
# Print column dtypes and non-null counts of the scraped table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2688 entries, 0 to 2687
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   parsed_from  2688 non-null   object 
 1   year         2688 non-null   int64  
 2   month        2688 non-null   object 
 3   region       2688 non-null   object 
 4   median       1931 non-null   float64
 5   mean         2687 non-null   float64
 6   modal        1931 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 147.1+ KB
