In [1]:
import sys
import asyncio
from aiohttp import ClientSession, TCPConnector
from bs4 import BeautifulSoup
import re
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time, sleep

from itertools import islice
from IPython.display import clear_output

In [2]:
COVERAGE_TOTALS = {
    1: [26, 87, 79, 142, 78, 18, 189, 110, 198, 85, 35, 98, 62, 52],
    2: [103, 24, 52, 85, 76, 88, 83, 78, 50, 159, 90, 64],
    3: [30, 53, 84, 38, 45, 54, 97, 103, 65, 93, 57, 104],
    4: [60, 45, 41, 26, 32, 64, 83, 138, 80, 78, 128],
    5: [52, 54, 52, 55, 57, 80, 65, 69, 43, 98],
}

In [3]:
totals = [sum(COVERAGE_TOTALS[i]) for i in range(1, 6)]
global_total = sum(totals)
print(global_total)

4434


In [4]:
def generate_zads(sem):
    return [
        ".".join((str(i + 1), str(j)))
        for i in range(len(COVERAGE_TOTALS[sem]))
        for j in range(1, COVERAGE_TOTALS[sem][i] + 1)
    ]

In [5]:
url_list = [
    f"https://mipt1.ru/1_2_3_4_5_kor.php?sem={sem}&zad={zad}"
    for sem in range(1, 6)
    for zad in generate_zads(sem)
]

In [6]:
# url_dict = {}
# for sem in range(1, 6):
#     url_dict[sem] = [f"https://mipt1.ru/1_2_3_4_5_kor.php?sem={sem}&zad={zad}" for zad in generate_zads(sem)]

# for sem in range(1, 6):
#     assert len(url_dict[sem]) == len(set(url_dict[sem]))

In [7]:
with open("data.csv", "w", encoding="utf8") as f:
    f.write("sem,zad,page\n")

In [8]:
def purify(output):
    if re.match(r".*странице", output):
        page = re.search(r"№\d*", output)[0][1:]
    elif re.match(r".*не найдена", output):
        page = 0
    elif re.match(r"Укажите номер задачи корректно!", output):
        raise Exception("Oh shit... wrong input")
    else:
        raise Exception("Oh shit... this is even worse...")
    return page

In [9]:
def parse_and_save(html, io):
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find_all("div", class_="short_content")[0]
    output = div.b.get_text()
    page = purify(output)
    meta = soup.find(id="footer").p.a.get('href')
    metalist = re.split('=|&', meta)
    sem, zad = metalist[1], metalist[3]
    io.write(",".join([sem, zad, str(page)]) + "\n")

In [10]:
lock = asyncio.Lock()


async def fetch_links_and_pass(url: str, session: ClientSession, io_to_write):
    
    async with session.get(url) as response:
        html = await response.text()
        
    parse_and_save(html, io_to_write)


async def fetch(urls):
    with open("data.csv", "a", encoding="utf8") as opened_file:
        def chunk(it, size):
            it = iter(it)
            return iter(lambda: tuple(islice(it, size)), ())
        chunks = list(chunk(urls, 300))
        counter = 0
        for urls_chunk in chunks:
            clear_output(wait=True)
            counter += len(urls_chunk)
            conn = TCPConnector(limit=1)
            async with ClientSession(connector=conn) as session:
                tasks = [
                    asyncio.create_task(fetch_links_and_pass(url, session, opened_file)) for url in urls_chunk
                ]
                await asyncio.gather(*tasks)
            print(f"{counter} urls out of {len(urls)}")


start = time()
await fetch(url_list)
time_took = time() - start
print("All done! Seconds took: ", time_took)

4434 urls out of 4434
All done! Seconds took:  824.3027274608612


In [11]:
# with open("data.csv", "a", encoding="utf8") as f:
#     cnt = 0
#     length = len(url_list)
#     for url in url_list:
#         clear_output(wait=True)
#         with urllib.request.urlopen(url) as response:
#             html = response.read()
#         parse_and_save(html, f)
        
#         cnt += 1
#         print(f"{cnt} out of {length}")


# print("All done!")

In [12]:
# # temp cell
# with urllib.request.urlopen("https://mipt1.ru/1_2_3_4_5_kor.php?sem=4&zad=1.1") as response:
#     html = response.read()
# soup = BeautifulSoup(html, "html.parser")
# div = soup.find_all("div", class_="short_content")[0]
# meta = soup.find(id="footer").p.a.get('href')
# metalist = re.split('=|&', meta)
# sem, zad = metalist[1], metalist[3]
# output = div.b.get_text()
# page = purify(output)
# print(page)
# print(sem, zad)

In [53]:
df = pd.read_csv("data.csv", dtype={"zad": str, "sem": str})
# df["identifier"] = str(df["sem"]) + str(df["zad"])
df = df.assign(identifier = lambda x: (x['sem'] + x['zad']))

In [45]:
total_got = len(df)
assert total_got == global_total
df_non_zero = df[df["page"] != 0]
in_koryavov = len(df_non_zero)
coverage = round(in_koryavov / total_got * 100, 1)
coverage

73.7

In [46]:
for sem in range(1, 6):
    print(len(df[(df["sem"] == sem)]["zad"].unique()), len(df[(df["sem"] == sem)]))

1259 1259
952 952
823 823
775 775
625 625


|hey|hey|
| --- | --- |
|1141|1259|
|862|952|
|746|823|
|702|775|
|567|625|


In [54]:
df.head()

Unnamed: 0,sem,zad,page,identifier
0,1,1.1,10,11.1
1,1,1.2,10,11.2
2,1,1.3,10,11.3
3,1,1.4,11,11.4
4,1,1.5,11,11.5


In [43]:
df[(df["sem"] == 2) & (df["zad"] == "1.1")]["page"].iloc[0]

40

In [60]:
df.to_csv('data2.csv', index=False)

In [61]:
df2 = pd.read_csv("data2.csv")
df2.head(10)

Unnamed: 0,sem,zad,page,identifier
0,1,1.1,10,11.1
1,1,1.2,10,11.2
2,1,1.3,10,11.3
3,1,1.4,11,11.4
4,1,1.5,11,11.5
5,1,1.6,0,11.6
6,1,1.7,15,11.7
7,1,1.8,0,11.8
8,1,1.9,12,11.9
9,1,1.1,24,11.1


In [None]:
for i in range(10):
    print("hey")
for i in range(10):
    print

In [2]:
def A(dfd):
    pass

In [3]:
import numpy as np