In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Download the programming catalogue page. The timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() stops the
# notebook on an HTTP error instead of parsing an error page into zero cards.
response = requests.get('https://practicum.yandex.ru/catalog/programming/', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Collect the <a> elements that carry the course-card link class.
cards = soup.find_all('a', class_='prof-card__content-link')

In [None]:
# Split the scraped cards into two parallel lists: the visible text of each
# card and the value of its href attribute.
courses = [card.text for card in cards]
links = [card.get('href') for card in cards]

In [None]:
# Remove the first scraped entry from both lists so they stay aligned.
# NOTE(review): presumably the first card is not a real course — confirm.
# This mutates the lists in place: re-running the cell removes one more entry
# each time, and it raises IndexError if the lists are empty.
courses.pop(0)
links.pop(0)

In [None]:
from urllib.parse import urljoin

# Remove non-breaking spaces from the scraped course titles.
courses = [course.replace("\xa0", "") for course in courses]
# Resolve every href against the site root. Unlike plain string concatenation,
# urljoin leaves an already-absolute URL untouched, so re-running this cell
# does not prepend the host a second time, and a relative href without a
# leading slash still gets a proper "/" separator.
links = [urljoin('https://practicum.yandex.ru', link) for link in links]
# Strip the "/?from=catalog" suffix from the links.
links = [link.replace("/?from=catalog","") for link in links]

In [None]:
# Assemble the result table: one row per course, with an empty Price column.
df = pd.DataFrame(dict(Course=courses, Link=links, Price=None))

In [None]:
# Set the Price column to None for every row, then preview the first rows.
df['Price'] = None
df.head()

In [None]:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Start a Chrome session and load the catalogue page in it.
driver = webdriver.Chrome()
driver.get('https://practicum.yandex.ru/catalog/programming')
# Block for up to 20 seconds until elements with the card-link class are
# present in the DOM, then keep them as the list of card elements.
card_locator = (By.CLASS_NAME, "prof-card__content-link")
cards = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located(card_locator)
)


In [None]:

import time
# Save a screenshot of every card element as "<index>.png" in the working
# directory.
for i, card in enumerate(cards):
    # Scroll the card into the viewport before capturing it.
    driver.execute_script("arguments[0].scrollIntoView(true);", card)
    # Fixed pause after scrolling (presumably to let the card finish
    # rendering — confirm). The former driver.implicitly_wait(5) call was
    # dropped: it only configures the element-lookup timeout of the session
    # and never pauses execution, so it had no effect here.
    time.sleep(1)
    card.screenshot(f'{i}.png')

# Nothing in the visible notebook drives the browser after this point, so shut
# it down to avoid leaving Chrome/chromedriver processes behind. `cards` stays
# usable as a plain list (e.g. for len()); re-run the driver cell to start a
# fresh session.
driver.quit()
    

In [None]:
import pytesseract
import re
from PIL import Image
def _parse_price(ocr_text):
    """Extract a price from OCR output, or return None when none is usable.

    The pattern matches space-separated digit groups followed by one trailing
    character from "2", "Р" or "?" (presumably the ways the OCR renders the
    rouble sign — confirm). When exactly one candidate is found it is used;
    when exactly two are found the second one is used; any other count yields
    None.

    Args:
        ocr_text: Text recognised from a card screenshot.

    Returns:
        The price as an int, or None if no candidate contains digits.
    """
    matches = re.findall(r'(?:\d+\s)*\d*[2Р?]\b', ocr_text)
    if len(matches) not in (1, 2):
        return None
    # matches[-1] is the only candidate, or the second of two.
    candidate = matches[-1]
    # Drop the trailing currency character and ALL whitespace: \s in the
    # pattern also matches newlines and non-breaking spaces, which
    # str.replace(" ", "") would leave behind and int() would reject.
    digits = re.sub(r'\s+', '', candidate[:-1])
    # A candidate can consist of the currency character alone; treat that as
    # "no price" instead of letting int('') raise ValueError.
    return int(digits) if digits.isdecimal() else None


# Write through a single positional indexer. The chained form
# df.iloc[row]['Price'] = value assigns into a temporary row object and is not
# guaranteed to modify df.
price_col = df.columns.get_loc('Price')
# Screenshot 0 is skipped; screenshot i corresponds to row i-1 of df.
for i in range(1, len(cards)):
    # The context manager closes the image file once OCR is done.
    with Image.open(f'{i}.png') as image:
        text = pytesseract.image_to_string(image, lang='rus')
    df.iloc[i - 1, price_col] = _parse_price(text)

In [None]:
# Display the table to inspect the extracted prices.
df

In [None]:
# Keep only the rows that have no missing values, then display the result.
df = df.dropna()
df

In [None]:
# Handle special cases: remove the rows with index labels 54, 55 and 57 by hand.
# NOTE(review): presumably rows whose recognised price was wrong — confirm.
# errors='ignore' skips labels that are no longer present (already removed by
# dropna or by a previous run of this cell) instead of raising KeyError.
df.drop([54, 55, 57], inplace=True, errors='ignore')
df


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
prices = df['Price'].to_numpy()
print(prices)
sns.set(style="whitegrid")

# Draw the histogram on an explicit 10x6 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(prices, color='skyblue', bins=10, edgecolor='black', ax=ax)

# Title and axis labels.
ax.set_title('Распределение цен курсов Яндекс Практикума', fontsize=16)
ax.set_xlabel('Значения', fontsize=14)
ax.set_ylabel('Кол-во курсов', fontsize=14)

# Render the figure.
plt.show()

In [None]:
# Sort by price in ascending order, drop the Link column and keep the last
# five rows, i.e. the five highest-priced courses.
df_sorted = df.sort_values(by='Price').drop(columns='Link').tail()

In [None]:
plt.figure(figsize=(8, 4))
sns.set_style("whitegrid")
# Render df_sorted as a matplotlib table. The header colours are sized from
# df_sorted itself (the frame actually shown), not from df, which has a
# different number of columns.
table = plt.table(cellText=df_sorted.values,
                  colLabels=df_sorted.columns,
                  loc='center',
                  cellLoc='center',
                  colColours=['#f0f0f0']*len(df_sorted.columns))  # header background colour

# Tune the table appearance: fixed font size instead of auto-scaling.
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.2, 1.5)  # scale the cells (width, height)

# Hide the axes so only the table is visible.
plt.axis('off')

# Show the table.
plt.show()