In [1]:
import requests
from lxml import html
from pymongo import MongoClient
import pandas as pd
import threading
import logging
from fake_useragent import UserAgent
import os
import time

In [2]:
# Configure the root logger so INFO-level (and higher) messages are emitted
logging.basicConfig(level=logging.INFO)

In [3]:
# Send a browser-like User-Agent with every HTTP request so the server is less
# likely to block the scraper.
headers = {
    # The HTTP header name is hyphenated. With the previous 'User_Agent' key the
    # random browser string was sent under an unrelated header name and
    # requests kept advertising its default "python-requests/..." agent.
    'User-Agent': UserAgent().random
}

In [4]:
# Scraper downloads one page and converts the rows of its "records-table" into dicts
class Scraper:
    """Download a single results page and parse the rows of its ``records-table``.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Optional; without a timeout a stalled connection blocks forever.
    """

    def __init__(self, url, timeout=30):
        self.url = url
        self.timeout = timeout

    def scrape_data(self) -> list:
        """Fetch ``self.url`` and return one dict per successfully parsed row.

        The request is sent with the module-level ``headers`` mapping.

        Returns:
            A list of dicts with the keys ``Rank``, ``Mark``, ``WIND``,
            ``Competitor``, ``DOB``, ``Nat``, ``Pos``, ``Venue``, ``Date`` and
            ``ResultsScore``. The list is empty when the server answers with an
            error status or when no table row matches.

        Raises:
            requests.RequestException: On connection failures or timeouts.
        """
        response = requests.get(self.url, headers=headers, timeout=self.timeout)
        if not response.ok:
            # An error page contains no result rows; report it instead of
            # silently parsing it into an empty list.
            logging.warning(f"Request to {self.url} failed with status {response.status_code}")
            return []

        tree = html.fromstring(response.content)
        rows = tree.xpath("//table[@class='records-table']/tbody/tr")

        result_list = []
        for row in rows:
            try:
                result_list.append(self._parse_row(row))
            except (IndexError, ValueError) as e:
                # A row with missing cells or a non-numeric Rank/Mark/score
                # should not discard every other row of the page.
                logging.warning(f"Skipping malformed row on {self.url}: {e}")
        return result_list

    @staticmethod
    def _parse_row(row) -> dict:
        """Build the result dict for one ``<tr>`` element.

        Raises:
            IndexError: If an expected cell is missing.
            ValueError: If Rank, Mark or ResultsScore is not numeric.
        """
        # Text nodes that are direct children of the row's <td> cells. The
        # positions used below are positions in this list, not column numbers.
        cells = [text.strip() for text in row.xpath(".//td/text()")]
        return {
            'Rank': int(cells[0]),
            'Mark': float(cells[1]),
            # A blank wind cell is stored as the string "0".
            'WIND': cells[2] or "0",
            # The competitor name is taken from the link inside the 4th cell.
            'Competitor': row.xpath(".//td[4]/a/text()")[0].strip(),
            'DOB': cells[5],
            'Nat': cells[7],
            'Pos': cells[8],
            'Venue': cells[9],
            'Date': cells[10],
            'ResultsScore': int(cells[11]),
        }

In [5]:
# DataSaver stores a list of row dicts in MongoDB (save_to_mongo) and/or appends
# it to a CSV file (save_to_csv)
class DataSaver:
    """Persist a list of row dicts to MongoDB and/or to ``results_table.csv``.

    Args:
        data_list: List of dicts, one per row to store.
    """

    # Serialises the "does the file exist?" check and the append that follows,
    # so threads of this process cannot both decide to write the header or
    # truncate each other's rows.
    _csv_lock = threading.Lock()

    def __init__(self, data_list):
        self.data_list = data_list

    def save_to_mongo(self) -> None:
        """Insert the rows into ``sport_db.results_table`` on localhost:27017.

        Errors are logged, not raised. An empty ``data_list`` is skipped.
        """
        if not self.data_list:
            # insert_many rejects an empty document list.
            logging.warning("No data to save to MongoDB")
            return
        try:
            # The context manager closes the client's connections when done.
            with MongoClient('localhost', 27017) as client:
                results_table = client['sport_db']['results_table']
                # insert_many adds an '_id' key to every dict it is given;
                # insert copies so the caller's rows stay untouched.
                results_table.insert_many([dict(doc) for doc in self.data_list])
            logging.info("Data saved to MongoDB")
        except Exception as e:
            logging.error(f"Error saving data to MongoDB: {e}")

    def save_to_csv(self) -> None:
        """Append the rows to ``results_table.csv`` in the working directory.

        The header is written only when the file is created. Errors are
        logged, not raised. An empty ``data_list`` is skipped so that no
        header-less file is created.
        """
        if not self.data_list:
            logging.warning("No data to save to CSV")
            return
        try:
            df = pd.DataFrame(self.data_list)
            with self._csv_lock:
                # Check once, under the lock, so mode and header always agree.
                file_exists = os.path.exists('results_table.csv')
                df.to_csv(
                    'results_table.csv',
                    mode='a' if file_exists else 'w',
                    index=False,
                    header=not file_exists,
                )
            logging.info("Data saved to CSV")
        except Exception as e:
            logging.error(f"Error saving data to CSV: {e}")

In [6]:
# Main entry point: scrape every page, then store the results using threads
# (only the saving is threaded; the pages themselves are fetched one by one)
def main_function():
    """Scrape pages 1-18 of the women's 60 m toplist and store every page.

    Pages are downloaded sequentially. Afterwards each page is inserted into
    MongoDB on its own thread, while one further thread appends all pages to
    the CSV file in page order. The function returns once every thread has
    finished.
    """
    mongo_threads = []
    csv_savers = []
    for i in range(1, 19):
        url = f"https://worldathletics.org/records/all-time-toplists/sprints/60-metres/all/women/senior?page={i}"
        data_list = Scraper(url).scrape_data()
        if not data_list:
            # Nothing to store for this page.
            logging.warning(f"No rows scraped from {url}; skipping")
            continue

        # The MongoDB insert adds an '_id' key to the dicts it receives. Give
        # it private copies so the CSV writer, running in another thread,
        # never sees half-modified rows.
        mongodb_saver = DataSaver([dict(row) for row in data_list])
        csv_savers.append(DataSaver(data_list))
        mongo_threads.append(threading.Thread(target=mongodb_saver.save_to_mongo))

    def save_all_csv():
        # All pages go to the same file. Writing them from one thread keeps the
        # header single and the rows in page order.
        for saver in csv_savers:
            saver.save_to_csv()

    threads = mongo_threads + [threading.Thread(target=save_all_csv)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    logging.info("All data saved")

In [7]:
# Run the pipeline; progress is reported through the logging output
if __name__ == '__main__':
    main_function()
    # NOTE(review): main_function joins all of its threads before returning,
    # so this one-second pause looks unnecessary — confirm before removing.
    time.sleep(1)

INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to CSV
INFO:root:Data saved to MongoDB
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to MongoDB
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to CSV
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data saved to MongoDB
INFO:root:Data s