In [1]:
# Importing all necessary libraries

In [None]:
# Egemen.kz

In [1]:
import requests
from bs4 import BeautifulSoup as BS
import re as re
import os
import math
from datetime import datetime
import csv
import json

from selenium import webdriver
from selenium.webdriver.common.by import By

In [66]:
# Function for extracting the URL for articles in Egemen.kz

In [69]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('div', class_='clearfix news-t flexBlock')

    # Extract URLs from <a> tags inside those divs
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]

    print(full_urls)

In [70]:
extract_urls("https://egemen.kz","https://egemen.kz/search?q=%D0%94%D0%BE%D0%BD%D0%BE%D1%80")

['https://egemen.kz/article/377917-mayittik-donorlyqty-damytugha-ne-kedergi', 'https://egemen.kz/article/375100-qaytys-bolghan-donordynh-aghza-musheleri-4-adamnynh-omirin-saqtap-qaldy', 'https://egemen.kz/article/373373-suyek-kemigi-donory', 'https://egemen.kz/article/370779-bes-adamgha-omir-syylaghan-donor', 'https://egemen.kz/article/367937-donor-buyrekpen-dgurip-dombyra-unin-kokke-samghatty']


In [68]:
# Created the function specifically for the parsing Egemen.kz web-site content and saved as a text file
# Only works for one URL 

In [69]:
def article_contents(article_url, storage_dir):
    
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        
        # Check for 404 errors or other HTTP status codes
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        
        # Парсинг HTML
        soup = BS(html_content, "html.parser")
        
        # Title extract
        try:
            title = soup.find('h1')
            title = title.get_text(strip=True)
        except Exception: 
            title = "Title not found"
        
        # Author extract
        try:
            author = soup.find('div', class_='name-auth').text.strip()
        except Exception: 
            author = "Author not found"
        
        # Date extract
        try:
            date_tag = soup.find('meta', itemprop="datePublished")
            if date_tag and date_tag.has_attr('content'):
                date_published = date_tag['content']
                date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")
            else:
                date_published = "Date not found"
        except Exception: 
            date_published = "Date not found"
        
        # Content extract
        try:
            article_body = soup.find("div", itemprop="articleBody")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        except Exception: 
            article_text = "Content not found"
        
        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{title}\n\n")
            file.write(article_text)
        
        print("Статья успешно сохранена!")
        
        return {
        "Title": title,
        "Date Published": date_published,
        "Author": author,
        "URL": article_url,
        "Source": article_url.split('/')[2]
    }

    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [70]:
def extract_article_details(article_url):
    """Extract title, date published, author, and URL."""
    response = requests.get(article_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    # Extract title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"
    
    # Extract date published
    date_tag = soup.find('meta', itemprop="datePublished")
    date_published = date_tag['content'] if date_tag and date_tag.has_attr('content') else "No Date"
    date_published_dt = None
    if date_published != "No Date":
        date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")
        date_published = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
    
    # Extract author
    author_tag = soup.find('div', class_='name-auth')
    author = author_tag.get_text(strip=True) if author_tag else "No Author"
    
    return {
        "Title": title,
        "Date Published": date_published,
        "Author": author,
        "URL": article_url
    }


In [71]:
def main():
    
    # Key word input
    key_word_input = input("Введите ключевое слово: \n")
    
    # Creating url with key-word
    base_url_search = "https://egemen.kz"                         #base url can be changed by other URL
    search_url = base_url_search + "/search?q=" 
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url) 
    html_content = response.text
    
    # Parsing HTML
    soup = BS(html_content, "html.parser")
    
    # Finding articles number, included in web-site
    article_founded = soup.find('small').text

    print(article_founded)
    
    # Conversion to int from list
    num_article = re.findall(r'\d+', article_founded)
    num = int(num_article[0]) 
    
    # Each web-page in site only contains 5 articels, it can be also changed 
    articles_per_page = 5
    pages = math.ceil(num / articles_per_page) 

    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(1, pages + 1):
        full_url = key_word_url + "&page=" + str(page)
        print(f"Processing page: {page} {full_url}\n")
    
        # Call the function and get the extracted URLs
        article_urls = extract_urls(base_url_search, full_url)
    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to the text")
    for url in urls_list:
        print(url)
        article_contents(url,storage_dir)
        
    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    

        
    # Writing to CSV
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
    
        for url in urls_list:
            print(f"Processing article: {url}")
            article_details = extract_article_details(url)
            writer.writerow(article_details)
    print("CSV файл успешно сохранен !")

In [72]:
if __name__ == "__main__":
    main()

Введите ключевое слово: 
Донор
Ссылка по вашему запросу: 
https://egemen.kz/search?q=Донор
73 материал табылды
Чило страниц:15

Processing page: 1 https://egemen.kz/search?q=Донор&page=1

https://egemen.kz/article/377917-mayittik-donorlyqty-damytugha-ne-kedergi

https://egemen.kz/article/375100-qaytys-bolghan-donordynh-aghza-musheleri-4-adamnynh-omirin-saqtap-qaldy

https://egemen.kz/article/373373-suyek-kemigi-donory

https://egemen.kz/article/370779-bes-adamgha-omir-syylaghan-donor

https://egemen.kz/article/367937-donor-buyrekpen-dgurip-dombyra-unin-kokke-samghatty

Processing page: 2 https://egemen.kz/search?q=Донор&page=2

https://egemen.kz/article/367714-donor-oblys-qatary-qaytse-artady

https://egemen.kz/article/357191-mayit-donorlyghyn-qalpyna-keltiru-qadget-eks-ministr-transplantatsiyadaghy-tuytk

https://egemen.kz/article/356281-eki-donor-toghyz-adamnynh-omirin-saqtady-ministrlik-donorlyqqa-kelisken-azamatta

https://egemen.kz/article/356279-aghza-mushelerimdi-donor-retinde-b

Статья успешно сохранена!
https://egemen.kz/article/375100-qaytys-bolghan-donordynh-aghza-musheleri-4-adamnynh-omirin-saqtap-qaldy
Статья успешно сохранена!
https://egemen.kz/article/373373-suyek-kemigi-donory
Статья успешно сохранена!
https://egemen.kz/article/370779-bes-adamgha-omir-syylaghan-donor
Статья успешно сохранена!
https://egemen.kz/article/367937-donor-buyrekpen-dgurip-dombyra-unin-kokke-samghatty
Статья успешно сохранена!
https://egemen.kz/article/367714-donor-oblys-qatary-qaytse-artady
Статья успешно сохранена!
https://egemen.kz/article/357191-mayit-donorlyghyn-qalpyna-keltiru-qadget-eks-ministr-transplantatsiyadaghy-tuytk
Статья успешно сохранена!
https://egemen.kz/article/356281-eki-donor-toghyz-adamnynh-omirin-saqtady-ministrlik-donorlyqqa-kelisken-azamatta
Статья успешно сохранена!
https://egemen.kz/article/356279-aghza-mushelerimdi-donor-retinde-beruge-kelisemin-densaulyq-saqtau-ministri
Статья успешно сохранена!
https://egemen.kz/article/351147-shymkent-donor-shaqar

Processing article: https://egemen.kz/article/356281-eki-donor-toghyz-adamnynh-omirin-saqtady-ministrlik-donorlyqqa-kelisken-azamatta
Processing article: https://egemen.kz/article/356279-aghza-mushelerimdi-donor-retinde-beruge-kelisemin-densaulyq-saqtau-ministri
Processing article: https://egemen.kz/article/351147-shymkent-donor-shaqargha-aynala-ma
Processing article: https://egemen.kz/article/350387-qazaqstanda-europalyq-aghza-donorynynh-kunine-oray-donor-kuni-atap-otiledi
Processing article: https://egemen.kz/article/346275-bir-donor-tort-adamdy-qutqaryp-qaldy
Processing article: https://egemen.kz/article/345996-bir-donor-4-adamnynh-omirin-saqtap-qaldy
Processing article: https://egemen.kz/article/333286-transplantatsiyada-donor-tapshy
Processing article: https://egemen.kz/article/332119-kutu-paraghy-–-eki-dgylda-donorlyq-aghzagha-zaru-558-patsient-koz-dgumdy
Processing article: https://egemen.kz/article/327890-donor-bolu-–-dgauapty-is
Processing article: https://egemen.kz/article/32

In [None]:
###### Tengri-news

In [None]:
## https://tengrinews.kz 

In [115]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('div', class_='content_main_item')

    # Extract URLs from <a> tags inside those divs
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]

    return full_urls

In [116]:
def save_to_csv(data_list, csv_file, headers):
    """Saves a list of article details to a CSV file."""
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data_list)
    print(f"CSV файл успешно сохранен в {csv_file}!")


In [117]:
def extract_article_details(article_url):
    """Extract title, date published, author, and URL."""
    response = requests.get(article_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    # Extract title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"
    
    # Extract date published
    json_ld_tags = soup.find_all('script', type="application/ld+json")
    date_published = "No Date"

    # Iterate through the tags to find the one with 'datePublished'
    for tag in json_ld_tags:
        try:
            # Parse the JSON content
            json_ld = json.loads(tag.string)
        
            # Check if it's a dictionary and has 'datePublished'
            if isinstance(json_ld, dict) and 'datePublished' in json_ld:
                date_published = json_ld['datePublished']
                break
            elif isinstance(json_ld, list):  # JSON-LD can also be a list of objects
                for item in json_ld:
                    if '@type' in item and item['@type'] == 'NewsArticle' and 'datePublished' in item:
                        date_published = item['datePublished']
                        break
        except Exception as e:
            print(f"Error parsing JSON-LD: {e}")

    # Parse the date if found
    if date_published != "No Date":
        try:
            date_published_dt = datetime.strptime(date_published, "%Y-%m-%dT%H:%M:%S%z")
            date_published = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError as e:
            print(f"Error parsing date: {e}")
    
    # Extract author
    author_tag = soup.find('span', class_='content_main_meta_author_item_name')
    author = author_tag.get_text(strip=True) if author_tag else "No Author"
    
    return {
        "Title": title,
        "Date Published": date_published,
        "Author": author,
        "URL": article_url,
        "Source": article_url.split('/')[2]
    }


In [114]:
def article_contents(article_url, storage_dir):
    
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        
        # Check for 404 errors or other HTTP status codes
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        
        # Парсинг HTML
        soup = BS(html_content, "html.parser")
        
        # Title extract
        try:
            title_tag = soup.find('h1')
            title = title_tag.get_text(strip=True) if title_tag else "No Title"
        except Exception: 
            title = "Title not found"
        
        # Author extract
        try:
            author_tag = soup.find('span', class_='content_main_meta_author_item_name')
            author = author_tag.get_text(strip=True) if author_tag else "No Author"
   
        except Exception: 
            author = "Author not found"
        # Extract date published
        json_ld_tags = soup.find_all('script', type="application/ld+json")
        date_published = "No Date"
        # Date extract
        for tag in json_ld_tags:
            try:
                # Parse the JSON content
                json_ld = json.loads(tag.string)
        
                # Check if it's a dictionary and has 'datePublished'
                if isinstance(json_ld, dict) and 'datePublished' in json_ld:
                    date_published = json_ld['datePublished']
                    break
                elif isinstance(json_ld, list):  # JSON-LD can also be a list of objects
                    for item in json_ld:
                        if '@type' in item and item['@type'] == 'NewsArticle' and 'datePublished' in item:
                            date_published = item['datePublished']
                            break
            except Exception as e:
                print(f"Error parsing JSON-LD: {e}")

        # Parse the date if found
        if date_published != "No Date":
            try:
                date_published_dt = datetime.strptime(date_published, "%Y-%m-%dT%H:%M:%S%z")
                date_published = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
            except ValueError as e:
                print(f"Error parsing date: {e}")
        # Content extract
        try:
            article_body = soup.find("div", class_="content_main_text")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        except Exception: 
            article_text = "Content not found"
        
        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{title}\n\n")
            file.write(article_text)
        
        print("Статья успешно сохранена!")
        
        return {
            "Title": title,
            "Date Published": date_published,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
        
    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [118]:
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://tengrinews.kz"
    search_url = base_url + "/search/?text="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url) 
    html_content = response.text
    
    # Parsing HTML
    soup = BS(html_content, "html.parser")
    
    pages = 5 # Counting manually how much page usually indicates in the front page. 
    articles_count = 0
    urls_list = []

    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(1, pages + 1):  # Итерируемся от 1 до 5 включительно
        full_url = base_url + "/search/page/" + str(page) + "/?field=all&text=" + string +"&sort=date"
        print(f"Processing page:{page}\n")
        # Call the function and get the extracted URLs
        article_urls = extract_urls(base_url, full_url)
    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
        articles_count += len(article_urls)
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        
    print(f"Articles were found:{articles_count}")

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()


Input key word:Донор
Ссылка по вашему запросу: 
https://tengrinews.kz/search/?text=Донор
Чило страниц:5

Processing page:1

https://tengrinews.kz/mixnews/mark-tsukerberg-meta-otkajetsya-faktchekinga-facebook-559053/

https://tengrinews.kz/usa/bayden-obratilsya-strane-svyazi-pobedoy-trampa-vyiborah-553322/

https://tengrinews.kz/kazakhstan_news/sledam-pavla-durova-kazahstantsyi-stanovyatsya-donorami-543721/

https://tengrinews.kz/curious/v-reanimatsii-povtoryal-imya-neznakomogo-cheloveka-istoriya-543236/

https://tengrinews.kz/world_news/vtoroy-patsient-vyilechilsya-ot-vich-v-germanii-541917/

https://tengrinews.kz/kazakhstan_news/si-tszinpin-napisal-statyu-obschem-stremlenii-kitaya-540060/

https://tengrinews.kz/kazakhstan_news/stat-geroem-prosto-sdat-stvolovyie-kletki-spasti-bolnogo-533819/

https://tengrinews.kz/kazakhstan_news/balanyin-fotosuretne-ruyina-karap-tandauga-boladyi-elmzde-531199/

https://tengrinews.kz/healthy/mogut-donoryi-krovi-poluchat-zakonnyie-otgulyi-kazahstane-507

Статья успешно сохранена!
https://tengrinews.kz/usa/bayden-obratilsya-strane-svyazi-pobedoy-trampa-vyiborah-553322/
Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/sledam-pavla-durova-kazahstantsyi-stanovyatsya-donorami-543721/
Статья успешно сохранена!
https://tengrinews.kz/curious/v-reanimatsii-povtoryal-imya-neznakomogo-cheloveka-istoriya-543236/
Статья успешно сохранена!
https://tengrinews.kz/world_news/vtoroy-patsient-vyilechilsya-ot-vich-v-germanii-541917/
Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/si-tszinpin-napisal-statyu-obschem-stremlenii-kitaya-540060/
Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/stat-geroem-prosto-sdat-stvolovyie-kletki-spasti-bolnogo-533819/
Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/balanyin-fotosuretne-ruyina-karap-tandauga-boladyi-elmzde-531199/
Статья успешно сохранена!
https://tengrinews.kz/healthy/mogut-donoryi-krovi-poluchat-zakonnyie-otgulyi-kazahstane-507042/
Статья успе

Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/izbejat-moshennichestva-okazanii-pomoschi-bolnyim-detyam-301732/
Статья успешно сохранена!
https://tengrinews.kz/usa/prestijnyiy-amerikanskiy-vuz-vyistupil-protiv-301127/
Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/dvuhletney-tahmine-abyishevoy-peresadili-kostnyiy-mozg-299141/
Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/kitay-vyinujden-postavlyat-svoih-rabochih-kazahstan-ekspert-288408/
Статья успешно сохранена!
https://tengrinews.kz/kazakhstan_news/pomenyat-operatora-svyazi-otmenyi-mobilnogo-rabstva-286640/
Статья успешно сохранена!
https://tengrinews.kz/medicine/v-kazahstane-vpervyie-nezakonno-izyyali-chelovecheskiy-organ-283261/
Статья успешно сохранена!
https://tengrinews.kz/story/kazahstanskiy-pevets-otvetil-post-razlichii-almatyi-astanyi-280092/
Статья успешно сохранена!
https://tengrinews.kz/europe/detyam-donorov-spermyi-razreshili-uznavat-imya-ottsa-lyubom-269204/
Статья успе

In [None]:
### Azattyk too long name modification

In [8]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('li', class_='col-xs-12 col-sm-12 col-md-12 col-lg-12 fui-grid__inner')
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]
    
    return full_urls

def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('li', class_='col-xs-12 col-sm-12 col-md-12 col-lg-12 fui-grid__inner')
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]
    
    return full_urls

def article_contents(article_url, storage_dir):
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        soup = BS(html_content, "html.parser")
        
        # Title extract
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"
        
        # Author extract
        author_tag = soup.find('a', class_='links__item-link')
        author = author_tag.get_text(strip=True) if author_tag else "No Author"
        
        # Extract date published
        date_tag = soup.find('time', pubdate="pubdate")
        if date_tag and date_tag.has_attr('datetime'):
            date_published = date_tag['datetime']
            date_published_dt = datetime.strptime(date_published, "%Y-%m-%dT%H:%M:%S%z")
            formatted_timestamp = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
        else:
            formatted_timestamp = "Date not found"
        
        # Content extract
        article_body = soup.find("div", class_="col-xs-12 col-sm-12 col-md-10 col-lg-10 pull-right")
        article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        
        # Generate safe filename
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)[:100]
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        
        # Save article to file
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{valid_title}\n\n")
            file.write(article_text)
        
        print(f"Статья успешно сохранена как {filename}!")
        
        return {
            "Title": valid_title,
            "Date Published": formatted_timestamp,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return {
            "Title": "Error",
            "Date Published": "Error",
            "Author": "Error",
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": f"Error: {e}"
        }
    
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://www.azattyq.org/"
    search_url = base_url + "/s?k="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url) 
    html_content = response.text
    
    # Parsing HTML
    soup = BS(html_content, "html.parser")
    
    article_founded = soup.find("span", class_ ="srch-result__results-count")
    articles = article_founded.get_text()

    num_article = re.findall(r'\d+', articles)
    num = int(num_article[0]) 
    
    # Each web-page in site only contains 5 articels, it can be also changed 
    articles_per_page = 10
    pages = math.ceil(num / articles_per_page) 

    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(1, pages + 1):  # Итерируемся от 1 до 5 включительно
        full_url = key_word_url + "&tab=any-content&pi=" + str(page) + "&r=any&pp=10"
        print(f"Processing page:{page}\n")
        # Call the function and get the extracted URLs
        article_urls = extract_urls(base_url, full_url)
    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()


Input key word:Донор
Ссылка по вашему запросу: 
https://www.azattyq.org//s?k=Донор
Чило страниц:16

Processing page:1

https://www.azattyq.org/a/kremlin-interference-2024-elections-eastern-europe-disinformation/33165336.html

https://www.azattyq.org/a/32814254.html

https://www.azattyq.org/a/32805928.html

https://www.azattyq.org/a/russia-independent-media-abroad-emigre-journalism-alive/32800811.html

https://www.azattyq.org/a/32795027.html

https://www.azattyq.org/a/32706241.html

https://www.azattyq.org/a/kazakhsan-jusan-bank-central-asia-west-china/32504577.html

https://www.azattyq.org/a/32498757.html

https://www.azattyq.org/a/kazakhstan-media-review-tasmagambetov-putin-sanction/32384676.html

https://www.azattyq.org/a/32091469.html

Processing page:2

https://www.azattyq.org/a/32015084.html

https://www.azattyq.org/a/russia-ukraine-invasion-six-months-five-things-analysis/32002079.html

https://www.azattyq.org/a/31967325.html

https://www.azattyq.org/a/afghanistan-taliban-legitim

https://www.azattyq.org/a/1161400.html

https://www.azattyq.org/a/1156371.html

https://www.azattyq.org/a/1153502.html

https://www.azattyq.org/a/1150851.html

https://www.azattyq.org/a/1150829.html

https://www.azattyq.org/a/1149578.html

https://www.azattyq.org/a/1149513.html

https://www.azattyq.org/a/1149240.html

https://www.azattyq.org/a/1148599.html

https://www.azattyq.org/a/1148412.html

Processing page:16

https://www.azattyq.org/a/1148322.html

https://www.azattyq.org/a/1148308.html

https://www.azattyq.org/a/1147746.html

https://www.azattyq.org/a/1147108.html

https://www.azattyq.org/a/1146851.html

https://www.azattyq.org/a/1146669.html

https://www.azattyq.org/a/1146596.html


All Extracted URLs:

https://www.azattyq.org/a/kremlin-interference-2024-elections-eastern-europe-disinformation/33165336.html
https://www.azattyq.org/a/32814254.html
https://www.azattyq.org/a/32805928.html
https://www.azattyq.org/a/russia-independent-media-abroad-emigre-journalism-alive/32800811.h

Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Кремль 2024 жылы сайлауларға араласқанда қолданатын бес амал.txt!
https://www.azattyq.org/a/32814254.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Израиль армиясы БҰҰ-ның Газадағы ғимараты астынан ХАМАС туннелін тапты.txt!
https://www.azattyq.org/a/32805928.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/HRW НеМолчи.kz қорының жетекшісін бейтарап тергеуге шақырды.txt!
https://www.azattyq.org/a/russia-independent-media-abroad-emigre-journalism-alive/32800811.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Ресей журналистикасы өлген жоқ. Шетелге кеткен ресейлік журналистер қалай жұмыс істеп жатыр.txt!
https://www.azattyq.org/a/32795027.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Тоғыз мемлекет палестиналық босқындарға қаржылай көмекті тоқтатты.txt!
https://www.azattyq.org/a/32706241.html
Статья успешно сохр

Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Еуропа нарығына шығу – Қырғызстанға әлі арман.txt!
https://www.azattyq.org/a/27751313.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Стамбулда дүниежүзілік гуманитарлық саммит өтеді.txt!
https://www.azattyq.org/a/kazakhstan_environmental_organization_stopped_to_exist/27652615.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/«Каспий табиғаты» ұйымы қызметін тоқтатпақ.txt!
https://www.azattyq.org/a/kazakhstan_gongo_and_ngo/27568294.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/ҮЕҰ дербес деректерін ашуды міндеттейтін ережеге шағымданды.txt!
https://www.azattyq.org/a/27533357.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Донор елдер Сирияға 10 миллиард доллар көмек бермек.txt!
https://www.azattyq.org/a/27531510.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Лондонда Сирияға донор ел

Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/БҰҰ басшысы донор елдерді кедейлікпен күреске ақшаны аямауға шақырды.txt!
https://www.azattyq.org/a/2162727.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Әлем елдері Пәкістанға көбірек көмек бермекші.txt!
https://www.azattyq.org/a/2133820.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Пәкістанға 500 млн. долларға жуық қаржы жиналды.txt!
https://www.azattyq.org/a/2129760.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Дүниежүзілік банк Пәкістанға 900 млн. доллар ұсынды.txt!
https://www.azattyq.org/a/2111188.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Халықаралық донорлар Қырғызстанға 1,1 миллиард доллар бөледі.txt!
https://www.azattyq.org/a/Kyrgyzstan_International_donors_conference_in_Bishkek/2111167.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Шетелдік донорлар Қырғызстанға

Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Чернобыль апатының зардаптарын еңсеруге 170 млрд $ қажет.txt!
https://www.azattyq.org/a/1163884.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Ана болғысы келетін әйелдерге медицина көмегі шексіз, тек ниетіңіз бен қаржыңыз болса жеткілікті.txt!
https://www.azattyq.org/a/1163086.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Ауғанстандық ресми топ өкілдерінің Лондон конференциясының нәтижелеріне көңілі толып отыр.txt!
https://www.azattyq.org/a/1163050.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Халықаралық қауымдастық Ауғаныстанның алдағы 5 жылда елді қайта құруына 10,5 млрд доллар бөлеміз деп.txt!
https://www.azattyq.org/a/1163014.html
Статья успешно сохранена как /home/alikhan/Desktop/Data/Parsing/Донор/Еуроодақ министрлері Беларус, Ауғанстан, Иран және Ирақ мәселелерін талқылады.txt!
https://www.azattyq.org/a/1162952.html
Статья 

In [None]:
### https://rus.azattyq.org/

In [2]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('li', class_='col-xs-12 col-sm-12 col-md-12 col-lg-12 fui-grid__inner')
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]
    
    return full_urls

In [3]:
def save_to_csv(data_list, csv_file, headers):
    """Saves a list of article details to a CSV file."""
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data_list)
    print(f"CSV файл успешно сохранен в {csv_file}!")


In [6]:
def article_contents(article_url, storage_dir):
    
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        
        # Check for 404 errors or other HTTP status codes
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        
        # Парсинг HTML
        soup = BS(html_content, "html.parser")
        
        # Title extract
        try:
            title_tag = soup.find('h1')
            title = title_tag.get_text(strip=True) if title_tag else "No Title"
        except Exception: 
            title = "Title not found"
        
        
        # Author extract
        try:
            author_tag = soup.find('a', class_='links__item-link')
            author = author_tag.get_text(strip=True) if author_tag else "No Author"
   
        except Exception: 
            author = "Author not found"
        # Extract date published
        
        try:
            date_tag = soup.find('time',pubdate ="pubdate")
            if date_tag and date_tag.has_attr('datetime'):
                date_published = date_tag['datetime']
                date_published_dt = datetime.strptime(date_published, "%Y-%m-%dT%H:%M:%S%z")
                formatted_timestamp = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
            else:
                date_published = "Date not found"
        except Exception: 
            date_published = "Date not found"
    
        # Content extract
        try:
            article_body = soup.find("div", class_="col-xs-12 col-sm-12 col-md-10 col-lg-10 pull-right")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        except Exception: 
            article_text = "Content not found"
        
        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        max_title_length = 100  # Set max length for title
        valid_title = valid_title[:max_title_length]  # Truncate title if it's too long
        
        
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{valid_title}\n\n")
            file.write(article_text)
        
        print("Статья успешно сохранена!")
        
        return {
            "Title": valid_title,
            "Date Published": formatted_timestamp,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
        
    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [7]:
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://rus.azattyq.org/"
    search_url = base_url + "/s?k="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url) 
    html_content = response.text
    
    # Parsing HTML
    soup = BS(html_content, "html.parser")
    
    article_founded = soup.find("span", class_ ="srch-result__results-count")
    articles = article_founded.get_text()

    num_article = re.findall(r'\d+', articles)
    num = int(num_article[0]) 
    
    # Each web-page in site only contains 5 articels, it can be also changed 
    articles_per_page = 10
    pages = math.ceil(num / articles_per_page) 

    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(1, pages + 1):  # Итерируемся от 1 до 5 включительно
        full_url = key_word_url + "&tab=any-content&pi=" + str(page) + "&r=any&pp=10"
        print(f"Processing page:{page}\n")
        # Call the function and get the extracted URLs
        article_urls = extract_urls(base_url, full_url)
    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()


Input key word:Донор
Ссылка по вашему запросу: 
https://www.azattyq.org//s?k=Донор
Чило страниц:16

Processing page:1

https://www.azattyq.org/a/kremlin-interference-2024-elections-eastern-europe-disinformation/33165336.html

https://www.azattyq.org/a/32814254.html

https://www.azattyq.org/a/32805928.html

https://www.azattyq.org/a/russia-independent-media-abroad-emigre-journalism-alive/32800811.html

https://www.azattyq.org/a/32795027.html

https://www.azattyq.org/a/32706241.html

https://www.azattyq.org/a/kazakhsan-jusan-bank-central-asia-west-china/32504577.html

https://www.azattyq.org/a/32498757.html

https://www.azattyq.org/a/kazakhstan-media-review-tasmagambetov-putin-sanction/32384676.html

https://www.azattyq.org/a/32091469.html

Processing page:2

https://www.azattyq.org/a/32015084.html

https://www.azattyq.org/a/russia-ukraine-invasion-six-months-five-things-analysis/32002079.html

https://www.azattyq.org/a/31967325.html

https://www.azattyq.org/a/afghanistan-taliban-legitim

https://www.azattyq.org/a/1161400.html

https://www.azattyq.org/a/1156371.html

https://www.azattyq.org/a/1153502.html

https://www.azattyq.org/a/1150851.html

https://www.azattyq.org/a/1150829.html

https://www.azattyq.org/a/1149578.html

https://www.azattyq.org/a/1149513.html

https://www.azattyq.org/a/1149240.html

https://www.azattyq.org/a/1148599.html

https://www.azattyq.org/a/1148412.html

Processing page:16

https://www.azattyq.org/a/1148322.html

https://www.azattyq.org/a/1148308.html

https://www.azattyq.org/a/1147746.html

https://www.azattyq.org/a/1147108.html

https://www.azattyq.org/a/1146851.html

https://www.azattyq.org/a/1146669.html

https://www.azattyq.org/a/1146596.html


All Extracted URLs:

https://www.azattyq.org/a/kremlin-interference-2024-elections-eastern-europe-disinformation/33165336.html
https://www.azattyq.org/a/32814254.html
https://www.azattyq.org/a/32805928.html
https://www.azattyq.org/a/russia-independent-media-abroad-emigre-journalism-alive/32800811.h

Статья успешно сохранена!
https://www.azattyq.org/a/32814254.html
Статья успешно сохранена!
https://www.azattyq.org/a/32805928.html
Статья успешно сохранена!
https://www.azattyq.org/a/russia-independent-media-abroad-emigre-journalism-alive/32800811.html
Статья успешно сохранена!
https://www.azattyq.org/a/32795027.html
Статья успешно сохранена!
https://www.azattyq.org/a/32706241.html
Статья успешно сохранена!
https://www.azattyq.org/a/kazakhsan-jusan-bank-central-asia-west-china/32504577.html
Статья успешно сохранена!
https://www.azattyq.org/a/32498757.html
Статья успешно сохранена!
https://www.azattyq.org/a/kazakhstan-media-review-tasmagambetov-putin-sanction/32384676.html
Статья успешно сохранена!
https://www.azattyq.org/a/32091469.html
Статья успешно сохранена!
https://www.azattyq.org/a/32015084.html
Статья успешно сохранена!
https://www.azattyq.org/a/russia-ukraine-invasion-six-months-five-things-analysis/32002079.html
Статья успешно сохранена!
https://www.azattyq.org/a/31967325.htm

Статья успешно сохранена!
https://www.azattyq.org/a/Kabul_conference_Karzai_urges_Afghan_forces_be_ready_to_protect_the_country_2014/2104974.html
Статья успешно сохранена!
https://www.azattyq.org/a/UN_peacekeepers_mission_and_the_Osh_events/2095201.html
Статья успешно сохранена!
https://www.azattyq.org/a/Osh_events_damage_calculated/2097958.html
Статья успешно сохранена!
https://www.azattyq.org/a/Kazakhstan_press_digest_china_credit/1997537.html
Статья успешно сохранена!
https://www.azattyq.org/a/1924764.html
Статья успешно сохранена!
https://www.azattyq.org/a/1924055.html
Статья успешно сохранена!
https://www.azattyq.org/a/afghanistan_government_corruption_president_parliament/1920122.html
Статья успешно сохранена!
https://www.azattyq.org/a/1872985.html
Статья успешно сохранена!
https://www.azattyq.org/a/1829065.html
Статья успешно сохранена!
https://www.azattyq.org/a/1815770.html
Статья успешно сохранена!
https://www.azattyq.org/a/1770943.html
Статья успешно сохранена!
https://www.az

AttributeError: 'NoneType' object has no attribute 'keys'

In [None]:
### Informburo https://informburo.kz/new-search?query=Донор

In [56]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('div', class_='uk-width-1-3@m')

    # Extract URLs from <a> tags inside those divs
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]
    
    return full_urls

In [57]:
def save_to_csv(data_list, csv_file, headers):
    """Saves a list of article details to a CSV file."""
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data_list)
    print(f"CSV файл успешно сохранен в {csv_file}!")

In [58]:
def article_contents(article_url, storage_dir):
    
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        
        # Check for 404 errors or other HTTP status codes
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        
        # Парсинг HTML
        soup = BS(html_content, "html.parser")
        
        # Title extract
        try:
            title_tag = soup.find('h1')
            title = title_tag.get_text(strip=True) if title_tag else "No Title"
        except Exception: 
            title = "Title not found"
        
        
        # Author extract
        try:
            author_tag = soup.find("small", class_= "article-meta-authors uk-text-muted")
            author = author_tag.get_text(strip=True) if author_tag else "No Author"
   
        except Exception: 
            author = "Author not found"
        # Extract date published
        
        try:
            date_tag = soup.find('time')
            if date_tag and date_tag.has_attr('datetime'):
                date_published = date_tag['datetime']
                date_published_dt = datetime.strptime(date_published, "%Y-%m-%dT%H:%M:%S%z")
                formatted_timestamp = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
            else:
                date_published = "Date not found"
        except Exception: 
            date_published = "Date not found"
    
        # Content extract
        try:
            article_body = soup.find("div", class_="uk-width-2-3@m uk-width-1-1")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        except Exception: 
            article_text = "Content not found"
        
        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        max_title_length = 100  # Set max length for title
        valid_title = valid_title[:max_title_length]  # Truncate title if it's too long
        
        
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{valid_title}\n\n")
            file.write(article_text)
        
        print("Статья успешно сохранена!")
        
        return {
            "Title": valid_title,
            "Date Published": formatted_timestamp,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
        
    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [60]:
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://informburo.kz"
    search_url = base_url + "/new-search?query="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    pages = 57 # Counting manually how much page usually indicates in the front page. 
    articles_count = 0
    urls_list = []

    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(1, pages + 1):  # Итерируемся от 1 до 5 включительно
        full_url = key_word_url + "&page=" + str(page)
        print(full_url)
        print(f"Processing page:{page}\n")
        article_urls = extract_urls(base_url, full_url)
    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()

Input key word:Донор
Ссылка по вашему запросу: 
https://informburo.kz/new-search?query=Донор
Чило страниц:57

https://informburo.kz/new-search?query=Донор&page=1
Processing page:1

https://informburo.kz/fotoreportazh/zacem-kazaxstancy-sdayut-svoyu-krov-i-skolko-polucayut-donory

https://informburo.kz/stati/transplantaciya-kostnogo-mozga-kak-eta-bezopasnaya-dlya-donora-procedura-mozet-spasti-sotni-pacientov-s-rakom-krovi

https://informburo.kz/stati/skolko-donorov-v-kazaxstane-i-skolko-ziznei-oni-spasli

https://informburo.kz/special/put-donora-bezvozmezdnyx-donacii-krovi-v-astane-stanovitsya-vsyo-bolse

https://informburo.kz/kaz/newskaz/qazaqstanda-qurmetti-donor-tosbelgisi-qaita-tabystalady

https://informburo.kz/stati/put-krovi-otkuda-berutsya-donory-i-kak-preparaty-dlya-perelivaniya-popadayut-v-bolnicy

https://informburo.kz/stati/trebuyutsya-donory-7248.html

https://informburo.kz/novosti/mangistauskaya-oblast-poteryaet-status-regiona-donora-respublikanskogo-byudzheta-v-2023-godu



https://informburo.kz/novosti/rodnye-donora-iz-aktobe-u-kotorogo-izyali-pochki-trebuyut-vyzvat-v-sud-religioveda.html

https://informburo.kz/novosti/arestovannogo-kostanayca-povtorno-etapiruyut-v-astanu-dlya-spaseniya-zhizni-onkobolnoy-sestry.html

https://informburo.kz/novosti/organy-52-letney-zhitelnicy-pavlodara-peresadili-chetverym-kazahstancam.html

https://informburo.kz/novosti/v-vko-ne-hvataet-donorskoy-krovi-6476.html

https://informburo.kz/novosti/kazhdyy-umirayushchiy-ot-travm-dolzhen-soglasitsya-stat-donorom-chtoby-pokryt-deficit-organov.html

https://informburo.kz/novosti/v-avstrii-vypustili-zhurnal-s-krovyu-vich-inficirovannyh-na-stranicah-4025.html

https://informburo.kz/novosti/sotrudniki-conov-strany-sdali-280-litrov-krovi.html

https://informburo.kz/novosti/v-nur-sultane-otkryli-novyj-art-obuekt

https://informburo.kz/novosti/nalogoviki-vernuli-okolo-100-mlrd-tenge-v-byudzhet-posle-proverok-nedropolzovatelej

https://informburo.kz/kaz/omir/uryq-donory-zasandy-tusik-zan

https://informburo.kz/novosti/v-almaty-proveli-unikalnuyu-dlya-kazahstana-operaciyu-po-peresadke-pecheni.html

https://informburo.kz/stati/v-aktau-obezvrezhena-upravlyaemaya-termoyadernaya-bomba-no-eyo-othody-budut-opasny-eshchyo-50-let-13483.html

https://informburo.kz/novosti/peresadku-kostnogo-mozga-tahmine-abyshevoy-ne-sdelali-v-kazahstane-iz-za-devalvacii.html

https://informburo.kz/novosti/pochki-ot-umershego-cheloveka-vpervye-peresadili-bolnym-v-aktobe.html

https://informburo.kz/novosti/dorogie-patsienty-iz-kazahstana-957.html

https://informburo.kz/novosti/mamin-iz-za-pandemii-kolichestvo-operacij-po-transplantacii-organov-sokratilos-v-dva-raza

https://informburo.kz/stati/iscelilsya-muzcina-40-let-bolevsii-vic-eto-vsego-cetvyortyi-podobnyi-slucai

https://informburo.kz/novosti/den-otkrytyh-dverej-po-voprosam-transplantacii-pecheni-i-pochek-projdyot-v-astane

https://informburo.kz/novosti/ne-budut-lecit-ctoby-bystree-zabrat-organy-pocemu-kazaxstancy-ne-resayutsya-na-posmertnoe

https://informburo.kz/stati/mladenec-kotoromu-27-let-v-ssha-poyavilas-na-svet-devochka-zhdavshaya-rozhdeniya-dolshe-vseh-v-mire.html

https://informburo.kz/novosti/oon-prosit-bolee-400-mln-dollarov-dlya-pomoshchi-nepalu-3719.html

https://informburo.kz/stati/my-pobedili-vmeste-kak-prohodyat-igry-chempionov-v-nur-sultane.html

https://informburo.kz/novosti/mne-rk-predstavilo-zakonoproekt-o-povyshenii-nalogovyh-stavok-v-shymkente-i-turkestane.html

https://informburo.kz/novosti/sportivnuyu-shkolu-za-50-mln-tenge-otkryli-na-okraine-astany.html

https://informburo.kz/stati/npo-v-kazahstane-kak-gosudarstvo-vzaimodeystvuet-s-obshchestvennikami-.html

https://informburo.kz/novosti/vrachi-doveli-studentku-iz-astany-do-sostoyaniya-komy-i-brosili-na-proizvol-sudby-mat-bolnoy.html

https://informburo.kz/novosti/mat-devochki-stradayushchey-leykozom-prosit-nazarbaeva-pomoch-v-poluchenii-kvoty-dlya-docheri-27935.html

https://informburo.kz/cards/chem-zanimaetsya-nedavno-sozdannoe-ministerstvo-po-del

https://informburo.kz/kaz/newskaz/qazaqstan-onirleri-respublikalyq-byudzetten-qansa-aqsa-alady-zane-qansa-beredi-infografika

https://informburo.kz/kaz/kogam/kolsai-kolderinin-direktory-ulttyq-sayabaqqa-kelusilerdi-sekteu-tazalyqty-saqtauga-komektesedi

https://informburo.kz/stati/lyudi-kotorye-ne-ispugalis-vospominaniya-o-tragiceskom-yanvare

https://informburo.kz/kaz/newskaz/syucai-numerologiyanyn-bir-turi-qmdb-ukimi-qandai

https://informburo.kz/novosti/zaklyucyonnym-v-ssa-predlozili-sokratit-sroki-v-obmen-na-donorstvo-organov

https://informburo.kz/novosti/v-riddere-ne-budut-povysat-tarify-na-teplo-posle-stroitelstva-zavoda-donora-dlya-tec

https://informburo.kz/novosti/cto-proisxodit-v-ukraine-glavnoe-za-den-4-fevralya

https://informburo.kz/novosti/zitelyam-riddera-vernut-63-mln-tenge-za-nedopostavlennoe-teplo

https://informburo.kz/novosti/na-tec-v-riddere-dalo-sboi-novoe-oborudovanie

https://informburo.kz/stati/golod-razruxa-xolera-a-teper-zemletryasenie-pocemu-siriya-postrada

https://informburo.kz/asem-ditch/old/beyrutty-emdegen-bauyr.html

https://informburo.kz/cards/innovacii-i-finansy-gde-startapu-vzyat-dengi-na-svoyu-ideyu-v-kazahstane.html

https://informburo.kz/novosti/peresadka-organov-spasla-zhizni-shesterym-kazahstancam.html

https://informburo.kz/kaz/balalar-allergiyasy-nege-ayrysha-aupt-drger-kees.html

https://informburo.kz/kaz/it-etnen-as-zrleytn-dmhanalardy-zhabudy-synbapyz-toaeva-hat-zhazan-brizhit-bardo-oryny-klmen-shbat.html

https://informburo.kz/kaz/azastan-europada-balalar-suicidnen-alashy-zhet-eld-atarynda-zhaalytara-sholu.html

https://informburo.kz/pikir/kainar-oljay/zhapon-men-aza-zhayn-men-ashy.html

https://informburo.kz/stati/apostrofy-ili-digrafy-diskussiya-o-latinice-prodolzhaetsya.html

https://informburo.kz/pikir/kainar-oljay/ambash.html

https://informburo.kz/stati/chtoby-stat-policeyskim-pridyotsya-snachala-proyti-test-u-borcov-s-korrupciey.html

https://informburo.kz/new-search?query=Донор&page=39
Processing page:39

https:

https://informburo.kz/novosti/v-sko-sdali-krov-dlya-bolnyh-rakom-detey--5682.html

https://informburo.kz/novosti/v-nepale-mladenca-spasli-iz-pod-zavalov-spustya-sutki-posle-zemletryaseniya-3774.html

https://informburo.kz/novosti/v-kazahstane-perelivanie-bolnym-s-covid-19-plazmy-vyzdorovevshego-cheloveka-okazalos-neeffektivnym-108535.html

https://informburo.kz/stati/kak-vsyo-ustroeno-v-tyan-shanskoy-astronomicheskoy-observatorii-na-bao.html

https://informburo.kz/novosti/u-90-pacientov-kotorye-poluchali-lechenie-immunnoy-plazmoy-sostoyanie-uluchshilos.html

https://informburo.kz/novosti/srednyaya-zarplata-almatinca-sostavlyaet-160-tysyach-tenge.html

https://informburo.kz/novosti/mid-agentstvo-sodeystviya-mezhdunarodnomu-razvitiyu-kazaid-dolzhno-zarabotat-v-sentyabre-.html

https://informburo.kz/stati/atambaev-chego-to-ochen-silno-boitsya-chto-govoryat-eksperty-o-vystupleniyah-prezidenta-kyrgyzstana.html

https://informburo.kz/novosti/voz-obyavil-o-sbore-sredstv-dlya-uskoreniya-razrab

https://informburo.kz/mneniya/dmitriy-mostovoy/feysbuk-lzhivotvoryashchiy-4535.html

https://informburo.kz/novosti/ustroivshey-samopodzhog-u-genprokuratury-mayre-rysmanovoy-nuzhna-krov.html

https://informburo.kz/novosti/v-pavlodare-umershiy-donor-spas-zhizn-tryoh-chelovek.html

https://informburo.kz/novosti/miss-kostanay-otkazalas-ot-intervyu-i-poslala-vseh-k-menedzheru.html

https://informburo.kz/novosti/voz-uchredila-fond-dlya-finansovoy-podderzhki-zdravoohraneniya-106947.html

https://informburo.kz/novosti/vice-ministr-zdravoohraneniya-rasskazal-pochemu-kazahstancam-ne-hvataet-kvot-na-lechenie-za-rubezhom.html

https://informburo.kz/novosti/vzryv-v-dome-vblizi-kostanaya-unyos-zhizn-eshchyo-odnogo-cheloveka.html

https://informburo.kz/novosti/minzdrav-dal-nadezhdu-studentke-iz-astany-vpavshey-v-komu-posle-vizita-k-vracham.html

https://informburo.kz/novosti/nursultan-nazarbaev-skorbit-po-konchine-shimona-peresa.html

https://informburo.kz/stati/covid-2019-2024-itogi-pyatiletnei-borb

Статья успешно сохранена!
https://informburo.kz/stati/transplantaciya-kostnogo-mozga-kak-eta-bezopasnaya-dlya-donora-procedura-mozet-spasti-sotni-pacientov-s-rakom-krovi
Статья успешно сохранена!
https://informburo.kz/stati/skolko-donorov-v-kazaxstane-i-skolko-ziznei-oni-spasli
Статья успешно сохранена!
https://informburo.kz/special/put-donora-bezvozmezdnyx-donacii-krovi-v-astane-stanovitsya-vsyo-bolse
Статья успешно сохранена!
https://informburo.kz/kaz/newskaz/qazaqstanda-qurmetti-donor-tosbelgisi-qaita-tabystalady
Статья успешно сохранена!
https://informburo.kz/stati/put-krovi-otkuda-berutsya-donory-i-kak-preparaty-dlya-perelivaniya-popadayut-v-bolnicy
Статья успешно сохранена!
https://informburo.kz/stati/trebuyutsya-donory-7248.html
Статья успешно сохранена!
https://informburo.kz/novosti/mangistauskaya-oblast-poteryaet-status-regiona-donora-respublikanskogo-byudzheta-v-2023-godu
Статья успешно сохранена!
https://informburo.kz/kaz/an-donory-bolu-aupt-me-maman-kees.html
Статья успешно

Статья успешно сохранена!
https://informburo.kz/novosti/koronavirus-v-kazahstane-situaciya-na-13-maya-live-.html
Статья успешно сохранена!
https://informburo.kz/novosti/ehavshiy-na-operaciyu-po-peresadke-pochki-zhitel-karagandinskoy-oblasti-pogib-v-dtp.html
Статья успешно сохранена!
https://informburo.kz/novosti/oon-mezhdunarodnoe-soobshchestvo-vydelit-7-mlrd-dollarov-dlya-pomoshchi-sirii.html
Статья успешно сохранена!
https://informburo.kz/sport/old/marina-cheyshvili-ko-mne-tolko-nedavno-prishlo-osoznanie-togo-chto-syn-vse-1012.html
Статья успешно сохранена!
https://informburo.kz/novosti/organizaciyu-po-poisku-donorov-sozdayot-zhitelnica-pavlodara-s-peresazhennoy-pochkoy.html
Статья успешно сохранена!
https://informburo.kz/novosti/kazaxstan-i-tyurkskie-strany-sozdadut-edinuyu-informacionnuyu-sistemu-donorov
Статья успешно сохранена!
https://informburo.kz/novosti/aktyubinskie-vrachi-reshili-posmertno-stat-donorami-organov.html
Статья успешно сохранена!
https://informburo.kz/novosti/rod

Статья успешно сохранена!
https://informburo.kz/stati/zhizn-s-vesom-v-30-kg-zhitelnicu-almatinskoy-oblasti-spasli-ot-smerti-peresadiv-donorskuyu-pechen.html
Статья успешно сохранена!
https://informburo.kz/cards/eko-donorstvo-ili-surrogatnoe-materinstvo-kak-v-kazahstane-reshayut-problemy-besplodnyh-par.html
Статья успешно сохранена!
https://informburo.kz/kaz/myt-donorlyy-zge-adama-azayzdy-aldyru-shn-ne-odan-bas-tartu-shn-ne-steu-kerek.html
Статья успешно сохранена!
https://informburo.kz/kaz/newskaz/transplantaciya-zasauga-bola-ma-gulamalar-kenesinin-patuasy
Статья успешно сохранена!
https://informburo.kz/novosti/minzdrav-vystupil-protiv-predlozheniya-deputata-razreshit-prodazhu-pochek-dlya-transplantacii
Статья успешно сохранена!
https://informburo.kz/cards/privivki-donorstvo-i-zapret-na-vykladku-sigaret-chto-izmenilos-s-novym-kodeksom-o-zdorove.html
Статья успешно сохранена!
https://informburo.kz/novosti/odin-celovek-mozet-spasti-sem-ziznei-vraci-prosyat-vernut-prezumpciyu-soglasiya-na

Статья успешно сохранена!
https://informburo.kz/novosti/senat-vnyos-izmeneniya-v-zakonoproekt-o-transfertax-iz-byudzeta-i-vernul-ego-v-mazilis
Статья успешно сохранена!
https://informburo.kz/kaz/kogam/qazaq-seziresine-nege-aielder-kirmeidi-zane-islamdy-zana-zamanga-sai-zangyrtu-mumkin-be
Статья успешно сохранена!
https://informburo.kz/stati/pocemu-v-kazaxstane-uvelicivaetsya-kolicestvo-onkobolnyx-i-stoit-li-exat-na-lecenie-za-granicu
Статья успешно сохранена!
https://informburo.kz/novosti/prezident-kyrgyzstana-predlagaet-obmenyat-vnesnii-dolg-respubliki-na-zelyonye-iniciativy
Статья успешно сохранена!
https://informburo.kz/novosti/na-cto-tratit-dengi-fond-qazaqstan-xalqyna
Статья успешно сохранена!
https://informburo.kz/stati/tragediya-v-sovi-kak-tayushhie-ledniki-kavkaza-udarili-po-gruzinskomu-rayu-na-zemle
Статья успешно сохранена!
https://informburo.kz/novosti/v-ssa-proveli-pervuyu-v-mire-operaciyu-po-peresadke-celogo-glaza
Статья успешно сохранена!
https://informburo.kz/novosti/fon

Статья успешно сохранена!
https://informburo.kz/interview/primet-li-kazaxstan-otdelnyi-zakon-po-borbe-s-torgovlei-lyudmi
Статья успешно сохранена!
https://informburo.kz/stati/eksperty-pryamoi-ugrozy-kazaxstanu-ot-talibana-net-no-pugaet-religioznyi-ekstremizm-i-eksport-ideologii
Статья успешно сохранена!
https://informburo.kz/stati/ozivit-mamontov-ucyonye-iz-ssa-pytayutsya-vernut-na-zemlyu-zivotnyx-vymersix-tysyaci-let-nazad
Статья успешно сохранена!
https://informburo.kz/kaz/elaynasy/saltanat-baiqosqarova-reproduktolog-26-zylda-eko-adisimen-25-mynnan-astam-bala-duniege-keldi
Статья успешно сохранена!
https://informburo.kz/stati/bescennye-druzya-celoveka-zivotnye-za-kotoryx-platyat-milliony
Статья успешно сохранена!
https://informburo.kz/stati/pocemu-masiny-iz-rossii-okazalis-vne-zakona-v-kazaxstane-i-kak-eto-ispravit
Статья успешно сохранена!
https://informburo.kz/stati/zitelnica-argentiny-izbavilas-ot-vic-bez-lekarstv-eto-vtoroi-takoi-slucai-v-istorii
Статья успешно сохранена!
https:/

Статья успешно сохранена!
https://informburo.kz/stati/ucyonym-udalos-sozdat-celoveceskii-embrion-iz-odnoi-stvolovoi-kletki-kak-eto-vozmozno-a-glavnoe-zacem
Статья успешно сохранена!
https://informburo.kz/novosti/cto-proicxodit-v-ukraine-glavnoe-k-utru-21-iyunya
Статья успешно сохранена!
https://informburo.kz/novosti/tokaev-na-forume-volontyorov-miloserdie-kacestvo-prisushhee-kazaxskomu-narodu
Статья успешно сохранена!
https://informburo.kz/kaz/tusyndirme/qazaqstandyqtar-paterinin-texnikalyq-pasportyn-auystyruy-tiis-ol-usin-ne-qazet
Статья успешно сохранена!
https://informburo.kz/novosti/spartakiada-dlya-detei-kotorye-naxodyatsya-v-remissii-posle-onkologiceskix-zabolevanii-prosla-v-astane
Статья успешно сохранена!
https://informburo.kz/novosti/cto-proicxodit-v-ukraine-glavnoe-k-utru-26-iyulya
Статья успешно сохранена!
https://informburo.kz/novosti/komissiya-ustanovila-veroyatnyi-istocnik-inficirovaniya-vic-v-cgkb-almaty
Статья успешно сохранена!
https://informburo.kz/kaz/tusyndirme/tung

Статья успешно сохранена!
https://informburo.kz/kaz/balalar-allergiyasy-nege-ayrysha-aupt-drger-kees.html
Статья успешно сохранена!
https://informburo.kz/kaz/it-etnen-as-zrleytn-dmhanalardy-zhabudy-synbapyz-toaeva-hat-zhazan-brizhit-bardo-oryny-klmen-shbat.html
Статья успешно сохранена!
https://informburo.kz/kaz/azastan-europada-balalar-suicidnen-alashy-zhet-eld-atarynda-zhaalytara-sholu.html
Статья успешно сохранена!
https://informburo.kz/pikir/kainar-oljay/zhapon-men-aza-zhayn-men-ashy.html
Статья успешно сохранена!
https://informburo.kz/stati/apostrofy-ili-digrafy-diskussiya-o-latinice-prodolzhaetsya.html
Статья успешно сохранена!
https://informburo.kz/pikir/kainar-oljay/ambash.html
Статья успешно сохранена!
https://informburo.kz/stati/chtoby-stat-policeyskim-pridyotsya-snachala-proyti-test-u-borcov-s-korrupciey.html
Статья успешно сохранена!
https://informburo.kz/novosti/samolyot-aviakompanii-bek-air-poterpel-krushenie-v-v-almatinskoy-oblasti.html
Статья успешно сохранена!
https://

Статья успешно сохранена!
https://informburo.kz/stati/koronakrizis-budet-samym-razoritelnym-za-stoletie-pochemu-ssha-postradayut-menshe-drugih.html
Статья успешно сохранена!
https://informburo.kz/novosti/po-faktu-chp-s-obvalom-na-rudnike-akmolinskoy-oblasti-nachali-proverku.html
Статья успешно сохранена!
https://informburo.kz/stati/ya-ne-hochu-detey-monologi-kazahstanskih-chayldfri.html
Статья успешно сохранена!
https://informburo.kz/novosti/parlamentskie-vybory-v-kyrgyzstane-naznachili-na-20-dekabrya-2020-goda-112923.html
Статья успешно сохранена!
https://informburo.kz/stati/byudzhet-almaty-v-2020-godu-vyrastet-na-37-iz-chego-formiruyutsya-dohody-goroda-i-na-chto-ih-potratyat.html
Статья успешно сохранена!
https://informburo.kz/novosti/kazahstan-vtroe-uvelichil-rashody-na-borbu-s-bednostyu.html
Статья успешно сохранена!
https://informburo.kz/novosti/nazarbaev-ekonomika-almaty-v-pyat-raz-prevyshaet-vvp-kyrgyzstana-i-tadzhikistana.html
Статья успешно сохранена!
https://informburo.kz/car

Статья успешно сохранена!
https://informburo.kz/stati/otcy-i-deti-parlamentskie-partii-razglyadeli-v-strane-obostrenie-konflikta-pokoleniy.html
Статья успешно сохранена!
https://informburo.kz/stati/kazashka-zhdyot-v-irake-svoego-spasitelya-9151.html
Статья успешно сохранена!
https://informburo.kz/novosti/kogda-nachnyotsya-stroitelstvo-bakad-rasskazal-zhenis-kasymbek.html
Статья успешно сохранена!
https://informburo.kz/novosti/v-kazahstane-naschitali-okolo-polumilliona-bezrabotnyh-.html
Статья успешно сохранена!
https://informburo.kz/novosti/v-astane-kardiohirurgi-vpervye-v-mire-implantirovali-besprovodnuyu-sistemu-dlya-podderzhki-serdca.html
Статья успешно сохранена!
https://informburo.kz/novosti/dariga-nazarbaeva-ya-za-to-chtoby-upravleniya-obrazovaniya-v-kazahstane-zakryli.html
Статья успешно сохранена!
https://informburo.kz/special/nurlan-nogaev-est-chyotkoe-videnie-togo-chto-predstoit-sdelat-v-atyrauskoy-oblasti.html
Статья успешно сохранена!
https://informburo.kz/novosti/nazarbaev

Статья успешно сохранена!
https://informburo.kz/novosti/bolnicy-vmesto-restoranov-i-barov-budet-stroit-baybek-v-almaty.html
Статья успешно сохранена!
https://informburo.kz/novosti/v-ssha-snova-zayavlyayut-o-zhelanii-vyjti-iz-voz
Статья успешно сохранена!
https://informburo.kz/kaz/newskaz/usaq-apatynan-aman-qalgandar-usin-115-adam-qan-tapsyrdy
Статья успешно сохранена!
https://informburo.kz/novosti/kostanaycy-sdayut-krov-dlya-pyatiletney-devochki-kotoraya-mesyac-lezhit-v-kome.html
Статья успешно сохранена!
CSV файл успешно сохранен в /home/alikhan/Desktop/Data/Parsing/Донор/Донор.csv!


In [None]:
### 24.kz


In [65]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('div', class_='results-item')

    # Extract URLs from <a> tags inside those divs
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]
    
    return full_urls

In [66]:
def save_to_csv(data_list, csv_file, headers):
    """Saves a list of article details to a CSV file."""
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data_list)
    print(f"CSV файл успешно сохранен в {csv_file}!")

In [67]:
def article_contents(article_url, storage_dir):
    
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        
        # Check for 404 errors or other HTTP status codes
        if response.status_code != 200:key_word_input = input("Input key word:")
    base_url = "https://24.kz"
    search_url = base_url + "/search?searchword="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string + "&searchphrase=all"

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    num_tag = soup.find("strong")
    num = num_tag.get_text(strip = True)

    num_conv = re.findall(r"\d+",num)
    articles = int(num_conv[0])
    
    pages = articles/20
    
    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(0, int(pages) ):  # Итерируемся от 1 до 5 включительно
        tag = page * 20
        full_url = key_word_url + "&start=" + str(tag)
        print(full_url)
        print(f"Processing page:{page}\n")
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        
        # Парсинг HTML
        soup = BS(html_content, "html.parser")
        
        # Title extract
        try:
            title_tag = soup.find('h1')
            title = title_tag.get_text(strip=True) if title_tag else "No Title"
        except Exception: 
            title = "Title not found"
        
        
        # Author extract
        try:
            author_tag = soup.find("small", class_= "article-meta-authors uk-text-muted")
            author = author_tag.get_text(strip=True) if author_tag else "No Author"
   
        except Exception: 
            author = "Author not found"
        # Extract date published
        formatted_date = "Date not found"
        try:
            date_tags = soup.find_all('li', class_="entry__meta-date")
            if date_tags:
                first_date_tag = date_tags[0]
                date_text = first_date_tag.get_text(strip=True)

                # Parse the date (assuming the format "dd.mm.yyyy, HH:MM")
                date_parsed = datetime.strptime(date_text, "%d.%m.%Y, %H:%M")

                # Convert to desired format
                formatted_date = date_parsed.strftime("%Y-%m-%d %H:%M:%S")
            else:
                formatted_date = "Date not found"
        except ValueError:
            formatted_date = "Invalid date format"
            
        try:
            article_body = soup.find("div", class_="entry__article")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        except Exception:
            article_text = "Content not found"
        
        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        max_title_length = 100  # Set max length for title
        valid_title = valid_title[:max_title_length]  # Truncate title if it's too long
        
        
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{valid_title}\n\n")
            file.write(article_text)
        
        print("Статья успешно сохранена!")
        
        return {
            "Title": valid_title,
            "Date Published": formatted_date,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
        
    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [69]:
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://24.kz"
    search_url = base_url + "/search?searchword="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string + "&searchphrase=all"

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    num_tag = soup.find("strong")
    num = num_tag.get_text(strip = True)

    num_conv = re.findall(r"\d+",num)
    articles = int(num_conv[0])
    
    pages = articles/20
    
    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(0, int(pages) ):  # Итерируемся от 1 до 5 включительно
        tag = page * 20
        full_url = key_word_url + "&start=" + str(tag)
        print(full_url)
        print(f"Processing page:{page}\n")
        article_urls = extract_urls(base_url, full_url)

    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        
        try:
            article_body = soup.find("div", class_="entry__article")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        except Exception:
            article_text = "Content not found"
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()

Input key word:Донор
Ссылка по вашему запросу: 
https://24.kz/search?searchword=Донор&searchphrase=all
Чило страниц:5.0

https://24.kz/search?searchword=Донор&searchphrase=all&start=0
Processing page:0

https://24.kz/kz/zha-aly-tar/densaulyk/item/688583-eldegi-200-my-a-zhuy-balany-k-ru-abileti-b-zyl-an

https://24.kz/kz/zha-aly-tar/densaulyk/item/688324-lemde-densauly-sa-tau-salasynda-zhi-ke-inen-oldanylyp-keledi

https://24.kz/kz/zha-aly-tar/densaulyk/item/687674-a-tauda-skeriler-sha-apatynan-zardap-shekkenderge-an-tapsyrdy

https://24.kz/kz/zha-aly-tar/densaulyk/item/687647-t-rkistan-oblysynda-23-adamny-b-jregi-auystyryldy

https://24.kz/kz/zha-aly-tar/kogam/item/687625-eki-k-nni-ishinde-1-5-my-nan-astam-adam-800-litr-an-tapsyrdy

https://24.kz/kz/zha-aly-tar/kogam/item/687108-sha-apatynan-zardap-shekkender-shin-astanaly-tar-an-tapsyryp-zhatyr

https://24.kz/kz/zha-aly-tar/densaulyk/item/687055-astana-t-r-yndary-a-tauda-y-ue-apatynan-zardap-shekkenderge-an-tapsyryp-zhatyr

https://24

https://24.kz/kz/zha-aly-tar/lemde/item/630606-aral-ta-dyryna-lem-nazaryn-audaru

https://24.kz/kz/zha-aly-tar/ekonomika/item/627254-8-ajda-aza-stan-byudzhetine-8-6-trln-t-sti

https://24.kz/kz/zha-aly-tar/bilim-zh-ne-ylym/item/627091-almatyda-studentter-festivali-tip-zhatyr

https://24.kz/kz/zha-aly-tar/lemde/item/626918-ispaniyany-shirkeui-donorly-ty-oldajdy

https://24.kz/kz/zha-aly-tar/lemde/item/626908-t-rkiyany-din-isteri-bas-armasy-a-za-donorly-yn-ptajdy

https://24.kz/kz/zha-aly-tar/lemde/item/626791-t-rkiyada-30-my-nau-as-transplantatsiya-kezeginde-t-r

https://24.kz/kz/zha-aly-tar/lemde/item/626742-europada-transplantatsiya-bojynsha-ispaniya-k-sh-bastap-t-r

https://24.kz/kz/zha-aly-tar/kogam/item/626740-elimizde-130-adam-zh-rek-transplantatsiyasyna-m-tazh

https://24.kz/kz/zha-aly-tar/kogam/item/626738-aza-standa-m-jittik-donor-tapshy

https://24.kz/kz/zha-aly-tar/lemde/item/626459-lybritaniyada-y-mindetti-a-za-donorly-y-turaly-za

https://24.kz/kz/zha-aly-tar/kogam/item/626

Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/densaulyk/item/688324-lemde-densauly-sa-tau-salasynda-zhi-ke-inen-oldanylyp-keledi
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/densaulyk/item/687674-a-tauda-skeriler-sha-apatynan-zardap-shekkenderge-an-tapsyrdy
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/densaulyk/item/687647-t-rkistan-oblysynda-23-adamny-b-jregi-auystyryldy
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/kogam/item/687625-eki-k-nni-ishinde-1-5-my-nan-astam-adam-800-litr-an-tapsyrdy
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/kogam/item/687108-sha-apatynan-zardap-shekkender-shin-astanaly-tar-an-tapsyryp-zhatyr
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/densaulyk/item/687055-astana-t-r-yndary-a-tauda-y-ue-apatynan-zardap-shekkenderge-an-tapsyryp-zhatyr
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/kogam/item/686663-bejbit-k-nni-batyrlary
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/lemde/item/686

Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/lemde/item/637887-tayau-shy-ysta-akhual-ushy-ty
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/kogam/item/637813-astana-t-r-yny-80-zhyldy-tarikhy-bar-k-likterdi-zhinap-zh-r
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/lemde/item/636963-b-palestinaly-bos-yndar-zh-nindegi-agenttigine-atysty-zhanzhal-ushy-yp-barady
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/lemde/item/636777-my-da-an-palestinaly-ash-rsa-khalde
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/kogam/item/636613-elimizde-donorly-an-a-tapshyly-tuyndauy-m-mkin
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/lemde/item/636055-ispaniya-a-za-auystyrudan-k-sh-bastady
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/densaulyk/item/635195-k-kshetauda-volonterler-an-tapsyru-aktsiyasyna-atysty
Статья успешно сохранена!
https://24.kz/kz/zha-aly-tar/kogam/item/634995-men-donormyn-an-tapsyru-aktsiyasy-tti
Статья успешно сохранена!
http

In [None]:
### Inbusiness-kz.rus

In [52]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url) # It is the default
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all("div", class_='catitems')
    urls = []
    for div in divs:
        anchors = div.find_all('a', href=True)
        for anchor in anchors:
            urls.append(anchor['href'])

    full_urls = [base_url.rstrip('/') + url for url in urls]
    
    return full_urls

In [53]:

# Фильтрация данных перед сохранением в CSV
def save_to_csv(data_list, csv_file, headers):
    data_list = [row for row in data_list if row is not None]  # Убираем None
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data_list)
    print(f"CSV файл успешно сохранен в {csv_file}!")


In [54]:
def article_contents(article_url, storage_dir):
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        soup = BS(html_content, "html.parser")

        # Title
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # Date
        date_tag = soup.find("time")
        if date_tag and date_tag.has_attr('datetime'):
            date_published = date_tag['datetime']
            date_published_dt = datetime.strptime(date_published, "%Y-%m-%dT%H:%M:%S%z")
            formatted_timestamp = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
        else:
            formatted_timestamp = "Date not found"

        # Author
        author_tag = soup.find("div", class_="author")
        author = author_tag.get_text(strip=True) if author_tag else "No Author"

        # Content
        article_body = soup.find("div", class_="text")
        article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"

        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)[:100]
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{valid_title}\n\n{article_text}")

        print("Статья успешно сохранена!")

        return {
            "Title": valid_title,
            "Date Published": formatted_timestamp,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
    except Exception as e:
        print(f"An unexpected error occurred for URL {article_url}: {e}")
        return {
            "Title": "Error",
            "Date Published": "Error",
            "Author": "Error",
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": f"Error: {e}"
        }


In [55]:
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://inbusiness.kz"
    search_url = base_url + "/kz/search?q="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string 
    
    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    articles_tag = soup.find("p", class_= "descr")
    article = articles_tag.get_text(strip = True)
    articles = re.findall(r"\d+",article)
    articles_num = int(articles[0])
    articles_per_page = 18
    pages = math.ceil(articles_num  / articles_per_page) 

    print(f"Чило страниц:{pages}\n")
    print(f"Найдено публикаций:{articles_num}\n")
    
    urls_list = []  # Initialize the list
    

    for page in range(1, pages + 1):  # Итерируемся от 1 до 5 включительно
        full_url = key_word_url + "&filter=news&page=" + str(page)
        print(full_url)
        print(f"Processing page:{page}\n")
        article_urls = extract_urls(base_url, full_url)
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()

Input key word:Донор
Ссылка по вашему запросу: 
https://inbusiness.kz/kz/search?q=Донор
Чило страниц:6

Найдено публикаций:105

https://inbusiness.kz/kz/search?q=Донор&filter=news&page=1
Processing page:1

https://inbusiness.kz/kz/news/antikor-mugalimderge-beriletin-kansha-tengenin-guli-para-bolyp-eseptelmejtin-ajtty

https://inbusiness.kz/kz/news/majmyl-sheshegi-zhaanga-kauip-tondirip-dunieni-sharpuy-mumkin-be

https://inbusiness.kz/kz/news/biyl-el-byudzhetine-kansha-trillion-tengenin-salygy-tusti

https://inbusiness.kz/kz/news/kostanajda-suga-konamyn-dep-tonkerilip-tusken-ushak-biraz-bylyktyn-betin-ashty

https://inbusiness.kz/kz/news/astanany-kezinde-kujreuden-ne-kutkaryp-kalgany-anyktaldy

https://inbusiness.kz/kz/news/kytaj-men-kazakstannyn-halykaralyk-zhane-onirlik-isterdegi-ustanymy-uksas

https://inbusiness.kz/kz/news/elimizde-er-azamattardyn-shaueti-zhetispejdi

https://inbusiness.kz/kz/news/ortalyk-aziyada-zhana-munajly-el-pajda-boldy

https://inbusiness.kz/kz/news/ukraina-az

https://inbusiness.kz/kz/news/donor-dauy

https://inbusiness.kz/kz/news/karzhylyk-astananyn-byudzhettik-procesteri-ashyk-emes

https://inbusiness.kz/kz/news/donor-kala

https://inbusiness.kz/kz/news/agris-prejmanis-shikizattyk-sektordagy-arbir-ondiristik-tiz

https://inbusiness.kz/kz/news/grigorij-marchenko-«almatyga-zhaksy-akim-bujyrmaj-zhur»

https://inbusiness.kz/kz/news/kogamdyk-pikirdin-indikatory-adebiet-pen-madenietti-trend-s

https://inbusiness.kz/kz/news/myrzakeldi-kemel-maktashylykta-statistikany-dalita-bermeu-k

https://inbusiness.kz/kz/news/ministrlik-kazakstandyktardan-oz-agzalarynyn-«bolashagyn»

https://inbusiness.kz/kz/news/elimiz-elorda-kunin-tojlauda

https://inbusiness.kz/kz/news/olgender-omir-syjlasyn

https://inbusiness.kz/kz/news/recipient-aman-donor-kajtys-boldy

https://inbusiness.kz/kz/news/g20-elderinin-karzhygerleri-baden-badende-bas-kosty

https://inbusiness.kz/kz/news/«bizdin-bajlygymyz-–-gylym-men-bilikti-kadr»

https://inbusiness.kz/kz/news/majit-transpla

Статья успешно сохранена!
https://inbusiness.kz/kz/news/majmyl-sheshegi-zhaanga-kauip-tondirip-dunieni-sharpuy-mumkin-be
Статья успешно сохранена!
https://inbusiness.kz/kz/news/biyl-el-byudzhetine-kansha-trillion-tengenin-salygy-tusti
Статья успешно сохранена!
https://inbusiness.kz/kz/news/kostanajda-suga-konamyn-dep-tonkerilip-tusken-ushak-biraz-bylyktyn-betin-ashty
Статья успешно сохранена!
https://inbusiness.kz/kz/news/astanany-kezinde-kujreuden-ne-kutkaryp-kalgany-anyktaldy
Статья успешно сохранена!
https://inbusiness.kz/kz/news/kytaj-men-kazakstannyn-halykaralyk-zhane-onirlik-isterdegi-ustanymy-uksas
Статья успешно сохранена!
https://inbusiness.kz/kz/news/elimizde-er-azamattardyn-shaueti-zhetispejdi
Статья успешно сохранена!
https://inbusiness.kz/kz/news/ortalyk-aziyada-zhana-munajly-el-pajda-boldy
Статья успешно сохранена!
https://inbusiness.kz/kz/news/ukraina-azerbajzhannan-komek-surady
Статья успешно сохранена!
https://inbusiness.kz/kz/news/kazakstannyn-araldy-kajta-tiriltetin-

Статья успешно сохранена!
https://inbusiness.kz/kz/news/adam-agzasyn-et-dep-satkan
Статья успешно сохранена!
https://inbusiness.kz/kz/news/biznesti-damytu-ushin-ukimet-salyktan-bas-tartpak
Статья успешно сохранена!
https://inbusiness.kz/kz/news/notr-damga-kujylgan-kumandi-milliard
Статья успешно сохранена!
https://inbusiness.kz/kz/news/zhana-basshynyn-sayasatymen-duniezhuzilik-banktin-alemdik-ykpaly-azayuy-mumkin
Статья успешно сохранена!
https://inbusiness.kz/kz/news/kytaj-buu-ny-satyp-aldy
Статья успешно сохранена!
https://inbusiness.kz/kz/news/izrailde-zhasalgan-kurylgy-kazakstan-azamatynyn-zhuregine-ornatyldy
Статья успешно сохранена!
https://inbusiness.kz/kz/news/surrogatpen-tol-aludyn-tiimdiligi-kop
Статья успешно сохранена!
https://inbusiness.kz/kz/news/buu-aralga-donor-kerek
Статья успешно сохранена!
https://inbusiness.kz/kz/news/alemdik-bank-algash-ret-adam-kapitaly-indeksin-zhariyalady
Статья успешно сохранена!
https://inbusiness.kz/kz/news/kazakstanda-bir-adamnyn-zhuregi-eki

In [None]:
### Orda.kz

In [49]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Path to ChromeDriver
chrome_driver_path = "/usr/bin/chromedriver"

# Initialize WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Open the website
driver.get("https://orda.kz/search-results.html?q=%D0%94%D0%BE%D0%BD%D0%BE%D1%80#gsc.tab=0&gsc.q=%D0%94%D0%BE%D0%BD%D0%BE%D1%80&gsc.page=1")

# Wait for results to load
WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "gsc-webResult"))
)

# Scroll to ensure all elements are loaded
results = driver.find_elements(By.CLASS_NAME, "gsc-webResult")
for result in results:
    ActionChains(driver).move_to_element(result).perform()
time.sleep(2)

# Extract links
article_links = []
for idx, result in enumerate(results):
    try:
        # Try primary selector
        link = result.find_element(By.CSS_SELECTOR, "a.gs-title").get_attribute("href")
    except:
        # Fallback: Extract any <a> tag
        try:
            link = result.find_element(By.TAG_NAME, "a").get_attribute("href")
        except:
            link = None

    # Debugging: Print each result and link
    print(f"Result {idx + 1} HTML:")
    print(result.get_attribute("outerHTML"))
    print(f"Extracted Link: {link}")

    # Filter valid article links
    if link and "/tag/" not in link and ".pdf" not in link:
        article_links.append(link)
    else:
        print(f"Skipped Link: {link}")

# Remove duplicates
article_links = list(set(article_links))

# Print results
print(f"Total Links Extracted: {len(article_links)}")
print("Filtered Article Links:")
for link in article_links:
    print(link)

# Close the browser
driver.quit()


Result 1 HTML:
<div class="gsc-results gsc-webResult"><div class="gsc-expansionArea"><div class="gsc-webResult gsc-result"><div class="gs-webResult gs-result"><div class="gsc-thumbnail-inside"><div class="gs-title"><a class="gs-title" href="https://orda.kz/posmertnyj-donor-geroj-o-borbe-teh-chja-zhizn-zavisit-ot-peresadki-organov-394560/" target="_self" dir="ltr" data-cturl="https://www.google.com/url?client=internal-element-cse&amp;cx=15fbbd8537dec4af8&amp;q=https://orda.kz/posmertnyj-donor-geroj-o-borbe-teh-chja-zhizn-zavisit-ot-peresadki-organov-394560/&amp;sa=U&amp;ved=2ahUKEwiA2v6dt_iKAxUqJBAIHWR-D3EQFnoECAMQAg&amp;usg=AOvVaw37yefvWIhL99M8WmpGtSwq&amp;fexp=72821495,72821494" data-ctorig="https://orda.kz/posmertnyj-donor-geroj-o-borbe-teh-chja-zhizn-zavisit-ot-peresadki-organov-394560/">«Посмертный <b>донор</b> — герой»: о борьбе тех, чья жизнь зависит ...</a></div></div><div class="gsc-url-top"><div class="gs-bidi-start-align gs-visibleUrl gs-visibleUrl-short" dir="ltr">orda.kz</d

Result 10 HTML:
<div class="gsc-webResult gsc-result"><div class="gs-webResult gs-result"><div class="gsc-thumbnail-inside"><div class="gs-title"><a class="gs-title" href="https://orda.kz/uploads/sites/2/2023/04/cets_186.docx.pdf" target="_self" dir="ltr" data-cturl="https://www.google.com/url?client=internal-element-cse&amp;cx=15fbbd8537dec4af8&amp;q=https://orda.kz/uploads/sites/2/2023/04/cets_186.docx.pdf&amp;sa=U&amp;ved=2ahUKEwiA2v6dt_iKAxUqJBAIHWR-D3EQFnoECAoQAQ&amp;usg=AOvVaw0HOK_CUPFNj--Dwwou-XrR&amp;fexp=72821495,72821494" data-ctorig="https://orda.kz/uploads/sites/2/2023/04/cets_186.docx.pdf">Дополнительный протокол к Конвенции о правам человека и ...</a></div></div><div class="gsc-url-top"><div class="gs-bidi-start-align gs-visibleUrl gs-visibleUrl-short" dir="ltr">orda.kz</div><div class="gs-bidi-start-align gs-visibleUrl gs-visibleUrl-long" dir="ltr" style="word-break:break-all;">https://orda.kz/uploads/sites/2/2023/04/cets_186.docx.pdf</div><div class="gs-bidi-start-align

In [None]:
### https://aqparat.info/search?text=%D0%94%D0%BE%D0%BD%D0%BE%D1%80

In [95]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup as BS


In [142]:
def extract_urls(search_url):
    try:
        # Path to ChromeDriver
        chrome_driver_path = "/usr/bin/chromedriver"

        # Initialize WebDriver
        service = Service(chrome_driver_path)
        driver = webdriver.Chrome(service=service)

        # Open the website
        driver.get(search_url)

        # Wait for the page to load
        driver.implicitly_wait(10)

        # Get the page source after rendering
        html_content = driver.page_source

        # Use BeautifulSoup for parsing
        soup = BS(html_content, "html.parser")
        articles = soup.find_all("article", class_="hentry entry xfolkentry gradient")

        # Extract links
        base_url = "https://aqparat.info"
        links = [
            base_url + article.find("a")["href"] if article.find("a")["href"].startswith("/")
            else article.find("a")["href"]
            for article in articles if article.find("a")
        ]

        return links

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Close the browser
        if 'driver' in locals():
            driver.quit()

# Call the function


In [143]:
def save_to_csv(data_list, csv_file, headers):
    # Filter out None or invalid entries
    valid_data = [entry for entry in data_list if isinstance(entry, dict)]

    # Write data to CSV
    with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(valid_data)
    print(f"CSV файл успешно сохранен в {csv_file}!")

In [144]:
def article_contents(article_url, storage_dir):
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        soup = BS(html_content, "html.parser")

        # Title
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # Date
        date_tag = soup.find("span",itemprop="datePublished")
        date = date_tag.get_text()

        if date!= "No Date":
            try:
                date_published_dt = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z")
                date_published = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
            except ValueError as e:
                print(f"Error parsing date: {e}")

        # Author
        author_tag = soup.find("article", itemprop="articleBody")
        author = author_tag.get_text(strip=True) if author_tag else "No Author"

        # Content
        article_body = soup.find("article", itemprop="articleBody")
        article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"

        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)[:100]
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{valid_title}\n\n{article_text}")

        print("Статья успешно сохранена!")

        return {
            "Title": valid_title,
            "Date Published": date_published,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
    except Exception as e:
        print(f"An unexpected error occurred for URL {article_url}: {e}")
        return {
            "Title": "Error",
            "Date Published": "Error",
            "Author": "Error",
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": f"Error: {e}"
        }


In [145]:
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://aqparat.info"
    search_url = base_url + "/search?text="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string 
    
    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    pages = 17
    
    print(f"Чило страниц:{pages}\n")

    
    urls_list = []  # Initialize the list
    

    for page in range(1, pages + 1):  # Итерируемся от 1 до 5 включительно
        full_url = key_word_url + "&page=" + str(page)
        print(full_url)
        print(f"Processing page:{page}\n")
        article_urls = extract_urls(full_url)
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()

Input key word:Донор
Ссылка по вашему запросу: 
https://aqparat.info/search?text=Донор
Чило страниц:17

https://aqparat.info/search?text=Донор&page=1
Processing page:1

https://aqparat.info/news/2024/12/25/10738612-zhogorku_kenesh_prinyal_proekt_vodnogo_k.html

https://aqparat.info/news/2024/12/24/10738403-gde_v_kazahstane_bolshe_vsego_soglasii_n.html

https://aqparat.info/news/2024/12/18/10736736-v_kazahstane_stalo_bolshe_lyudei_soglasn.html

https://aqparat.info/news/2024/12/17/10736251-tashiev_po_delu_zampreda_gns_dubanaeva_g.html

https://aqparat.info/news/2024/12/16/10736015-pri_investirovanii_sredstv_mezhdunarodny.html

https://aqparat.info/news/2024/12/13/10735445-kabmin_odobril_zaklyuchenie_k_soglasheni.html

https://aqparat.info/news/2024/12/11/10734758-banki_kazahstana_zaplatili_nalogi_na_sum.html

https://aqparat.info/news/2024/12/06/10733490-fao_planiruet_povysit_obemy_zakupok_prod.html

https://aqparat.info/news/2024/12/05/10733102-elimizde_4_1_myndai_naukas_ishki_agzasyn.

https://aqparat.info/news/2024/04/15/10649580-v_2022-2023_godah_batkenskaya_oblast_pol.html

https://aqparat.info/news/2024/04/12/10648886-donorskaya_akciya_v_shymkente_sobrala_14.html

https://aqparat.info/news/2024/04/09/10647223-v_kazakstan_halkyna_ozvuchili_skolko_im.html

https://aqparat.info/news/2024/04/04/10645283-bolee_630_mln_tenge_pereveli_kazahstancy.html

https://aqparat.info/news/2024/04/03/10644934-akcenty_byli_ugrozy_i_dazhe_dengi_predla.html

https://aqparat.info/news/2024/04/02/10644547-zhaparov_podpisal_zakon_ob_inoagentah.html

https://aqparat.info/news/2024/03/25/10642644-rossiiskie_banki_mogut_profinansirovat_s.html

https://aqparat.info/news/2024/03/19/10640699-voennye_rk_v_shymkente_dobrovolno_sdali.html

https://aqparat.info/news/2024/03/06/10637414-v_iyune_proidet_forum_v_zheneve_gde_prez.html

https://aqparat.info/news/2024/03/06/10637358-ministr_energetiki_11_marta_opredelim_ko.html

https://aqparat.info/news/2024/03/05/10637065-na_rekonstrukciyu_630_km_avto

https://aqparat.info/news/2023/07/22/10550493-voditelskie_prava_po_evrostandartu_poluc.html

https://aqparat.info/news/2023/07/22/10550479-po_proektu_kambar-atinskoi_ges-1_idet_bu.html

https://aqparat.info/news/2023/07/21/10550329-sorevnovaniya_sredi_detei_pobedivshih_ra.html

https://aqparat.info/news/2023/07/21/10550257-karagandinskii_oblastnoi_centr_krovi_sno.html

https://aqparat.info/news/2023/07/20/10549712-ministr_transporta_tekebaev_ozvuchil_pro.html

https://aqparat.info/news/2023/07/20/10549634-byudzhet_dostig_6_4_mlrd_takimi_tempami.html

https://aqparat.info/news/2023/07/19/10549363-serdce_53-letnei_zhenschiny_peresadili_2.html

https://aqparat.info/news/2023/07/18/10548892-pervuyu_peresadku_donorskogo_serdca_prov.html

https://aqparat.info/news/2023/07/15/10547850-v_spore_vokrug_jusan_bank_a_postavlena_t.html

https://aqparat.info/news/2023/07/12/10546741-mintrans_v_iyule_na_246_km_avtodorogi_bi.html

https://aqparat.info/news/2023/07/11/10546350-zavtra_operaciya_a_vracha

https://aqparat.info/news/2023/02/07/10472560-kapitan_leipciga_mozhet_propustit_blizha.html

https://aqparat.info/news/2023/02/07/10472324-onkologi_kazahstana_vozrozhdayut_totalno.html

https://aqparat.info/news/2023/02/03/10470470-podozritelnye_operacii_i_prestupnye_doho.html

https://aqparat.info/news/2023/02/01/10469510-posolstvo_ssha_vystupilo_za_tesnoe_sotru.html

https://aqparat.info/news/2023/01/31/10468734-afganskih_devushek_prinimayut_v_vuzy_cen.html

https://aqparat.info/news/2023/01/27/10466981-anonsy_nedeli_nepopulyarnoe_reshenie_pok.html

https://aqparat.info/news/2023/01/27/10466946-ekstremizm_i_terrorizm_v_kazahstane.html

https://aqparat.info/news/2023/01/26/10466618-politico_vozmozhnaya_postavka_ukraine_is.html

https://aqparat.info/news/2023/01/25/10465680-po_programme_moi_dom_2021-2026_za_1_5_go.html

https://aqparat.info/news/2023/01/25/10465636-na_meste_byvshei_ik_47_postroyat_mikrora.html

https://aqparat.info/news/2023/01/24/10465369-shahter_vmeste_s_klubnym_part

https://aqparat.info/news/2022/07/31/10390599-afganistan_blizok_taliban_dalek_kak_cent.html

https://aqparat.info/news/2022/07/30/10390502-oon_kyrgyzstan_vozderzhalsya_ot_uchastiy.html

https://aqparat.info/news/2022/07/29/10390195-kabmin_utverdil_polozhenie_o_rabote_s_by.html

https://aqparat.info/news/2022/07/29/10390021-stoimost_rekonstrukcii_avtodorogi_suusam.html

https://aqparat.info/news/2022/07/28/10389337-chetvertyi_chelovek_v_istorii_vylechilsy.html

https://aqparat.info/news/2022/07/27/10389060-dolg_v_energosisteme_prevyshaet_137_mlrd.html

https://aqparat.info/news/2022/07/27/10388949-gosdolg_budet_obsuzhdatsya_no_est_i_bole.html

https://aqparat.info/news/2022/07/27/10388912-transplantaciya_kostnogo_mozga_stala_bol.html

https://aqparat.info/news/2022/07/25/10388137-uchenye_sovershili_proryv_v_lechenii_dia.html

https://aqparat.info/news/2022/07/19/10385704-moralno-eticheskii_kodeks_dlya_shkol_i_v.html

https://aqparat.info/news/2022/07/12/10382693-predsedatel_fonda_kazaks

https://aqparat.info/news/2022/03/20/10326419-human_rights_watch_v_afganistane_ot_nedo.html

https://aqparat.info/news/2022/03/18/10325838-cherez_chto_prohodyat_karagandincy_nuzhd.html

https://aqparat.info/news/2022/03/17/10324859-v_oon_prizvali_vydelit_milliardy_na_pomo.html

https://aqparat.info/news/2022/03/11/10320917-pochemu_procentnaya_stavka_u_kombankov_p.html

https://aqparat.info/news/2022/03/01/10315129-karagandincev_poblagodarili_za_dobrye_de.html

https://aqparat.info/news/2022/03/01/10315107-pust_budet_bolshe_dobroty_akim_karagandi.html

https://aqparat.info/news/2022/02/27/10314211-v_kieve_sozdayut_gumanitarnyi_shtab_dlya.html

https://aqparat.info/news/2022/02/26/10313637-nbu_otkryl_specialnyi_schet_dlya_sbora_s.html

https://aqparat.info/news/2022/02/24/10312422-nbu_otkryl_specschet_dlya_sbora_sredstv.html

https://aqparat.info/news/2022/02/23/10311617-kyrgyzstan_v_2021_godu_poluchil_ot_donor.html

https://aqparat.info/news/2022/02/22/10310665-verhovnyi_sud_rk_prinyal_

Статья успешно сохранена!
https://aqparat.info/news/2024/12/24/10738403-gde_v_kazahstane_bolshe_vsego_soglasii_n.html
Статья успешно сохранена!
https://aqparat.info/news/2024/12/18/10736736-v_kazahstane_stalo_bolshe_lyudei_soglasn.html
Статья успешно сохранена!
https://aqparat.info/news/2024/12/17/10736251-tashiev_po_delu_zampreda_gns_dubanaeva_g.html
Статья успешно сохранена!
https://aqparat.info/news/2024/12/16/10736015-pri_investirovanii_sredstv_mezhdunarodny.html
Статья успешно сохранена!
https://aqparat.info/news/2024/12/13/10735445-kabmin_odobril_zaklyuchenie_k_soglasheni.html
Статья успешно сохранена!
https://aqparat.info/news/2024/12/11/10734758-banki_kazahstana_zaplatili_nalogi_na_sum.html
Статья успешно сохранена!
https://aqparat.info/news/2024/12/06/10733490-fao_planiruet_povysit_obemy_zakupok_prod.html
Статья успешно сохранена!
https://aqparat.info/news/2024/12/05/10733102-elimizde_4_1_myndai_naukas_ishki_agzasyn.html
Статья успешно сохранена!
https://aqparat.info/news/2024

Статья успешно сохранена!
https://aqparat.info/news/2024/05/18/10664068-shans_dlya_zhivyh_ili_pochemu_v_strane_v.html
Статья успешно сохранена!
https://aqparat.info/news/2024/05/12/10660978-film_ob_alie_moldagulovoi_snimut_v_kazah.html
Статья успешно сохранена!
https://aqparat.info/news/2024/05/11/10660322-prognoziruetsya_dalneishee_zamedlenie_in.html
Статья успешно сохранена!
https://aqparat.info/news/2024/05/10/10660139-serbiya_vydelila_1_mln_evro_postradavshi.html
Статья успешно сохранена!
https://aqparat.info/news/2024/05/10/10659947-obem_izyatii_iz_byudzheta_almaty_rezko_v.html
Статья успешно сохранена!
https://aqparat.info/news/2024/05/05/10658163-obyazatelstva_aziatskogo_fonda_razvitiya.html
Статья успешно сохранена!
https://aqparat.info/news/2024/05/04/10657920-zamglavy_minfina_e_kaldybaev_rasskazal_n.html
Статья успешно сохранена!
https://aqparat.info/news/2024/05/04/10657806-chistokrovnyi_ferrari_i_kitaiskaya_mazda.html
Статья успешно сохранена!
https://aqparat.info/news/2024

Статья успешно сохранена!
https://aqparat.info/news/2023/10/09/10582971-v_peresadke_organov_nuzhdayutsya_318_zhi.html
Статья успешно сохранена!
https://aqparat.info/news/2023/10/06/10582019-chto_izvestno_o_novom_predsedatele_gosba.html
Статья успешно сохранена!
https://aqparat.info/news/2023/10/05/10581634-v_posleduyuschie_10_let_budet_vydeleno_d.html
Статья успешно сохранена!
https://aqparat.info/news/2023/10/04/10581212-den_donora_organov_vpervye_proidet_v_kaz.html
Статья успешно сохранена!
https://aqparat.info/news/2023/10/04/10581123-ft_ukraina_mozhet_poluchit_186_mlrd_za_s.html
Статья успешно сохранена!
https://aqparat.info/news/2023/10/01/10579840-zvanie_pochetnyi_donor_planiruyut_vossta.html
Статья успешно сохранена!
https://aqparat.info/news/2023/09/29/10579156-deputat_predlozhil_minselhozu_sozdat_mar.html
Статья успешно сохранена!
https://aqparat.info/news/2023/09/28/10578610-glavnoe_na_segodnya.html
Статья успешно сохранена!
https://aqparat.info/news/2023/09/27/10578125-ssha_

An unexpected error occurred for URL https://aqparat.info/news/2023/06/14/10535110-pacienty_centralnoi_bolnicy_almaty_zaraz.html: 'NoneType' object has no attribute 'get_text'
https://aqparat.info/news/2023/06/14/10535094-krov_kakoi_gruppy_ostro_trebuetsya_seich.html
Статья успешно сохранена!
https://aqparat.info/news/2023/06/14/10535068-vich-infekciya_vyyavlena_u_pacientov_alm.html
Статья успешно сохранена!
https://aqparat.info/news/2023/06/14/10534999-dlya_obespecheniya_zhilem_vseh_nuzhdayus.html
Статья успешно сохранена!
https://aqparat.info/news/2023/06/14/10534908-kakoi_segodnya_prazdnik_14_iyunya.html
Статья успешно сохранена!
https://aqparat.info/news/2023/06/13/10534537-glavnoe_na_segodnya.html
Статья успешно сохранена!
https://aqparat.info/news/2023/06/07/10532204-advokat_bessonova_zayavila_o_vozmozhnost.html
Статья успешно сохранена!
https://aqparat.info/news/2023/06/02/10530418-popytki_kyrgyzstana_prodvinut_zakon_ob_i.html
An unexpected error occurred for URL https://aqparat

KeyboardInterrupt: 

In [None]:
###inform.kz

In [146]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup as BS


In [147]:

def extract_urls(search_url):
    try:
        # Path to ChromeDriver
        chrome_driver_path = "/usr/bin/chromedriver"

        # Initialize WebDriver
        service = Service(chrome_driver_path)
        driver = webdriver.Chrome(service=service)

        # Open the website
        driver.get(search_url)

        # Wait for the page to load
        driver.implicitly_wait(10)

        # Get the HTML content
        html_content = driver.page_source

        # Parse with BeautifulSoup
        soup = BS(html_content, "html.parser")

        # Find all elements with the class "searchCard"
        search_cards = soup.find_all("div", class_="searchCard")

        # Base URL
        base_url = "https://www.inform.kz"

        # Extract and construct full URLs
        extracted_links = []
        for card in search_cards:
            link_tag = card.find("a")  # Find <a> tags within the card
            if link_tag and "href" in link_tag.attrs:
                link = link_tag["href"]
                # Make the URL full
                if link.startswith("/"):
                    link = base_url + link
                extracted_links.append(link)

           
        return extracted_links

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Close the browser
        if 'driver' in locals():
            driver.quit()

In [148]:
def save_to_csv(data_list, csv_file, headers):
    # Filter out None or invalid entries
    valid_data = [entry for entry in data_list if isinstance(entry, dict)]

    # Write data to CSV
    with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(valid_data)
    print(f"CSV файл успешно сохранен в {csv_file}!")

In [149]:
def article_contents(article_url, storage_dir):
    
    try:
        # Initialize ChromeDriver
        chrome_driver_path = "/usr/bin/chromedriver"
        service = Service(chrome_driver_path)
        driver = webdriver.Chrome(service=service)

        # URL of the article
        driver.get(article_url)
        driver.implicitly_wait(10)

        # Get the rendered HTML content
        html_content = driver.page_source
        soup = BS(html_content, "html.parser")

        # Extract Title
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # Extract Author
        author_tag = soup.find('a', class_='article__author-name')
        author = author_tag.get_text(strip=True) if author_tag else "No Author"
        
        # Extract Date
        date_tag = soup.find('div', class_='article__time')
        if date_tag:
            raw_date = date_tag.get_text(strip=True)  # Example: "21:09, 19 Ноябрь 2024 | GMT +5"
    
            # Remove the "| GMT +5" part
            if "|" in raw_date:
                raw_date = raw_date.split("|")[0].strip()

            # Split into time and date
            time_part, date_part = raw_date.split(", ")
            russian_months = {
                "Январь": "January", "Февраль": "February", "Март": "March",
                "Апрель": "April", "Май": "May", "Июнь": "June",
                "Июль": "July", "Август": "August", "Сентябрь": "September",
                "Октябрь": "October", "Ноябрь": "November", "Декабрь": "December"
            }
    
            # Replace the Russian month with English equivalent
            for rus, eng in russian_months.items():
                if rus in date_part:
                    date_part = date_part.replace(rus, eng)
                    break

            # Combine back in correct format
            combined_str = f"{time_part}, {date_part}"  # Combine as "21:09, 19 November 2024"
            dt = datetime.strptime(combined_str, "%H:%M, %d %B %Y")  # Correct order
            formatted_date = dt.strftime("%Y-%m-%d %H:%M:%S")  # Convert to ISO format
        else:
            formatted_date = "No Date"

        # Extract Article Text
        article_body = soup.find("div", class_="article__body-text")
        article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        
        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        max_title_length = 100  # Set max length for title
        valid_title = valid_title[:max_title_length]  # Truncate title if it's too long
        
        
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{valid_title}\n\n")
            file.write(article_text)
        
        print("Статья успешно сохранена!")
        
        return {
            "Title": valid_title,
            "Date Published": formatted_date,
            "Author": author,
            "URL": article_url,
            "Source": article_url.split('/')[2],
            "Status": "Success"
        }
        
    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [150]:
def main():
    
    key_word_input = input("Input key word:")
    base_url = "https://www.inform.kz"
    search_url = base_url + "/search_results/?q="
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string 
    
    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    pages = 31
    
    print(f"Чило страниц:{pages}\n")

    
    urls_list = []  # Initialize the list
    

    for page in range(1, pages + 1):  # Итерируемся от 1 до 5 включительно
        full_url = key_word_url + "&page=" + str(page)
        print(full_url)
        print(f"Processing page:{page}\n")
        article_urls = extract_urls(full_url)
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue
        

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to text")

    articles_data = []  # To store article details for CSV

    for url in urls_list:
        print(url)
        article_detail = article_contents(url, storage_dir)
        articles_data.append(article_detail)

    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL", "Source", "Status"]
    
    # Save to CSV
    save_to_csv(articles_data, csv_file, headers)

if __name__ == "__main__":
    main()

Input key word:Донор
Ссылка по вашему запросу: 
https://www.inform.kz/search_results/?q=Донор
Чило страниц:31

https://www.inform.kz/search_results/?q=Донор&page=1
Processing page:1

https://www.inform.kz/ru/almatinskie-donori-sdali-okolo-13-tisyach-litrov-krovi-s-nachala-goda-21b99c

https://www.inform.kz/ru/v-uzbekistane-obladatelyam-znaka-pochetniy-donor-uvelichat-viplati-v-tri-raza-d3a2f5

https://www.inform.kz/ru/chernoe-donorstvo-v-kazahstane-kak-ne-prevratit-transplantatsiyu-organov-v-torgovlyu

https://www.inform.kz/ru/posmertniy-donor-spas-zhizni-chetireh-chelovek-v-vko-feca99

https://www.inform.kz/ru/kazahstan-i-tyurkskie-strani-sozdadut-edinuyu-informatsionnuyu-sistemu-donorov-6b561e

https://www.inform.kz/ru/den-donora-organov-4103-patsientov-nuzhdayutsya-v-peresadke-organov-v-strane

https://www.inform.kz/ru/posmertnoe-donorstvo-spaslo-zhizni-19-kazahstantsev-593c15

https://www.inform.kz/ru/germaniya-krupneyshiy-donor-es-edce81

https://www.inform.kz/ru/svishe-600-zhitel

https://www.inform.kz/ru/bolee-22-litrov-donorskoy-krovi-sdali-policeyskie-sko_a3724041

https://www.inform.kz/ru/forum-donorov-proydet-v-stolice_a3720475

https://www.inform.kz/ru/svyshe-60-policeyskih-stali-donorami-v-vko_a3720354

https://www.inform.kz/ru/v-rk-medaviaciey-vypolneno-bolee-1600-vyletov-za-pacientami-dvazhdy-za-donorskimi-organami_a3711914

https://www.inform.kz/ru/50-spasateley-stali-donorami-krovi-v-zko_a3706352

https://www.inform.kz/ru/donorami-stali-sotrudniki-koloniy-strogogo-rezhima-v-vko_a3696670

https://www.inform.kz/ru/svyshe-50-spasateley-stali-donorami-v-ust-kamenogorske_a3692649

https://www.inform.kz/ru/donorami-stali-svyshe-100-voennosluzhaschih-v-ust-kamenogorske_a3686256

https://www.inform.kz/ru/izvestnye-blogery-stali-donorami-immunnoy-plazmy-v-almaty_a3682226

https://www.inform.kz/ru/perebolevshiy-koronavirusom-iz-semeya-ya-gotov-stat-donorom-immunnoy-plazmy_a3671917

https://www.inform.kz/search_results/?q=Донор&page=10
Processing page:10

https:

https://www.inform.kz/ru/v-karagande-policeyskih-nazvali-samymi-aktivnymi-donorami-krovi-foto_a2920122

https://www.inform.kz/ru/kazahstanskie-voennosluzhaschie-sdali-krov-v-ramkah-akcii-den-donora_a2915434

https://www.inform.kz/ru/v-rf-dlya-soglasnyh-na-posmertnoe-donorstvo-motociklistov-mogut-otmenit-nalogi_a2914718

https://www.inform.kz/ru/29-letniy-zhitel-almaty-nurtugan-akmyrzaev-priznan-donorom-goda-foto_a2914581

https://www.inform.kz/ru/v-kazahstane-prohodit-nedelya-donorstva-bolashak-life_a2914336

https://www.inform.kz/ru/v-kyzylorde-policeyskie-provodyat-dobrovol-nuyu-donorskuyu-akciyu-foto_a2914037

https://www.inform.kz/ru/25-spasateley-sko-stali-donorami-krovi-foto_a2909859

https://www.inform.kz/ru/pervyy-kazahstanec-s-donorskim-serdcem-prezidentu-spasibo-vashim-reformam-v-medicine_a2897141

https://www.inform.kz/ru/zhizn-zh-uspanova-pervogo-pacienta-s-donorskim-serdcem-kruto-izmenilas-obzor-pressy-za-26-aprelya_a2896944

https://www.inform.kz/ru/amerikanskie-hirurgi-v

https://www.inform.kz/ru/v-kazahstane-nablyudaetsya-ostraya-potrebnost-v-donorah-gemopoeticheskih-stvolovyh-kletok-kostnogo-mozga_a2488566

https://www.inform.kz/ru/20-let-nazad-v-kazahstane-bylo-750-tys-donorov-segodnya-okolo-200-tysyach_a2485809

https://www.inform.kz/ru/proryv-vpervye-v-kazahstane-provedena-transplantaciya-donorskogo-serdca_a2485761

https://www.inform.kz/ru/nacmedholding-planiruet-provesti-transplantaciyu-serdca-ot-donora-do-konca-2012-goda_a2485675

https://www.inform.kz/ru/v-blagotvoritel-noy-akcii-tvoy-den-donor-prinyalo-uchastie-bolee-6-tys-kazahstancev_a2473119

https://www.inform.kz/ru/dlya-dostizheniya-urovnya-razvityh-stran-kazahstanu-trebuetsya-okolo-900-tysyach-donorov-krovi-minzdrav-rk_a2472190

https://www.inform.kz/ru/almatinskie-vrachi-sdayut-krov-v-ramkah-vsemirnogo-dnya-donora_a2471463

https://www.inform.kz/ru/stan-donorom-segodnya-podari-nadezhdu-na-zavtra-akciya-tamozhennikov-almaty_a2467088

https://www.inform.kz/ru/40-aktyubinskih-policeyskih-s

Статья успешно сохранена!
https://www.inform.kz/ru/v-uzbekistane-obladatelyam-znaka-pochetniy-donor-uvelichat-viplati-v-tri-raza-d3a2f5
Статья успешно сохранена!
https://www.inform.kz/ru/chernoe-donorstvo-v-kazahstane-kak-ne-prevratit-transplantatsiyu-organov-v-torgovlyu
Статья успешно сохранена!
https://www.inform.kz/ru/posmertniy-donor-spas-zhizni-chetireh-chelovek-v-vko-feca99
Статья успешно сохранена!
https://www.inform.kz/ru/kazahstan-i-tyurkskie-strani-sozdadut-edinuyu-informatsionnuyu-sistemu-donorov-6b561e
Статья успешно сохранена!
https://www.inform.kz/ru/den-donora-organov-4103-patsientov-nuzhdayutsya-v-peresadke-organov-v-strane
Статья успешно сохранена!
https://www.inform.kz/ru/posmertnoe-donorstvo-spaslo-zhizni-19-kazahstantsev-593c15
Статья успешно сохранена!
https://www.inform.kz/ru/germaniya-krupneyshiy-donor-es-edce81
Статья успешно сохранена!
https://www.inform.kz/ru/svishe-600-zhiteley-sko-stali-donorami-stvolovih-kletok-65f388
Статья успешно сохранена!
https://www.i

Статья успешно сохранена!
https://www.inform.kz/ru/sotrudniki-akmolinskoy-kolonii-stali-donorami-dlya-postradavshey-v-avarii-zhenschiny_a3819126
Статья успешно сохранена!
https://www.inform.kz/ru/zhitel-shymkenta-neozhidanno-dlya-sebya-okazalsya-posmertnym-donorom_a3817707
Статья успешно сохранена!
https://www.inform.kz/ru/marafon-dobryh-del-sotrudniki-mvd-rk-snova-stali-donorami-krovi_a3814620
Статья успешно сохранена!
https://www.inform.kz/ru/v-donorah-nuzhdaetsya-karagandinskiy-oblastnoy-centr-krovi_a3806639
Статья успешно сохранена!
https://www.inform.kz/ru/40-litrov-donorskoy-krovi-sdali-voennosluzhaschie-atyrauskogo-garnizona_a3801183
Статья успешно сохранена!
https://www.inform.kz/ru/vsemirnyy-den-donora-otmetili-v-akmolinskoy-oblasti_a3800928
Статья успешно сохранена!
https://www.inform.kz/ru/kto-oni-donory-rekordsmeny-almaty_a3800833
Статья успешно сохранена!
https://www.inform.kz/ru/privitye-ot-covid-19-otstranyayutsya-ot-donorstva-krovi-i-ee-komponentov-na-dve-nedeli_a380018

Статья успешно сохранена!
https://www.inform.kz/ru/skol-ko-poluchayut-zhenschiny-donory-po-iskusstvennomu-oplodotvoreniyu-v-kazahstane_a3125077
Статья успешно сохранена!
https://www.inform.kz/ru/poltonny-donorskoy-krovi-sdali-policeyskie-vostochnogo-kazahstana_a3105244
Статья успешно сохранена!
https://www.inform.kz/ru/30-policeyskih-sdali-donorskuyu-krov-v-sko_a3087330
Статья успешно сохранена!
https://www.inform.kz/ru/donor-krovi-v-kazahstane-smozhet-vyigrat-iphone-7_a3085310
Статья успешно сохранена!
https://www.inform.kz/ru/vospitanniki-voenno-tehnicheskoy-shkoly-stali-donorami_a3077441
Статья успешно сохранена!
https://www.inform.kz/ru/v-kazahstane-vpervye-peresadili-pechen-ot-nesovmestimogo-donora_a3073841
Статья успешно сохранена!
https://www.inform.kz/ru/v-kazahstane-vpervye-provedut-transplantaciyu-pecheni-ot-nesovmestimogo-donora_a3073586
Статья успешно сохранена!
https://www.inform.kz/ru/v-aktobe-vpervye-peresadili-pechen-ot-posmertnogo-donora_a3069454
Статья успешно сохране

Статья успешно сохранена!
https://www.inform.kz/ru/v-kazahstane-rastet-chislo-operaciy-po-peresadke-donorskih-organov_a2685315
Статья успешно сохранена!
https://www.inform.kz/ru/bolee-5-tysyach-severokazahstancev-stali-donorami-s-nachala-goda_a2677010
Статья успешно сохранена!
https://www.inform.kz/ru/1500-litrov-krovi-sobrano-vo-vremya-donorskoy-akcii-v-almaty_a2669011
Статья успешно сохранена!
https://www.inform.kz/ru/so-sleduyuschego-goda-astana-voydet-v-sostav-chetyreh-regionov-donorov-respublikanskogo-byudzheta_a2652321
Статья успешно сохранена!
https://www.inform.kz/ru/kazahstan-yavlyaetsya-odnim-iz-donorov-abr_a2650782
Статья успешно сохранена!
https://www.inform.kz/ru/v-astane-vrachi-proveli-tret-yu-uspeshnuyu-transplantaciyu-donorskogo-serdca_a2630301
Статья успешно сохранена!
https://www.inform.kz/ru/voz-otnosit-kazahstan-k-stranam-s-nizkoy-donorskoy-aktivnost-yu_a2621569
Статья успешно сохранена!
https://www.inform.kz/ru/vrachi-setuyut-na-otsutstvie-edinoy-bazy-dannyh-donoro

Статья успешно сохранена!
https://www.inform.kz/ru/v-akcii-donor-2011-prinyali-uchastie-aktivisty-zhambylskogo-filiala-knpk_a2420311
Статья успешно сохранена!
https://www.inform.kz/ru/v-kazahstane-bolee-3-tysyach-pacientov-nuzhdayutsya-v-peresadke-donorskoy-pochki-i-serdca-minzdrav-rk_a2420251
Статья успешно сохранена!
https://www.inform.kz/ru/almaty-glavnyy-nesyr-evoy-donor-strany_a2419431
Статья успешно сохранена!
https://www.inform.kz/ru/karagandinskie-policeyskie-provedut-donorskuyu-akciyu-priurochennuyu-ko-vsemirnomu-dnyu-pamyati-zhertv-dtp_a2419186
Статья успешно сохранена!
https://www.inform.kz/ru/v-centre-krovi-taldykorgana-prohodit-akciya-donor-2011_a2418984
Статья успешно сохранена!
https://www.inform.kz/ru/obschestvennyy-sovet-po-zaschite-prav-pacientov-pri-mz-rk-utverdil-plan-meropriyatiy-po-razvitiyu-donorstva-v-kazahstane_a2418788
Статья успешно сохранена!
https://www.inform.kz/ru/ao-narodnyy-bank-kazahstana-provel-dobrovol-nuyu-akciyu-po-sdache-donorskoy-krovi_a2416084
С

In [135]:

# Initialize ChromeDriver
chrome_driver_path = "/usr/bin/chromedriver"
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# URL of the article
article_url = "https://www.inform.kz/ru/almatinskie-donori-sdali-okolo-13-tisyach-litrov-krovi-s-nachala-goda-21b99c"
driver.get(article_url)
driver.implicitly_wait(10)

# Get the rendered HTML content
html_content = driver.page_source
soup = BS(html_content, "html.parser")

# Extract Title
title_tag = soup.find('h1')
title = title_tag.get_text(strip=True) if title_tag else "No Title"

# Extract Author
author_tag = soup.find('a', class_='article__author-name')
author = author_tag.get_text(strip=True) if author_tag else "No Author"

# Extract Date
date_tag = soup.find('div', class_='article__time')
if date_tag:
    raw_date = date_tag.get_text(strip=True)  # Example: "21:09, 19 Ноябрь 2024 | GMT +5"
    
    # Remove the "| GMT +5" part
    if "|" in raw_date:
        raw_date = raw_date.split("|")[0].strip()

    # Split into time and date
    time_part, date_part = raw_date.split(", ")
    russian_months = {
        "Январь": "January", "Февраль": "February", "Март": "March",
        "Апрель": "April", "Май": "May", "Июнь": "June",
        "Июль": "July", "Август": "August", "Сентябрь": "September",
        "Октябрь": "October", "Ноябрь": "November", "Декабрь": "December"
    }
    
    # Replace the Russian month with English equivalent
    for rus, eng in russian_months.items():
        if rus in date_part:
            date_part = date_part.replace(rus, eng)
            break

    # Combine back in correct format
    combined_str = f"{time_part}, {date_part}"  # Combine as "21:09, 19 November 2024"
    dt = datetime.strptime(combined_str, "%H:%M, %d %B %Y")  # Correct order
    formatted_date = dt.strftime("%Y-%m-%d %H:%M:%S")  # Convert to ISO format
else:
    formatted_date = "No Date"

# Extract Article Text
article_body = soup.find("div", class_="article__body-text")
article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"

# Print extracted data
print("Title:", title)
print("Author:", author)
print("Date:", formatted_date)
print("Article Text:", article_text)

# Quit the browser
driver.quit()


Title: Алматинские доноры сдали около 13 тысяч литров крови с начала года
Author: Еламан Турысбеков
Date: 2024-11-19 21:09:00
Article Text: Директор городского центра крови Алматы Жандос Надиров отметил, что в мировой практике существует единый стандарт развития донорства крови и ее компонентов. Благодаря неравнодушным жителям мегаполиса в центре крови было заготовлено около 13 тысяч литров крови. Кроме того, сотрудники центра посетили более 50 организаций, где также были приняты донорские сдачи.
— Донорами могут стать граждане старше 18 лет, прошедшие специальное медицинское обследование и не имеющие противопоказаний для сдачи крови. После сдачи крови донор получает денежный эквивалент, который компенсирует затраты на объем крови и энергетические расходы организма, — сообщил директор городского центра крови Алматы Жандос Надиров.
Следует отметить, что Городской центр крови Алматы обеспечивает поставки крови в 27 государственных больниц мегаполиса, а также в другие клиники и военно-мед