In [1]:
# Importing all necessary libraries

In [2]:
import requests
from bs4 import BeautifulSoup as BS
import re as re
import os
import math
from datetime import datetime
import csv

In [3]:
# Function for extracting the URL for articles in Egemen.kz

In [4]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url)
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('div', class_='clearfix news-t flexBlock')

    # Extract URLs from <a> tags inside those divs
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]

    return full_urls

In [5]:
# Created the function specifically for the parsing Egemen.kz web-site content and saved as a text file
# Only works for one URL 

In [6]:
def article_contents(article_url, storage_dir):
    
    try:
        # Загрузка HTML страницы
        response = requests.get(article_url)
        
        # Check for 404 errors or other HTTP status codes
        if response.status_code != 200:
            raise requests.HTTPError(f"HTTP Error: {response.status_code} for {article_url}")
        
        html_content = response.text
        
        # Парсинг HTML
        soup = BS(html_content, "html.parser")
        
        # Title extract
        try:
            title = soup.find('h1')
            title = title.get_text(strip=True)
        except Exception: 
            title = "Title not found"
        
        # Author extract
        try:
            author = soup.find('div', class_='name-auth').text.strip()
        except Exception: 
            author = "Author not found"
        
        # Date extract
        try:
            date_tag = soup.find('meta', itemprop="datePublished")
            if date_tag and date_tag.has_attr('content'):
                date_published = date_tag['content']
                date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")
            else:
                date_published = "Date not found"
        except Exception: 
            date_published = "Date not found"
        
        # Content extract
        try:
            article_body = soup.find("div", itemprop="articleBody")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "Content not found"
        except Exception: 
            article_text = "Content not found"
        
        # Сохранение текста в файл
        valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
        filename = os.path.join(storage_dir, f"{valid_title}.txt")
        
        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{title}\n\n")
            file.write(article_text)
        
        print("Статья успешно сохранена!")
        
        return {
            "Title": title,
            "Date Published": date_published,
            "Author": author,
            "URL": article_url,
            "Status": "Success"
        }
    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [7]:
def extract_article_details(article_url):
    """Extract title, date published, author, and URL."""
    response = requests.get(article_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    # Extract title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"
    
    # Extract date published
    date_tag = soup.find('meta', itemprop="datePublished")
    date_published = date_tag['content'] if date_tag and date_tag.has_attr('content') else "No Date"
    date_published_dt = None
    if date_published != "No Date":
        date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")
        date_published = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
    
    # Extract author
    author_tag = soup.find('div', class_='name-auth')
    author = author_tag.get_text(strip=True) if author_tag else "No Author"
    
    return {
        "Title": title,
        "Date Published": date_published,
        "Author": author,
        "URL": article_url
    }


In [8]:
def main():
    
    # Key word input
    key_word_input = input("Введите ключевое слово: \n")
    
    # Creating url with key-word
    base_url_search = "https://egemen.kz"                         #base url can be changed by other URL
    search_url = base_url_search + "/search?q=" 
    words = key_word_input.split()
    string = "+".join(words)
    key_word_url = search_url + string

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url) 
    html_content = response.text
    
    # Parsing HTML
    soup = BS(html_content, "html.parser")
    
    # Finding articles number, included in web-site
    article_founded = soup.find('small').text

    print(article_founded)
    
    # Conversion to int from list
    num_article = re.findall(r'\d+', article_founded)
    num = int(num_article[0]) 
    
    # Each web-page in site only contains 5 articels, it can be also changed 
    articles_per_page = 5
    pages = math.ceil(num / articles_per_page) 

    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(1, pages + 1):
        full_url = key_word_url + "&page=" + str(page)
        print(f"Processing page: {page} {full_url}\n")
    
        # Call the function and get the extracted URLs
        article_urls = extract_urls(base_url_search, full_url)
    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to the text")
    for url in urls_list:
        print(url)
        article_contents(url,storage_dir)
        
    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL"]
    

        
    # Writing to CSV
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
    
        for url in urls_list:
            print(f"Processing article: {url}")
            article_details = extract_article_details(url)
            writer.writerow(article_details)
    print("CSV файл успешно сохранен !")

In [10]:
if __name__ == "__main__":
    main()

Введите ключевое слово: 
Рынок
Ссылка по вашему запросу: 
https://egemen.kz/search?q=Рынок
7 материал табылды
Чило страниц:2

Processing page: 1 https://egemen.kz/search?q=Рынок&page=1

https://egemen.kz/article/200045-gharyshtyq-eginshilik-qazaqstandyq-startap-egistic-rynokqa-shyqty

https://egemen.kz/article/95567-irandyq-vektor-dganha-rynoktar-dganha-mumkindikter

https://egemen.kz/article/29341-ishki-rynokta-turaqtylyq-kerek

https://egemen.kz/article/15983-qazaqstan-titany-alemdik-rynokqa-qadam-basty

https://egemen.kz/article/14439-alemdik-rynokqa-shyghudynh-basty-dgoly-–-basekege-qabilettilik

Processing page: 2 https://egemen.kz/search?q=Рынок&page=2

https://egemen.kz/article/10317-basekelestik-–-rynokty-bayytudynh-orkenietti-dgoly

https://egemen.kz/article/9765-maqsat-–-ishki-rynokty-qamtamasyz-etip-eksport-aleuetin-arttyru


All Extracted URLs:

https://egemen.kz/article/200045-gharyshtyq-eginshilik-qazaqstandyq-startap-egistic-rynokqa-shyqty
https://egemen.kz/article/95567