In [None]:
# Importing all necessary libraries

In [75]:
import requests
from bs4 import BeautifulSoup as BS
import re as re
import os
import math
from datetime import datetime
import csv

In [76]:
# Function for extracting the URL for articles in Egemen.kz

In [77]:
def extract_urls(base_url,search_url):
    response = requests.get(search_url)
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    divs = soup.find_all('div', class_='clearfix news-t flexBlock')

    # Extract URLs from <a> tags inside those divs
    urls = [div.a['href'] for div in divs if div.a]
    full_urls = [base_url.rstrip('/') + div.a['href'] for div in divs if div.a]

    return full_urls

In [78]:
# Created the function specifically for the parsing Egemen.kz web-site content and saved as a text file
# Only works for one URL 

In [79]:
def article_contents(article_url,storage_dir):
    
    # Загрузка HTML страницы
    response = requests.get(article_url)
    html_content = response.text
    
    # Парсинг HTML
    soup = BS(html_content, "html.parser")
    
    # Title extract
    title = soup.find('h1')
    title = title.get_text(strip=True)
    
    # Author extract

    author = soup.find('div', class_='name-auth').text
    print(f"Author:{author}")
    
    # Data extract 
    
    date_tag = soup.find('meta', itemprop="datePublished")

    # Extract the 'content' attribute
    if date_tag and date_tag.has_attr('content'):
        date_published = date_tag['content']
    else:
        print("Date Published not found!")

    date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")
    print(date_published_dt)
    
    # Content extract
    article_body = soup.find("div", itemprop="articleBody")

    # Extract and clean all text from the div
    if article_body:
        article_text = article_body.get_text(separator="\n", strip=True)
    else:
        print("Article body not found!")
    
    # Сохранение текста в файл

    valid_title = re.sub(r'[\\/:"*?<>|]+', '', title)  # Remove invalid characters
    filename = os.path.join(storage_dir, f"{valid_title}.txt")


    with open(filename, "w", encoding="utf-8") as file:
        file.write(f"{title}\n\n")
        file.write(article_text)

    print("Статья успешно сохранена!")
    
    return {
        "Title": title,
        "Date Published": date_published,
        "Author": author,
        "URL": article_url
    }



In [80]:
def main():
    
    # Key word input
    key_word_input = input("Введите ключевое слово: \n")
    
    # Creating url with key-word
    base_url_search = "https://egemen.kz"                         #base url can be changed by other URL
    key_word_url = base_url_search + "/search?q=" + key_word_input 

    print(f"Ссылка по вашему запросу: \n{key_word_url}")
    
    # Creating directory with the name of the input 
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)

    response = requests.get(key_word_url) 
    html_content = response.text
    
    # Parsing HTML
    soup = BS(html_content, "html.parser")
    
    # Finding articles number, included in web-site
    article_founded = soup.find('small').text

    print(article_founded)
    
    # Conversion to int from list
    num_article = re.findall(r'\d+', article_founded)
    num = int(num_article[0]) 
    
    # Each web-page in site only contains 5 articels, it can be also changed 
    articles_per_page = 5
    pages = math.ceil(num / articles_per_page) 

    print(f"Чило страниц:{pages}\n")
    
    urls_list = []  # Initialize the list

    for page in range(1, pages + 1):
        full_url = key_word_url + "&page=" + str(page)
        print(f"Processing page: {page} {full_url}\n")
    
        # Call the function and get the extracted URLs
        article_urls = extract_urls(base_url_search, full_url)
    
        for article_url in article_urls: 
            print(f"{article_url}\n")    
    
        # Append URLs to the list
        urls_list.extend(article_urls) 
    
        # Skip if no URLs are found
        if not article_urls:  
            print(f"Skipping page {page} due to no articles found.\n")
        continue

    # The final list of URLs
    print("\nAll Extracted URLs:\n")

    for url in urls_list:
        print(url)
    
    print(f"\nCount:{len(urls_list)}\n")
    print("Starting saving all the extracted articles to the text")
    for url in urls_list:
        print(url)
        article_contents(url,storage_dir)
        
    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL"]
    

        
    # Writing to CSV
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
    
        for url in urls_list:
            print(f"Processing article: {url}")
            article_details = extract_article_details(url)
            writer.writerow(article_details)
    print("CSV файл успешно сохранен !")

In [82]:
if __name__ == "__main__":
    main()

Введите ключевое слово: 
президент путин
Ссылка по вашему запросу: 
https://egemen.kz/search?q=президент путин
0 материал табылды
Чило страниц:0


All Extracted URLs:


Count:0

Starting saving all the extracted articles to the text
CSV файл успешно сохранен !


In [None]:
for dirname,_,filenames in os.walk("/home/alikhan/Desktop/Data/Parsing"):
    for filename in filenames:
        print(os.path.join(dirname,filename))

In [23]:
date_tag = soup.find('meta', itemprop="datePublished")

# Extract the 'content' attribute
if date_tag and date_tag.has_attr('content'):
    date_published = date_tag['content']
    print("Date Published:", date_published)
else:
    print("Date Published not found!")

date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")

# Extracting author to save into the csv

author = soup.find('div', class_='name-auth').text
print(author)

Date Published: 2024-12-18 16:09:00

Дана МЫРЗАҚАДІР




In [24]:
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import csv
import os
import re as regex

def extract_article_details(article_url):
    """Extract title, date published, author, and URL."""
    response = requests.get(article_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    # Extract title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"
    
    # Extract date published
    date_tag = soup.find('meta', itemprop="datePublished")
    date_published = date_tag['content'] if date_tag and date_tag.has_attr('content') else "No Date"
    date_published_dt = None
    if date_published != "No Date":
        date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")
    
    # Extract author
    author_tag = soup.find('div', class_='name-auth')
    author = author_tag.get_text(strip=True) if author_tag else "No Author"
    
    return {
        "title": title,
        "date_published": date_published_dt.strftime("%Y-%m-%d %H:%M:%S") if date_published_dt else "No Date",
        "author": author,
        "url": article_url
    }

def main():
    # Key word input
    key_word_input = input("Enter a keyword: \n").strip()
    
    # Define storage directory
    storage_dir = os.path.join(os.getcwd(), key_word_input)
    os.makedirs(storage_dir, exist_ok=True)
    
    # CSV file creation
    csv_file = os.path.join(storage_dir, f"{key_word_input}.csv")
    
    # CSV headers
    headers = ["Title", "Date Published", "Author", "URL"]
    
    # Writing to CSV
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        
        # Base search URL
        base_url_search = "https://egemen.kz"
        key_word_url = base_url_search + "/search?q=" + key_word_input
        
        # Fetch search results
        response = requests.get(key_word_url)
        soup = BS(response.text, "html.parser")
        
        # Extract number of articles
        article_founded_tag = soup.find('small')
        article_founded = article_founded_tag.text if article_founded_tag else "0 articles found"
        num_article = regex.findall(r'\d+', article_founded)
        num = int(num_article[0]) if num_article else 0
        
        # Pages logic
        articles_per_page = 5
        pages = (num // articles_per_page) + 1
        
        # Extract articles
        for page in range(1, pages + 1):
            full_url = key_word_url + "&page=" + str(page)
            print(f"Processing page {page}: {full_url}")
            response = requests.get(full_url)
            soup = BS(response.text, "html.parser")
            
            # Extract URLs
            divs = soup.find_all('div', class_='clearfix news-t flexBlock')
            article_urls = [base_url_search.rstrip('/') + div.a['href'] for div in divs if div.a]
            
            for url in article_urls:
                print(f"Processing article: {url}")
                article_details = extract_article_details(url)
                writer.writerow(article_details)
    
    print(f"All articles saved to: {csv_file}")

if __name__ == "__main__":
    main()


Enter a keyword: 
Донор
Processing page 1: https://egemen.kz/search?q=Донор&page=1
Processing article: https://egemen.kz/article/377917-mayittik-donorlyqty-damytugha-ne-kedergi


ValueError: dict contains fields not in fieldnames: 'date_published', 'url', 'title', 'author'

In [55]:
def extract_article_details(article_url):
    """Extract title, date published, author, and URL."""
    response = requests.get(article_url)
    html_content = response.text
    soup = BS(html_content, "html.parser")
    
    # Extract title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"
    
    # Extract date published
    date_tag = soup.find('meta', itemprop="datePublished")
    date_published = date_tag['content'] if date_tag and date_tag.has_attr('content') else "No Date"
    date_published_dt = None
    if date_published != "No Date":
        date_published_dt = datetime.strptime(date_published, "%Y-%m-%d %H:%M:%S")
        date_published = date_published_dt.strftime("%Y-%m-%d %H:%M:%S")
    
    # Extract author
    author_tag = soup.find('div', class_='name-auth')
    author = author_tag.get_text(strip=True) if author_tag else "No Author"
    
    return {
        "Title": title,
        "Date Published": date_published,
        "Author": author,
        "URL": article_url
    }


In [62]:
# CSV file creation
csv_file = os.path.join(storage_dir, f"Анализ.csv")
    
# CSV headers
headers = ["Title", "Date Published", "Author", "URL"]
    


# Writing to CSV
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()
    
    for url in urls_list:
        print(f"Processing article: {url}")
        article_details = extract_article_details(url)
        writer.writerow(article_details)  # Pass the entire dictionary


NameError: name 'storage_dir' is not defined