In [50]:
import requests
from bs4 import BeautifulSoup
from persiantools.jdatetime import JalaliDateTime
import datetime
from datetime import timedelta
import json
import re 
from typing import List, Dict, Optional,Any
import time
from requests.exceptions import RequestException
import logging
import pprint

import os
os.environ['PYTHONIOENCODING'] = 'utf-8'

In [33]:
persian_months = {
    'فروردین': 1,
    'اردیبهشت': 2,
    'خرداد': 3,
    'تیر': 4,
    'مرداد': 5,
    'شهریور': 6,
    'مهر': 7,
    'آبان': 8,
    'آذر': 9,
    'دی': 10,
    'بهمن': 11,
    'اسفند': 12
}

In [34]:
# Define the URL of the news page
url = "https://mihanblockchain.com/category/news/"

# Define headers to mimic a real user (this helps to avoid bot detection)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# Send a GET request to the URL with the headers
response = requests.get(url, headers=headers)



In [35]:
# Initialize a list to store the result
result = []

# Get the current time using datetime.now(datetime.UTC)
# current_time = datetime.datetime.now(datetime.UTC)
current_time =  datetime.datetime.now(datetime.timezone.utc)

# Check if the request was successful
if response.status_code == 200:
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Select the articles using the provided CSS selectors
    articles = soup.select('div.jnews_category_content_wrapper > div.jeg_postblock_4.jeg_postblock > div.jeg_posts.jeg_block_container > div.jeg_posts > article.jeg_post')

    # Extract the link and date for each article
    for article in articles:
        # Extract the link
        link = article.select_one('div.jeg_postblock_content > div.jeg_post_meta > div.jeg_meta_date > a')['href']
        
        # Extract the date
        date_str = article.select_one('div.jeg_postblock_content > div.jeg_post_meta > div.jeg_meta_date').get_text(strip=True)
        
        # Convert Persian date string to Gregorian datetime
        try:
            # Split the date and time parts
            persian_date, persian_time = date_str.split(" - ")
            
            # Split the Persian date into day, month, and year
            day, month_name, year = persian_date.split()
            
            # Map the Persian month name to the month number
            month = persian_months[month_name]
            
            # Convert the Persian date to a format that strptime can parse
            converted_date = f"{year}-{month:02d}-{int(day):02d}"
            
            # Parse the converted date into JalaliDateTime object
            persian_datetime = JalaliDateTime.strptime(converted_date, '%Y-%m-%d')

            # Add the time information
            hour, minute = map(int, persian_time.split(":"))
            persian_datetime = persian_datetime.replace(hour=hour, minute=minute)

            # Convert to Gregorian date
            gregorian_datetime = persian_datetime.to_gregorian()

            # Convert to UTC (remove Persian Standard Time offset)
            utc_datetime = gregorian_datetime.replace(tzinfo=None)

            # Get the date difference between the article's date and the current time
            time_diff = current_time - utc_datetime.replace(tzinfo=datetime.timezone.utc)

            # Only add the article if it was published within the last 1 day
            if time_diff <= timedelta(days=8):
                # Format the datetime into a string
                formatted_date = utc_datetime.strftime('%Y-%m-%d %H:%M:%S')

                # Append the link and formatted date to the result list
                result.append({
                    "link": link,
                    "date": formatted_date
                })

        except Exception as e:
            print(f"Error converting date: {e}")
else:
    print("Failed to retrieve the webpage")

In [36]:
json_result = json.dumps(result, ensure_ascii=False, indent=4)

In [37]:
type(json_result)

str

In [38]:
# Initialize a list to store the detailed results
detailed_result = []

if isinstance(json_result, str):
    json_result = json.loads(json_result)

# Loop through each article in the previous result
for article in json_result:
    url = article['link']
    date = article['date']  # Date from the last result
    
    # Send a GET request to the URL with headers
    response = requests.get(url, headers=headers)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the data content, keeping the HTML tags
        data = soup.select_one('div.jeg_inner_content')
        data_html = data.decode_contents() if data else "N/A"  # Raw HTML content

        # Extract the creator
        creator = soup.select_one('div.jeg_meta_container > div.jeg_post_meta.jeg_post_meta_1 > div.meta_left > div.jeg_meta_author > a')
        creator_name = creator.get_text(strip=True) if creator else "N/A"  # Use "N/A" if no creator found

        # Extract the title
        title = soup.select_one('div.jeg_inner_content > div.entry-header > h1.jeg_post_title')
        title_text = title.get_text(strip=True) if title else "N/A"  # Use "N/A" if no title found

        # Add the extracted fields to the result
        detailed_result.append({
            "link": url,
            "date": date,  # Date from the last JSON
            "data": data_html,  # Keep HTML tags
            "creator": creator_name,
            "title": title_text
        })
    else:
        print(f"Failed to retrieve the article at {url}")

# Convert the detailed result list to a JSON string
json_detailed_result = json.dumps(detailed_result, ensure_ascii=False, indent=4)

# Print the detailed JSON result
print(json_detailed_result)

[
    {
        "link": "https://mihanblockchain.com/onchain-ui-would-prevent-bybit-hack-dfinity/",
        "date": "2025-03-01 15:00:00",
        "data": "\n<div class=\"jeg_breadcrumbs jeg_breadcrumb_container\">\n<div id=\"breadcrumbs\"><span class=\"\">\n<a href=\"https://mihanblockchain.com\">میهن بلاکچین</a>\n</span><i class=\"fa fa-angle-left\"></i><span class=\"\">\n<a href=\"https://mihanblockchain.com/category/news/\">اخبار</a>\n</span><i class=\"fa fa-angle-left\"></i><span class=\"breadcrumb_last_link\">\n<a href=\"https://mihanblockchain.com/category/news/public/\">اخبار عمومی</a>\n</span></div> </div>\n<div class=\"entry-header\">\n<h1 class=\"jeg_post_title\">نظر بنیان‌گذار Dfinity پس از هک بای بیت: برنامه‌ها باید کاملا آنچین باشند</h1>\n<div class=\"jeg_meta_container\"><div class=\"jeg_post_meta jeg_post_meta_1\">\n<div class=\"meta_left\">\n<div class=\"jeg_meta_author coauthor\">\n<span class=\"meta_text\"> نگارش:‌</span><a href=\"https://mihanblockchain.com/author/r

In [39]:
def upload_image_to_api(image_url: str) -> str:
    """
    Uploads an image to an API and returns the new URL.
    Replace this with your actual API call.
    """
    try:
        # Download the image from the original URL
        response = requests.get(image_url, stream=True)
        response.raise_for_status()

        # Simulate API upload (replace with real API endpoint)
        # For this example, assume the API accepts a file and returns {'url': 'new_url'}
        api_endpoint = "https://capitalino.io/api/admin/upload-image/ai-save-image"  # Replace with your API URL
        files = {'file': (image_url.split('/')[-1], response.content)}
        api_response = requests.post(api_endpoint, files=files)
        api_response.raise_for_status()

        # Extract new URL from API response
        new_url = api_response.json().get('url')
        return new_url
    except requests.RequestException as e:
        print(f"Error uploading image {image_url}: {e}")
        return image_url  # Return original URL if upload fails

In [44]:
def process_json_items(items: List[Dict]) -> List[Dict]:
    html_content = items[0].get('data', '')
    if not html_content:
        return [{'json': {'error': 'No HTML content found in items[0].data'}}]

    excluded_containers = [
        "jeg_share_bottom_container",
        "jeg_ad_jeg_article_jnews_content_bottom_ads",
        "jnews_prev_next_container",
        "jnews_author_box_container",
        "jnews_related_post_container",
        "jeg_postblock_22 jeg_postblock jeg_module_hook jeg_pagination_disable jeg_col_2o3 jnews_module_307974_0_67c8441ee8156",
        "jnews_popup_post_container",
        "jnews_comment_container"
    ]

    def is_in_excluded_container(html: str, element_position: int) -> bool:
        sample_text = html[element_position:element_position + 100]
        for container in excluded_containers:
            pattern = rf'<div[^>]*class="[^"]*{container}[^"]*"[^>]*>[\\s\\S]*?{re.escape(sample_text)}'
            if re.search(pattern, html, re.IGNORECASE):
                return True
        return False
    
    def extract_thumbnail_image(html: str) -> str:
        """
        Extracts the thumbnail image URL from the HTML content using the specified CSS selector.
        div.jeg_inner_content > div.jeg_featured > a > div.thumbnail-container > img (src)
        """
        try:
            # First try to find data-lazy-src attribute which contains the actual image URL
            thumbnail_pattern = r'<div[^>]*class="[^"]*jeg_featured[^"]*"[^>]*>.*?<a[^>]*>.*?<div[^>]*class="[^"]*thumbnail-container[^"]*"[^>]*>.*?<img[^>]*data-lazy-src="([^"]+)"'
            thumbnail_match = re.search(thumbnail_pattern, html, re.DOTALL)
            
            if thumbnail_match:
                return thumbnail_match.group(1)
            
            # If data-lazy-src is not found, try data-src
            thumbnail_pattern = r'<div[^>]*class="[^"]*jeg_featured[^"]*"[^>]*>.*?<a[^>]*>.*?<div[^>]*class="[^"]*thumbnail-container[^"]*"[^>]*>.*?<img[^>]*data-src="([^"]+)"'
            thumbnail_match = re.search(thumbnail_pattern, html, re.DOTALL)
            
            if thumbnail_match:
                return thumbnail_match.group(1)
            
            # If neither data-lazy-src nor data-src is found, try srcset
            thumbnail_pattern = r'<div[^>]*class="[^"]*jeg_featured[^"]*"[^>]*>.*?<a[^>]*>.*?<div[^>]*class="[^"]*thumbnail-container[^"]*"[^>]*>.*?<img[^>]*data-lazy-srcset="([^"]+)"'
            thumbnail_match = re.search(thumbnail_pattern, html, re.DOTALL)
            
            if thumbnail_match:
                # srcset contains multiple URLs, take the first one
                srcset = thumbnail_match.group(1)
                first_url = srcset.split(',')[0].strip().split(' ')[0]
                return first_url
            
            # As a last resort, try the regular src attribute
            thumbnail_pattern = r'<div[^>]*class="[^"]*jeg_featured[^"]*"[^>]*>.*?<a[^>]*>.*?<div[^>]*class="[^"]*thumbnail-container[^"]*"[^>]*>.*?<img[^>]*src="([^"]+)"'
            thumbnail_match = re.search(thumbnail_pattern, html, re.DOTALL)
            
            if thumbnail_match:
                src = thumbnail_match.group(1)
                # Skip data URLs
                if src.startswith('data:'):
                    return None
                return src
                
            return None
        except Exception as e:
            print(f"Error extracting thumbnail image: {e}")
            return None

    def extract_and_replace_images(html: str) -> tuple[str, List[Dict]]:
        processed_html = html
        images_url = []
        processed_urls = set()
        image_counter = 0

        # Pattern for div.content-inner > div.wp-block-image > figure > a > img
        figure_pattern = r'<div[^>]*class="[^"]*wp-block-image[^"]*"[^>]*>\s*<figure[^>]*>\s*<a[^>]*>\s*<img[^>]*?(?:data-lazy-src="([^"]+)"|data-src="([^"]+)"|src="([^"]+)").*?</a>(?:\s*<figcaption[^>]*>([\s\S]*?)</figcaption>)?[\s\S]*?</figure>'
        
        for figure_match in re.finditer(figure_pattern, html, re.IGNORECASE):
            full_match = figure_match.group(0)
            match_position = figure_match.start()
            
            # Try to get the image URL from data-lazy-src, data-src, or src in that order
            image_url = figure_match.group(1) or figure_match.group(2) or figure_match.group(3)
            
            # Skip data URLs
            if image_url and image_url.startswith('data:'):
                # Try to find data-lazy-srcset in the match
                srcset_match = re.search(r'data-lazy-srcset="([^"]+)"', full_match)
                if srcset_match:
                    # Extract the first URL from srcset
                    srcset = srcset_match.group(1)
                    image_url = srcset.split(',')[0].strip().split(' ')[0]
                else:
                    continue  # Skip this image if we can't find a valid URL

            if not image_url or is_in_excluded_container(html, match_position) or image_url in processed_urls:
                continue

            # Caption is now in group 4 due to the additional capturing groups
            caption = figure_match.group(4)
            if caption:
                caption = re.sub(r'<[^>]+>', '', caption).strip()

            image_id = f'image{image_counter}'
            placeholder = f'**image_{image_id}**'

            images_url.append({
                'id': image_id,
                'url': image_url,
                'caption': caption,
                'type': 'figure'
            })

            processed_html = processed_html.replace(full_match, placeholder)
            processed_urls.add(image_url)
            image_counter += 1

        return processed_html, images_url

    def extract_content(html: str) -> List[str]:
        # First, find the main content div
        content_div_pattern = r'<div[^>]*class="[^"]*content-inner[^"]*"[^>]*>([\s\S]*?)<div[^>]*class="[^"]*jeg_share_bottom_container'
        content_div_match = re.search(content_div_pattern, html, re.IGNORECASE)
        
        if not content_div_match:
            return []
        
        content_html = content_div_match.group(1)
        
        # Replace image placeholders to ensure they're preserved
        content_html = re.sub(r'<div[^>]*>\s*(\*\*image_image\d+\*\*)\s*</div>', r'\1', content_html, flags=re.IGNORECASE)
        
        # Extract paragraphs, blockquotes, and image placeholders
        content_pattern = r'(<p[^>]*>[\s\S]*?</p>|<blockquote[^>]*>[\s\S]*?</blockquote>|<h[1-6][^>]*>[\s\S]*?</h[1-6]>|\*\*image_image\d+\*\*)'
        content_array = []

        for match in re.finditer(content_pattern, content_html, re.IGNORECASE):
            content = match.group(1).strip()
            
            # If it's an image placeholder, add it directly
            if content.startswith('**image_'):
                content_array.append(content)
            else:
                # For text content, strip HTML tags but preserve the text
                text_content = re.sub(r'<[^>]+>', '', content).strip()
                if text_content:
                    content_array.append(text_content)

        return content_array

    def extract_tags(html: str) -> List[str]:
        """
        Extracts tags from the HTML content using the specified CSS selector:
        div.inner-content > div.jeg_post_tags > a
        """
        try:
            # First try the specified path
            tag_section_pattern = r'<div[^>]*class="[^"]*inner-content[^"]*"[^>]*>.*?<div[^>]*class="[^"]*jeg_post_tags[^"]*"[^>]*>([\s\S]*?)</div>'
            tag_section_match = re.search(tag_section_pattern, html, re.DOTALL)
        
            # If not found, try an alternative pattern
            if not tag_section_match:
                tag_section_pattern = r'<div[^>]*class="[^"]*jeg_post_tags[^"]*"[^>]*>([\s\S]*?)</div>'
                tag_section_match = re.search(tag_section_pattern, html, re.DOTALL)
        
            if not tag_section_match:
                return []

            tags = []
            tag_pattern = r'<a[^>]*>([\s\S]*?)</a>'
            for tag_match in re.finditer(tag_pattern, tag_section_match.group(1), re.IGNORECASE):
                tag_text = tag_match.group(1).strip()
                # Remove any HTML tags inside the tag text
                tag_text = re.sub(r'<[^>]+>', '', tag_text).strip()
                if tag_text and not tag_text.lower() == 'تگ:':  # Skip the "تگ:" label if present
                    tags.append(tag_text)
        
            return tags
        except Exception as e:
            print(f"Error extracting tags: {e}")
            return []

    # Extract thumbnail image first
    thumbnail_image = extract_thumbnail_image(html_content)
    
    # Extract and process images
    processed_html, images_url = extract_and_replace_images(html_content)
    
    # Extract text content and metadata without unicode_escape processing
    title = items[0].get('title', '')
    raw_date = items[0].get('date')
    creator = items[0].get('creator', '')
    content = extract_content(processed_html)
    tags = extract_tags(html_content)

    # Upload thumbnail image if found
    if thumbnail_image:
        new_thumbnail_url = upload_image_to_api(thumbnail_image)
        thumbnail_image = new_thumbnail_url
    else:
        # Use the first image as thumbnail if no dedicated thumbnail is found
        thumbnail_image = images_url[0]['url'] if images_url else None

    # Upload content images and replace URLs
    for image in images_url:
        original_url = image['url']
        new_url = upload_image_to_api(original_url)
        image['url'] = new_url  # Replace original URL with new URL

    return [{
        'json': {
            'title': title,
            'date': raw_date,
            'creator': creator,
            'thumbnailImage': thumbnail_image,
            'content': content,
            'ImagesUrl': images_url,
            'tags': tags
        }
    }]

In [45]:
# Parse the JSON string into a Python object
items = json.loads(json_detailed_result)

# Process the items
result = process_json_items(items)


Error uploading image https://mihanblockchain.com/wp-content/uploads/2023/11/bybit.jpg: 400 Client Error: Bad Request for url: https://capitalino.io/api/admin/upload-image/ai-save-image
Error uploading image https://mihanblockchain.com/wp-content/uploads/2025/03/image-3.png: 400 Client Error: Bad Request for url: https://capitalino.io/api/admin/upload-image/ai-save-image
Error uploading image https://mihanblockchain.com/wp-content/uploads/2025/03/image-4-1024x569.png: 502 Server Error: Bad Gateway for url: https://capitalino.io/api/admin/upload-image/ai-save-image


In [46]:
# Convert the result to JSON format
output_json = json.dumps(result, ensure_ascii=False, indent=4)

# Output the JSON
print(output_json)

[
    {
        "json": {
            "title": "نظر بنیان‌گذار Dfinity پس از هک بای بیت: برنامه‌ها باید کاملا آنچین باشند",
            "date": "2025-03-01 15:00:00",
            "creator": "رضا حضرتی",
            "thumbnailImage": "https://mihanblockchain.com/wp-content/uploads/2023/11/bybit.jpg",
            "content": [
                "دومینیک ویلیامز (Dominic Williams)، بنیان‌گذار و دانشمند ارشد بنیاد دفینیتی (Dfinity Foundation)، یک سازمان غیرانتفاعی که توسعه و نگهداری پروتکل اینترنت کامپیوتر (Internet Computer Protocol – ICP) را بر عهده دارد، اخیراً در گفت‌وگویی اظهار داشت که برنامه‌ها باید به‌طور کاملا آنچین باشند تا از وقوع حملاتی مشابه هک اخیر بای‌بیت (Bybit) که باعث به خطر افتادن رابط کاربری شد، جلوگیری شود.\nبه گزارش میهن بلاکچین، ویلیامز توضیح داد که بیشتر برنامه‌های غیرمتمرکز و پروژه‌های بلاکچینی کنونی دارای اقتصاد توکنی آنچین هستند اما برای زیرساخت‌های خود به پلتفرم‌های متمرکز مانند آمازون وب سرویس (Amazon Web Services – AWS) متکی هستند.\n\n\n\nلزوم انتقال برنامه‌ها به 

In [51]:
class APIClient:
    def __init__(self, base_url: str, api_key: str = None, max_retries: int = 3):
        """
        Initialize the API client
        
        Args:
            base_url (str): The base URL of the API
            api_key (str, optional): API key if required
            max_retries (int): Maximum number of retry attempts for failed requests
        """
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.max_retries = max_retries
        self.session = requests.Session()
        
        # Setup headers
        self.headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }
        if api_key:
            self.headers['Authorization'] = f'Bearer {api_key}'
            
        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def post_news_data(self, data: Dict[str, Any]) -> Dict:
        """
        Post news data to the API
        
        Args:
            data (Dict[str, Any]): The news data to send
            
        Returns:
            Dict: The API response
            
        Raises:
            RequestException: If the request fails after all retries
        """
        endpoint = '/api/news'  # Adjust this endpoint based on your API
        url = f"{self.base_url}{endpoint}"
        
        for attempt in range(self.max_retries):
            try:
                self.logger.info(f"Sending data to {url} (Attempt {attempt + 1}/{self.max_retries})")
                response = self.session.post(
                    url=url,
                    json=data,
                    headers=self.headers
                )
                
                response.raise_for_status()
                self.logger.info("Successfully sent data to API")
                return response.json()
                
            except RequestException as e:
                self.logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(2 ** attempt)  # Exponential backoff
                
        raise RequestException("Failed to send data after all retry attempts")


In [None]:
# Usage example:
try:
    # Initialize the API client
    api_client = APIClient(
        base_url="YOUR_API_BASE_URL",  # Replace with your API base URL
        api_key="YOUR_API_KEY"         # Replace with your API key if required
    )
    
    # Send each item in output_json to the API
    for item in output_json:
        try:
            response = api_client.post_news_data(item)
            print(f"Successfully sent item with title: {item.get('title', 'Unknown')}")
            print(f"API Response: {response}")
            
        except RequestException as e:
            print(f"Failed to send item: {str(e)}")
            continue
            
except Exception as e:
    print(f"An error occurred: {str(e)}")