In [None]:
import csv
import requests
import re
import time
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta

# Date that is written to the output rows.
def get_raw_date():
    """Return today's date in Jakarta (UTC+7) formatted as ``YYYY-MM-DD``.

    The listing filter in this notebook decides what counts as "today" using
    the Jakarta clock, so the date stamp uses the same time zone instead of
    the machine-local one; otherwise a host in another zone can stamp
    articles with the previous or next calendar day.

    Returns:
        str: the current Jakarta date, e.g. ``"2024-05-17"``.
    """
    jakarta_tz = timezone(timedelta(hours=7))
    return datetime.now(jakarta_tz).strftime("%Y-%m-%d")

# Build a dictionary of titles and links for articles posted minutes or hours ago today.
def find_links(url):
    """Collect ``title -> href`` pairs for listing entries published earlier today.

    The page at ``url`` is downloaded and every ``<span>`` inside a
    ``div.col-md-8`` whose text is a relative timestamp is treated as the
    marker of a fresh article. Accepted timestamps are "N menit lalu"
    (N minutes ago) and "N jam lalu" (N hours ago) with N between 1 and the
    current Jakarta hour. For each marker the closest ancestor that contains
    an ``<a href=...>`` provides the link.

    Args:
        url: Address of the listing page to scan.

    Returns:
        dict: maps the link's ``title`` attribute (``''`` when the attribute
        is absent) to its ``href``. Entries with the same title overwrite
        each other.

    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
            or an HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the crawl; an HTTP error
    # is surfaced instead of parsing an error page as if it were a listing.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Current hour in the Jakarta time zone (UTC+7).
    current_hour = datetime.now(timezone(timedelta(hours=7))).hour
    # One alternative per hour elapsed since midnight, so the crawl reaches
    # back to the start of today.
    hour_patterns = [f"\\b{i} jam lalu\\b" for i in range(1, current_hour + 1)]
    minute_pattern = "\\b\\d+ menit lalu\\b"
    # Join everything through one list: between 00:00 and 00:59 there are no
    # hour alternatives, and appending "|" + "" would leave an empty
    # alternative that matches every span and disables the filter.
    time_regex = re.compile("|".join([minute_pattern, *hour_patterns]), re.IGNORECASE)

    span_elements = []
    for div in soup.find_all('div', class_='col-md-8'):
        span_elements.extend(div.find_all('span', string=time_regex))

    links_dict = {}
    for span in span_elements:
        # Walk up the tree until some ancestor contains a hyperlink.
        ancestor = span.find_parent()
        while ancestor:
            link = ancestor.find('a', href=True)
            if link:
                links_dict[link.get('title', '')] = link['href']
                break
            ancestor = ancestor.find_parent()
    return links_dict

# Gather the article data for every link that was found.
def collect_articles(links_dict):
    """Download and assemble one row per article in ``links_dict``.

    Args:
        links_dict: mapping of article title to article URL.

    Returns:
        list: one ``[title, source, link, image_url, date, content]`` list per
        article that could be fetched. Articles whose request fails with a
        ``requests`` exception are reported on stdout and skipped.
    """
    crawl_date = get_raw_date()
    rows = []
    for title in links_dict:
        link = links_dict[title]
        try:
            # Pre-flight request: only used to detect HTTP/connection errors.
            # NOTE(review): find_content(link) downloads the same page again,
            # so every article is fetched twice.
            requests.get(link, timeout=10).raise_for_status()
            body_text, image_url = find_content(link)
            cleaned_text = extract_content(body_text)
            rows.append([title, "Antara", link, image_url, crawl_date, cleaned_text])
        except requests.exceptions.RequestException as e:
            print(f"Error fetching content for {link}: {e}")
        # Pause between articles, whether or not the fetch succeeded.
        time.sleep(2)
    return rows

# Article page parsing: two small helpers, then find_content, which looks up the content of a news link.
def _image_dimension(img, name):
    """Return the pixel size of an ``<img>`` tag along ``name`` ('width' or 'height').

    The HTML attribute is tried first. When it is missing, zero, negative or
    not a plain integer (for example ``"100%"``), an inline ``style``
    declaration such as ``width: 640px`` is used instead. Returns 0 when
    neither source yields a size.
    """
    try:
        value = int(img.get(name, 0))
    except (TypeError, ValueError):
        value = 0
    if value > 0:
        return value
    styled = re.search(rf'{name}:\s*(\d+)px', img.get('style') or '')
    return int(styled.group(1)) if styled else 0


def _is_body_paragraph(p):
    """Return True for a ``<p>`` that has no class attribute and no 'baca-juga' span."""
    if p.has_attr('class'):
        return False
    return not p.find('span', class_='baca-juga')


def find_content(url):
    """Fetch an article page and return its body text and lead image URL.

    Image selection: the ``<img>`` with the largest declared area
    (width x height from attributes or inline style) wins. If no image
    declares a usable size, the ``img.img-fluid`` inside
    ``div.wrap__article-detail-image.mt-4`` is used when present.

    Text selection: class-less paragraphs that do not contain a 'baca-juga'
    span are counted per direct parent; the parent with the most such
    paragraphs is taken as the article body and its qualifying paragraphs are
    joined with single spaces.

    Args:
        url: Address of the article page.

    Returns:
        tuple: ``(content_text, image_url)``; either element is ``""`` when
        nothing suitable was found.

    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
            or an HTTP error status.
    """
    # Bound the wait and reject error responses, so a stalled or failing
    # server surfaces as a requests exception rather than a hang or a parsed
    # error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Pick the image with the largest declared area rather than relying on a class name.
    image_url = ""
    max_area = 0
    for img in soup.find_all('img'):
        src = img.get('src')
        # Ignore images without a usable source and inline data URIs.
        if not src or src.startswith('data:'):
            continue

        # The previous inline expression called int() on the regex match
        # object itself and raised TypeError whenever the attribute was absent.
        area = _image_dimension(img, 'width') * _image_dimension(img, 'height')

        if area > max_area:
            max_area = area
            image_url = src

    # No image declared a size: fall back to the known article-image container.
    if not image_url:
        image_div = soup.find('div', class_='wrap__article-detail-image mt-4')
        if image_div:
            img_tag = image_div.find('img', class_='img-fluid')
            if img_tag and img_tag.has_attr('src'):
                image_url = img_tag['src']

    # Count qualifying paragraphs per parent; the parent holding the most is
    # treated as the main content container.
    parent_p_count = {}
    for p in soup.find_all('p'):
        if not _is_body_paragraph(p):
            continue
        parent = p.find_parent()
        if parent:
            parent_p_count[parent] = parent_p_count.get(parent, 0) + 1

    content_text = ""
    if parent_p_count:
        max_parent = max(parent_p_count, key=parent_p_count.get)
        paragraphs = [
            p.get_text(" ", strip=True)
            for p in max_parent.find_all('p')
            if _is_body_paragraph(p)
        ]
        content_text = " ".join(paragraphs)

    return content_text, image_url

def extract_content(text, start_phrases=("jakarta (antara) - ",)):
    """Lower-case ``text`` and drop everything up to and including a dateline.

    The text is lower-cased first. If one of ``start_phrases`` occurs in it,
    the result is whatever follows the earliest occurrence, stripped of
    surrounding whitespace. If none occurs, the lower-cased text is returned
    unchanged (not stripped).

    Args:
        text: Raw article text.
        start_phrases: Dateline marker(s) to look for; a single string or an
            iterable of strings. Matching is case-insensitive because both
            the text and the phrases are lower-cased. Defaults to the Jakarta
            dateline only.

    Returns:
        str: the lower-cased article body without the leading dateline.
    """
    text = text.lower()
    if isinstance(start_phrases, str):
        start_phrases = (start_phrases,)
    # Drop empty phrases: an empty alternative would match at position 0 and
    # turn the "no dateline" case into a plain strip of the whole text.
    phrases = [phrase.lower() for phrase in start_phrases if phrase]
    if not phrases:
        return text
    start_pattern = "|".join(map(re.escape, phrases))
    match = re.search(f"({start_pattern})(.*)", text, re.DOTALL)
    return match.group(2).strip() if match else text

def crawl_antara(max_pages=None):
    """Crawl the Antara "terkini" listing and return today's articles.

    Listing pages are requested in order starting at page 1. Paging stops at
    the first page that yields no links, at the first request error, or after
    ``max_pages`` pages when a cap is given. Every collected link is then
    fetched and converted to a dict.

    Args:
        max_pages: Optional upper bound on the number of listing pages to
            scan. ``None`` (the default) keeps the original behaviour of
            paging until a page without matching articles is reached, which
            depends entirely on what the remote site returns.

    Returns:
        list: dicts with the keys ``title``, ``source``, ``url``, ``image``,
        ``date`` and ``content``, one per successfully fetched article.
    """
    all_links = {}
    page = 1

    while max_pages is None or page <= max_pages:
        url = f'https://www.antaranews.com/terkini/{page}'
        try:
            links_dict = find_links(url)
        except requests.exceptions.RequestException as e:
            print(f"Error accessing page {page}: {e}")
            break

        if not links_dict:
            print(f"No articles found on page {page}, stopping crawler")
            break

        all_links.update(links_dict)
        print(f"Found {len(links_dict)} articles on page {page}")
        page += 1

    # Each row from collect_articles is positional; pair it with field names.
    fields = ('title', 'source', 'url', 'image', 'date', 'content')
    return [dict(zip(fields, row)) for row in collect_articles(all_links)]

In [None]:
# Run the full crawl (listing pages, then every article) and print the
# resulting list of article dicts. Progress lines from crawl_antara are
# printed before the result.
print(crawl_antara())

Found 15 articles on page 1
Found 15 articles on page 2
Found 15 articles on page 2
Found 15 articles on page 3
Found 15 articles on page 3
Found 15 articles on page 4
Found 15 articles on page 4
Found 15 articles on page 5
Found 15 articles on page 5
Found 13 articles on page 6
Found 13 articles on page 6
Found 13 articles on page 7
Found 13 articles on page 7
Found 12 articles on page 8
Found 12 articles on page 8
Found 12 articles on page 9
Found 12 articles on page 9
Found 13 articles on page 10
Found 13 articles on page 10
Found 14 articles on page 11
Found 14 articles on page 11
Found 14 articles on page 12
Found 14 articles on page 12
Found 15 articles on page 13
Found 15 articles on page 13
Found 13 articles on page 14
Found 13 articles on page 14
Found 14 articles on page 15
Found 14 articles on page 15
Found 15 articles on page 16
Found 15 articles on page 16
Found 14 articles on page 17
Found 14 articles on page 17
Found 14 articles on page 18
Found 14 articles on page 18
Fo

In [None]:
from datetime import datetime, timezone, timedelta

# Scratch cell: rebuild the relative-time regex on its own for inspection.
# Current hour in the Jakarta time zone (UTC+7).
current_hour = datetime.now(timezone(timedelta(hours=7))).hour
# One alternative per hour elapsed since midnight.
hour_patterns = [f"\\b{i} jam lalu\\b" for i in range(1, current_hour + 1)]
minute_pattern = "\\b\\d+ menit lalu\\b"
# Join through a single list: at hour 0 there are no hour alternatives, and
# appending "|" + "" would leave an empty alternative that matches any text.
time_pattern = "|".join([minute_pattern, *hour_patterns])

In [24]:
# Display the regex alternation string for inspection.
# NOTE(review): the recorded output below ('menit|1 jam|...') does not match
# the pattern built by the preceding cell, so it is presumably left over from
# an earlier revision -- re-run to refresh.
time_pattern

'menit|1 jam|2 jam|3 jam|4 jam|5 jam|6 jam|7 jam|8 jam|9 jam|10 jam|11 jam|12 jam'

In [44]:
# Sample listing page (page 11) used by the span-matching check in the next cell.
url = 'https://www.antaranews.com/terkini/11'

In [45]:
# Scratch cell: run the time filter against the sample page in `url` and keep
# the matched spans for inspection.
# Timeout and status check keep a stalled or failing request from hanging the
# cell or being parsed as a listing.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

divs = soup.find_all('div', class_='col-md-8')

span_elements = []

# Current hour in the Jakarta time zone (UTC+7).
current_hour = datetime.now(timezone(timedelta(hours=7))).hour
# One alternative per hour elapsed since midnight, so the crawl reaches back to the start of today.
hour_patterns = [f"\\b{i} jam lalu\\b" for i in range(1, current_hour + 1)]
minute_pattern = "\\b\\d+ menit lalu\\b"
# Join through a single list: at hour 0 there are no hour alternatives, and
# appending "|" + "" would leave an empty alternative that matches every span.
time_pattern = "|".join([minute_pattern, *hour_patterns])
print(time_pattern)
# Compile once instead of on every loop iteration.
time_regex = re.compile(time_pattern, re.IGNORECASE)
for div in divs:
    spans = div.find_all('span', string=time_regex)
    span_elements.extend(spans)

\b\d+ menit lalu\b|\b1 jam lalu\b|\b2 jam lalu\b|\b3 jam lalu\b|\b4 jam lalu\b|\b5 jam lalu\b|\b6 jam lalu\b|\b7 jam lalu\b|\b8 jam lalu\b|\b9 jam lalu\b|\b10 jam lalu\b|\b11 jam lalu\b|\b12 jam lalu\b|\b13 jam lalu\b


In [46]:
# Show the <span> elements that matched the time filter on the sample page.
span_elements

[<span class="text-secondary">13 jam lalu</span>,
 <span class="text-secondary">13 jam lalu</span>,
 <span class="text-secondary">13 jam lalu</span>,
 <span class="text-secondary">13 jam lalu</span>,
 <span class="text-secondary">13 jam lalu</span>,
 <span class="text-secondary">13 jam lalu</span>,
 <span class="text-secondary">13 jam lalu</span>]