In [None]:
pip install selenium
!apt-get remove chromium-browser
!apt-get install chromium-browser
!apt-get remove chromium-chromedriver
!apt-get install chromium-chromedriver

import time

import pandas as pd
from google.colab import drive
from selenium import webdriver
from selenium.common.exceptions import (
    StaleElementReferenceException,
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def web_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is started with verbose logging, no sandbox, no GPU, a
    1920x1200 window, and ``/dev/shm`` usage disabled.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    chrome_flags = (
        "--verbose",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        # Chrome expects "width,height" with no whitespace; a space after the
        # comma may keep the height from being parsed.
        "--window-size=1920,1200",
        "--disable-dev-shm-usage",
    )
    for flag in chrome_flags:
        options.add_argument(flag)
    return webdriver.Chrome(options=options)

# Function to fetch links from all pages
def get_all_article_links(base_url, total_pages=3229):
    """Collect the "Pročitaj više" article links from every result page.

    Args:
        base_url: Search URL that already carries a query string; each page
            is requested as ``{base_url}&page={n}``.
        total_pages: Number of result pages to visit, starting at page 1.

    Returns:
        A list of the ``href`` values of all matching links, in the order
        the pages were visited. Links without an ``href`` are skipped.
    """
    all_links = []  # List for all article links

    # Create WebDriver
    driver = web_driver()
    try:
        # The implicit wait is a session-wide setting for element lookups
        # (not a one-off page-load pause), so it only needs to be set once.
        driver.implicitly_wait(10)

        # Iterate through all pages
        for page in range(1, total_pages + 1):
            print(f"Accessing page {page} of {total_pages}...")

            # Form URL for each page
            page_url = f"{base_url}&page={page}"
            print(f"Current URL: {page_url}")  # Log the current URL

            # Open URL
            driver.get(page_url)

            # Fetch all links with the text "Pročitaj više" (Read more)
            elements = driver.find_elements(By.LINK_TEXT, "Pročitaj više")
            print(f"Number of 'Pročitaj više' links found on page {page}: {len(elements)}")

            # Keep only anchors that actually carry an href; a None entry
            # would later be handed to driver.get() and fail there.
            hrefs = (element.get_attribute('href') for element in elements)
            all_links.extend(href for href in hrefs if href)

            # Pause before proceeding to the next page
            time.sleep(2)
    finally:
        # Always shut the browser down, even if a page load raised.
        driver.quit()

    return all_links

# Search-results URL (query, archive and category filters); the page number
# is appended by get_all_article_links.
base_url = (
    "https://www.infobiro.ba/search"
    "?q=dnevni%20avaz&archive_id=2&ctc_id=77"
)

# Walk the result pages and gather every article URL.
article_links = get_all_article_links(base_url)

def _wait_for_text(driver, xpath, timeout=5):
    """Return the text of the element at *xpath* once it is present.

    If the element goes stale between being located and being read, the
    lookup is repeated once with a fresh wait.

    Raises:
        WebDriverException: e.g. a timeout when the element never appears,
            or a second stale reference on the retry.
    """
    locator = (By.XPATH, xpath)
    try:
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        ).text
    except StaleElementReferenceException as e:
        print(f"Stale element error: {e}")
        # Retry once: locate the element again and read it.
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        ).text


def _fetch_field(driver, xpath, waiting_for, found_label, preview_chars=None):
    """Fetch one text field, logging progress; return "" when it fails."""
    print(f"Waiting for {waiting_for}...")
    try:
        value = _wait_for_text(driver, xpath)
    except WebDriverException as e:
        print(f"Error fetching {waiting_for}: {e}")
        return ""
    if preview_chars is None:
        print(f"{found_label} found: {value}")
    else:
        # Long values are logged as a short preview only.
        print(f"{found_label} found: {value[:preview_chars]}...")
    return value


# Function to fetch metadata for a single article
def get_article_metadata(driver, article_url):
    """Open *article_url* and scrape its metadata fields.

    Args:
        driver: WebDriver used to load the page.
        article_url: URL of the article page.

    Returns:
        A dict with the keys ``newspaper``, ``date``, ``section``,
        ``headline``, ``title``, ``subtitle``, ``page``, ``authors`` and
        ``text``. A field that cannot be read is left as an empty string;
        ``section``, ``subtitle`` and ``page`` are not scraped and are
        always empty.
    """
    print(f"Fetching metadata from article: {article_url}")
    driver.get(article_url)
    time.sleep(0.5)

    metadata = {
        "newspaper": "",
        "date": "",
        "section": "",
        "headline": "",
        "title": "",
        "subtitle": "",
        "page": "",
        "authors": "",
        "text": ""
    }

    info = "//*[@id='article-info-wrapper']"

    metadata["newspaper"] = _fetch_field(
        driver, f"{info}/h4/a", "newspaper name", "Newspaper")
    metadata["date"] = _fetch_field(driver, f"{info}/p", "date", "Date")
    metadata["title"] = _fetch_field(driver, f"{info}/h2", "title", "Title")

    # Headline (can be empty): a missing element is expected here, so the
    # failure is not logged and the field simply stays "".
    print("Waiting for headline...")
    try:
        metadata["headline"] = _wait_for_text(driver, f"{info}/h5").strip()
    except WebDriverException:
        metadata["headline"] = ""

    metadata["authors"] = _fetch_field(
        driver, f"{info}/div[1]/div/span/a", "authors", "Authors")
    # Show only the first 100 characters of the text for brevity
    metadata["text"] = _fetch_field(
        driver, "//*[@id='article-text']", "text", "Text", preview_chars=100)

    return metadata

# Function to fetch metadata for multiple articles
def get_multiple_articles_metadata(article_links):
    """Scrape metadata for every URL in *article_links* with one browser.

    Args:
        article_links: Iterable of article URLs.

    Returns:
        A list with one metadata dict per link, in input order.
    """
    driver = web_driver()
    articles_metadata = []

    try:
        for link in article_links:
            print(f"Fetching metadata for article: {link}")
            articles_metadata.append(get_article_metadata(driver, link))
    finally:
        # Close WebDriver even if an article fetch raised, so the browser
        # process is not leaked.
        driver.quit()

    return articles_metadata

# Scrape every collected article; without this call `articles_metadata`
# would be undefined at module level (it is only a local name inside
# get_multiple_articles_metadata).
articles_metadata = get_multiple_articles_metadata(article_links)

# One row per article, one column per metadata field.
df = pd.DataFrame(articles_metadata)
df.head(50000)
