In [None]:
# Install the Selenium Python package (notebook shell command, not plain Python)
pip install selenium

# Remove and reinstall the Chromium browser package, then do the same for the
# chromium-chromedriver package, using the notebook's `!` shell escape
!apt-get remove chromium-browser
!apt-get install chromium-browser
!apt-get remove chromium-chromedriver
!apt-get install chromium-chromedriver

from google.colab import drive
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import calendar
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException

def web_driver():
    """Create and configure a headless Chrome WebDriver.

    Every call starts a new browser session; the caller is responsible for
    calling ``quit()`` on the returned driver when finished with it.

    Returns:
        webdriver.Chrome: a Chrome session started with the flags below.
    """
    chrome_flags = (
        "--verbose",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        # Chrome documents this value as "width,height"; the original had a
        # space after the comma, which is not part of that format.
        "--window-size=1920,1200",
        "--disable-dev-shm-usage",
    )

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

# Create WebDriver
# NOTE(review): this module-level driver is not quit anywhere in the code
# visible here; the scraping functions below open their own sessions.
driver = web_driver()

# Static starting URL (also used by generate_year_dates as the prefix of
# every per-day URL)
static_url = "https://infobiro.ba/publikacije/140/kalendar"
driver.get(static_url)

# Check if the page has loaded
print("Loaded page:", driver.title)

# Define the XPath for the cell with the month to be clicked
year_xpath = '//*[@id="oo"]/tbody/tr[20]/td[1]'  # XPath for the year (20th row)
cell_xpath = '//*[@id="oo"]/tbody/tr[20]/td[11]/div'  # XPath for the October cell (11th column)

# Any failure below (element not found, click error, ...) is only printed;
# the script continues with the next statements either way.
try:
    # Find the cell with the year (verification step)
    year_cell = driver.find_element(By.XPATH, year_xpath)
    print(f"Year found: {year_cell.text}")

    # Find the cell for October (or other month/column if needed)
    target_cell = driver.find_element(By.XPATH, cell_xpath)

    # Click on the cell using JavaScript (runs the element's click() in the
    # page instead of using WebElement.click())
    driver.execute_script("arguments[0].click();", target_cell)
    print("Successfully clicked on the cell with the month value.")

except Exception as e:
    print(f"Error: {e}")


def generate_year_dates(year, root_url=None):
    """Build one calendar URL for every day of ``year``.

    Each URL has the form ``<root_url>/<YYYY>-<MM>-<DD>`` and is printed as
    it is generated.

    Args:
        year: Calendar year as an integer (leap years yield 366 URLs).
        root_url: URL prefix for the generated links. When omitted, the
            module-level ``static_url`` is used.

    Returns:
        list[str]: The URLs in chronological order, 1 January first.
    """
    if root_url is None:
        root_url = static_url

    base_urls = []

    # All twelve months; the previous range(1, 3) stopped after February
    # even though the function is documented to cover the whole year.
    for month in range(1, 13):
        # monthrange returns (weekday of the 1st, number of days in month)
        _, num_days = calendar.monthrange(year, month)

        for day in range(1, num_days + 1):
            base_url = f"{root_url}/{year}-{month:02}-{day:02}"
            base_urls.append(base_url)
            print(f"Generated base_url: {base_url}")

    return base_urls

# Input year
year_input = 2024

# Build the per-day calendar URLs for the chosen year
# (generate_year_dates also prints each URL as it is generated)
base_urls = generate_year_dates(year_input)

# NOTE(review): this module-level set is not read by any code visible here;
# get_all_article_links builds and returns its own local `all_links`.
all_links = set()

def get_all_article_links(base_url):
    """Collect the "Pročitaj više" article links from every page of one day.

    The page count is discovered by requesting an out-of-range page
    (``?page=9999``) and reading the ``page=`` value from the URL the browser
    ends up on (presumably the site redirects to the last valid page --
    verify against the site). If the URL still says 9999, or carries no
    ``page=`` value at all, the listing is treated as a single page.

    A dedicated browser session is opened for the call and is always closed,
    including on the early "no links" return and when an error is raised.

    Args:
        base_url: Calendar URL of a single day, without a query string.

    Returns:
        list[str]: ``href`` values in page order (duplicates are kept; links
        without an ``href`` are skipped). An empty list when the probe page
        shows no "Pročitaj više" links.
    """
    probe_page = 9999
    link_text = "Pročitaj više"

    driver = web_driver()
    try:
        # The implicit wait is a session-wide element-lookup timeout, so it
        # is set once instead of after every navigation.
        driver.implicitly_wait(3)

        test_url = f"{base_url}?page={probe_page}"
        print(test_url)
        driver.get(test_url)

        current_url = driver.current_url

        # Fetch all links with the text "Read More"
        page_fault_check = driver.find_elements(By.LINK_TEXT, link_text)
        print(f"Number of 'Read More' links found during initial testing: {len(page_fault_check)}")

        if not page_fault_check:
            return []

        page_number_match = re.search(r'page=(\d+)', current_url)
        if page_number_match:
            total_pages = int(page_number_match.group(1))
            print(f"Page number from URL: {total_pages}")
        else:
            # Previously total_pages stayed unbound here and the next line
            # raised UnboundLocalError; fall back to a single page.
            print("Page number not found in the URL.")
            total_pages = 1

        # The URL was not rewritten, so the probe number is not a real count.
        if total_pages == probe_page:
            total_pages = 1

        all_links = []

        for page in range(1, total_pages + 1):
            print(f"Accessing page {page} of {total_pages}...")

            page_url = f"{base_url}?page={page}"
            print(f"Current URL: {page_url}")
            driver.get(page_url)

            elements = driver.find_elements(By.LINK_TEXT, link_text)
            print(f"Number of 'Read More' links found on page {page}: {len(elements)}")

            # Keep only anchors that actually carry an href
            hrefs = (element.get_attribute('href') for element in elements)
            all_links.extend(href for href in hrefs if href)

            # Pause before moving to the next page
            time.sleep(1)

        return all_links
    finally:
        driver.quit()

# Gather the article links of every generated day URL into one set, so a
# link returned more than once is stored only once.
article_links = set()
for base_url in base_urls:
    article_links.update(get_all_article_links(base_url))


# Fields read from an article page, in lookup order:
# (metadata key, XPath, wait in seconds, quiet).
# A "quiet" field is one that may legitimately be absent: its value is
# stripped, and neither the found value nor a lookup failure is printed.
_ARTICLE_FIELDS = (
    ("newspaper", "//*[@id='article-info-wrapper']/h4/a", 2, False),
    ("date", "//*[@id='article-info-wrapper']/p", 2, False),
    ("title", "//*[@id='article-info-wrapper']/h2", 2, False),
    ("header", "//*[@id='article-info-wrapper']/h5", 1, True),
    ("authors", "//*[@id='article-info-wrapper']/div[1]/div/span/a", 2, False),
    ("text", "//*[@id='article-text']", 2, False),
)


def _wait_for_text(driver, xpath, timeout):
    """Return the text of the first element matching ``xpath``, waiting up to ``timeout`` seconds."""
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )
    return element.text


def get_article_metadata(driver, article_url):
    """Open an article page and read its metadata fields.

    Lookups are best-effort: a field whose element cannot be read within its
    wait time is left as an empty string and the remaining fields are still
    attempted. Errors raised while loading the page itself are not caught.

    Args:
        driver: Selenium WebDriver used to load the page.
        article_url: URL of the article to open.

    Returns:
        dict: Keys ``newspaper``, ``date``, ``section``, ``header``,
        ``title``, ``subtitle``, ``page``, ``authors`` and ``text``, all
        strings. ``section``, ``subtitle`` and ``page`` are never filled in
        by this function and are always ``""``.
    """
    print(f"Starting to fetch metadata from the article: {article_url}")
    driver.get(article_url)
    time.sleep(0.5)

    # Key order is kept as-is; it determines the column order downstream.
    metadata = {
        "newspaper": "",
        "date": "",
        "section": "",
        "header": "",
        "title": "",
        "subtitle": "",
        "page": "",
        "authors": "",
        "text": ""
    }

    for key, xpath, timeout, quiet in _ARTICLE_FIELDS:
        print(f"Waiting for the {key}...")
        try:
            value = _wait_for_text(driver, xpath, timeout)
        except Exception as e:
            # Deliberately broad: any lookup failure just leaves the field empty.
            if not quiet:
                print(f"Error fetching the {key}: {e}")
            continue

        if quiet:
            metadata[key] = value.strip()
            continue

        # The article body can be long, so only its first 100 characters are logged.
        shown = f"{value[:100]}..." if key == "text" else value
        print(f"{key.capitalize()} found: {shown}")
        metadata[key] = value

    return metadata

# Function to login to InfoBiro
def login_to_infobiro(driver, username, password):
    """Log in to InfoBiro through its login form.

    Opens ``https://infobiro.ba/login``, fills the first text input and the
    password input, and clicks the "Prijavi se" button (falling back to a
    JavaScript click if the normal click is intercepted). Login counts as
    successful when, five seconds after the click, the browser is on
    ``https://infobiro.ba/``.

    Args:
        driver: Selenium WebDriver whose session should become logged in.
        username: Account user name typed into the form.
        password: Account password typed into the form.

    Returns:
        bool: ``True`` if the success check passed, ``False`` if it did not
        or if any error occurred (errors are printed, not raised).
    """
    try:
        print("Opening login page...")
        driver.get("https://infobiro.ba/login")
        time.sleep(3)  # Wait for the page to load

        print("Entering username...")
        username_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@type='text']"))
        )
        username_field.send_keys(username)

        print("Entering password...")
        password_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@type='password']"))
        )
        password_field.send_keys(password)

        print("Clicking login button...")
        login_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'Prijavi se')]"))
        )
        try:
            login_button.click()
        except ElementClickInterceptedException:
            # Something overlays the button; click it from inside the page instead.
            driver.execute_script("arguments[0].click();", login_button)

        time.sleep(5)  # Wait after login
        logged_in = driver.current_url == "https://infobiro.ba/"
        if logged_in:
            print("Login successful!")
        else:
            print("Login failed! Check username and password.")
        return logged_in
    except Exception as e:
        print(f"Login error: {e}")
        return False


# Function to fetch metadata for multiple articles
def get_multiple_articles_metadata(article_links, username="your_username", password="your_password"):
    """Log in once and fetch the metadata of every article in ``article_links``.

    The browser is created with the file's ``web_driver()`` factory, so it
    runs with the same headless configuration as the rest of the scraper,
    and it is always closed, even if login or an article fetch raises.

    Args:
        article_links: Iterable of article URLs.
        username: InfoBiro user name; the default is a placeholder that must
            be replaced with real credentials.
        password: InfoBiro password; the default is likewise a placeholder.

    Returns:
        list[dict]: One metadata dict per link, in iteration order.
    """
    driver = web_driver()
    try:
        login_to_infobiro(driver, username, password)

        articles_metadata = []
        for index, link in enumerate(article_links, start=1):
            print(f"Fetching metadata for article {index}: {link}")
            articles_metadata.append(get_article_metadata(driver, link))

        return articles_metadata
    finally:
        driver.quit()


# Main execution
# Use the links collected from the calendar pages above. The previous
# placeholder `[...]` replaced them with a list holding only `Ellipsis`,
# which would then have been passed to the browser as a URL.
# Empty entries are dropped and the links are sorted so the output order is
# the same from run to run.
article_links = sorted(link for link in article_links if link)
articles_data = get_multiple_articles_metadata(article_links)

# Save to DataFrame and JSON file (one JSON record per line, non-ASCII
# characters written as-is)
df = pd.DataFrame(articles_data)
df.to_json("Infobiro_Dnevni_Avaz.json", orient="records", lines=True, force_ascii=False)
print("Metadata saved to 'Infobiro_Dnevni_Avaz.json'.")