## <center>Scraping First Monday (firstmonday.org)</center>

Prerequisites:
* Place `chromedriver.exe` in a `chromedriver-win64` subfolder of the directory this notebook is run from.

In [1]:
%run 00_lib_preprocessing.ipynb
%run 00_lib_sqlwriter.ipynb

In [2]:
# ##### Web Scraper Using Selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

import pandas as pd
import re
from datetime import datetime, timezone
import time
import os
import html

class FirstMondayBot:
    """
    A bot to automate the scraping of journals using Selenium.
    """
    def __init__(self, driver, mysql_writer):
        # Initializes the bot with a Selenium WebDriver instance and a MySQLWriter instance.
        self.driver = driver
        self.mysql_writer = mysql_writer

    
    def scrape_archives_urls (self, base_url, max_pages=1):
        # Scrapes archives URLs and stores them in the 'archives' table.

        current_page = 0

        try:
            # Navigate to the base URL
            self.driver.get(base_url)
            time.sleep(2)

            while current_page <= max_pages:
                current_page += 1
                print(f"Scraping page {current_page}...")

                # Wait for the issues_archive to be visible
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "issues_archive"))
                )

                # Find and extract links
                struct_item_container = self.driver.find_element(By.CLASS_NAME, "issues_archive")
                post_links = struct_item_container.find_elements(By.XPATH, ".//a[@href]")

                # Collect unique archives URLs from the current page
                base_urls = set()
                for link in post_links:
                    # archive_url, volume_number, archive_title, archive_publication_date, editor, import_date
                    url = link.get_attribute("href")
                    volume_number = ''
                    archive_title = ''
                    archive_publication_date = '1900-01-01'
                    editor = ''
                    base_urls.add((url, volume_number, archive_title, archive_publication_date, editor, datetime.now(), ''))

                # Convert the set of URLs to a DataFrame
                if base_urls:
                    archives_df = pd.DataFrame(list(base_urls), columns=['archive_url','volume_number', 'archive_title', 'archive_publication_date', 'editor', 'import_date', 'status'])
                    
                    # Insert into the database
                    self.mysql_writer.insert_archives(archives_df)
                    print(f"Inserted {len(base_urls)} archives URLs from page {current_page}.")

                # Attempt to click the "next" button to proceed to the next page
                try:
                    next_button = WebDriverWait(self.driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//a[contains(@class, 'next')]"))
                    )
                    self.driver.execute_script("arguments[0].click();", next_button)

                    # Wait for the new page content to load by checking the staleness of the previous container
                    WebDriverWait(self.driver, 3).until(EC.staleness_of(struct_item_container))
                    
                except Exception as e:
                    print("No more pages or error clicking the 'next' button:", e)
                    break
                
        except Exception as e:
            print(f"An error occurred while scraping archives: {e}")

    
    def scrape_archives (self):
        try:
            df = self.mysql_writer.read_table_archives(status=None)
            
            for _, row in df.iterrows():
                archive_url = row['archive_url']
                print('Scraping archive_url:', archive_url)

                self.driver.get(archive_url)
                time.sleep(2)

                bsObj = BeautifulSoup(self.driver.page_source, 'lxml') # Get the page source and parse it with Beautiful Soup

                volume_number = ''
                archive_title = ''
                archive_publication_date = '1900-01-01'
                editor = ''

                volume_number = bsObj.find("h1").getText().strip()
                # print('volume_number: ', volume_number)

                # Find the <div> with class "description"
                description_div = bsObj.find("div", {"class": "description"})
                # print('description:', description_div)

                if description_div:
                    # Extract title and editor
                    paragraph = description_div.find("p")
                    # print('paragraph:', paragraph)
                    
                    if paragraph:
                        try:
                            description = paragraph.get_text(separator='\n').strip()
                            lines = description.split('\n')
                            # print('lines:', lines)

                            archive_title = lines[0].strip()
                            # print('archive_title:', archive_title)

                            # Check if lines[1] contains "edited by" or "co-edited by" 
                            if any(keyword in description.lower() for keyword in ["edited by", "co-edited by"]): 
                                editor = lines[1].strip() 
                                editor = re.sub(r'Edited by|Co-edited by', '', editor, flags=re.IGNORECASE).strip() 
                                # print('editor:', editor)

                        except Exception as e:
                            print("An error occurred:", e)
                            continue

                published_div = bsObj.find("div", {"class": "published"})
                if published_div:
                    archive_publication_date = published_div.find("span", {"class": "value"}).text.strip()
                    # print('archive_publication_date:', archive_publication_date)

                base_urls = set()
                base_urls.add((archive_url, volume_number, archive_title, archive_publication_date, editor, datetime.now(), 'PENDING'))
            
                # Update archives table
                archives_df = pd.DataFrame(list(base_urls), columns=['archive_url', 'volume_number', 'archive_title', 'archive_publication_date', 'editor', 'import_date', 'status'])
                self.mysql_writer.update_archives(archives_df)

        except Exception as e:
            print("An error occurred:", e)

    
    def scrape_articles_urls (self):
        try:
            df = self.mysql_writer.read_table_archives(status="PENDING")
            article_urls = set()
            archive_urls = set()

            for _, row in df.iterrows():
                archive_url = row['archive_url']

                self.driver.get(archive_url)
                time.sleep(3)

                bsObj = BeautifulSoup(self.driver.page_source, 'lxml') # Get the page source and parse it with Beautiful Soup

                # article_url, article_title, doi, article_publication_date, author, archive_url, import_date
                article_url = ''
                article_title = ''
                doi = ''
                article_publication_date = '1900-01-01'
                author = ''
                keyword= ''
                abstract = ''
                content_url = ''

                articles = bsObj.find_all("h3", {"class": "title"})
                # print (articles)
                for article in articles:
                    a_tag = article.find('a')
                    if a_tag:
                        article_url = a_tag['href']
                        # print('article_url: ', article_url)
                        article_urls.add((article_url, article_title, doi, article_publication_date, author, keyword, abstract, archive_url, content_url, datetime.now(), 'PENDING'))
                        archive_urls.add((archive_url,'COMPLETED'))

                articles_df = pd.DataFrame(list(article_urls), columns=['article_url', 'article_title', 'doi', 'article_publication_date', 'author', 'keyword', 'abstract', 'archive_url', 'content_url', 'import_date', 'status'])
                self.mysql_writer.insert_articles(articles_df)

                archives_df = pd.DataFrame(list(archive_urls), columns=['archive_url', 'status'])
                self.mysql_writer.update_archives_status(archives_df) # update archive status to 'completed'
                
            # print(f"Inserted article URL: {(article_urls)}")

        except Exception as e:
            print("An error occurred:", e)

    
    def scrape_articles (self):
        try:
            df = self.mysql_writer.read_table_articles(status="PENDING")
            article_urls = set()

            for _, row in df.iterrows():
                count=+1
                article_url = row['article_url']
                print('----------------Scraping article_url:', article_url, '----------------')

                archive_url = row['archive_url']
                # print('archive_url:', archive_url)

                self.driver.get(article_url)
                time.sleep(2)

                bsObj = BeautifulSoup(self.driver.page_source, 'lxml') # Get the page source and parse it with Beautiful Soup
                
                # article_url, title, doi, publication_date, author, abstract, archive_url, import_date
                article_title = ''
                doi = ''
                article_publication_date = '1900-01-01'
                author = ''
                keyword = ''
                abstract = ''
                content_url = ''
                error = ''

                if bsObj.find("div", {"class": "error-code"}):
                    error = (bsObj.find("div", {"class": "error-code"}).get_text().strip())
                    print ('error:', error)
                    if error.lower() == 'http error 500':
                        continue

                # Find the <div> with class "description"
                article_title = bsObj.find("h1", {"class": "page_title"})
                if article_title: article_title = article_title.get_text().strip()
                print('article_title:', article_title)

                doi_section = bsObj.find("section", {"class": "item doi"})
                if doi_section: doi = doi_section.find('a').get('href')
                print('doi:', doi)

                date_section = bsObj.find("div", {"class": "item published"})
                if date_section: date_section = date_section.find("span")
                if date_section: article_publication_date = date_section.get_text().strip()
                article_publication_date = article_publication_date[:10]
                print('article_publication_date:', article_publication_date)

                author_section = bsObj.find("section", {"class": "item authors"})
                if author_section: author_section = author_section.find_all("span",{"class":"name"})
                if author_section:  author = [author.get_text().strip() for author in author_section]
                author = ', '.join(author)
                print('author:', author)

                keyword_section = bsObj.find("section", {"class": "item keywords"})
                if keyword_section: 
                    keyword_section = keyword_section.find("span",{"class":"value"})
                    if keyword_section:  
                        keyword_text = [keyword.get_text().strip() for keyword in keyword_section] 
                        if keyword_text: 
                            keyword_text = ", ".join(keyword_text)
                            # Clean keyword text
                            text_preprocessor = TextPreprocessor()
                            keyword = text_preprocessor.preprocess_text(keyword_text)
                print('keyword:', keyword)

                abstract_section = bsObj.find("section", {"class": "item abstract"})
                if abstract_section:
                    abstract = abstract_section.find("p")
                    if  abstract:
                        abstract = abstract.get_text().strip()
                    else:
                        abstract = abstract_section.get_text(separator=" ", strip=True).strip()
                        abstract = re.sub(r'^\babstract\b', '', abstract, flags=re.IGNORECASE).replace('\n', '')
                print('abstract:', abstract)

                a_tag = bsObj.find("a", {"class": "obj_galley_link file"})
                if a_tag: content_url = a_tag.get('href')
                print('content_url:', content_url)

                article_urls.add((article_url, article_title, doi, article_publication_date, author, keyword, abstract, archive_url, content_url, datetime.now(), 'COMPLETED'))

                # Update archives table                
                articles_df = pd.DataFrame(list(article_urls), columns=['article_url', 'article_title', 'doi', 'article_publication_date', 'author', 'keyword', 'abstract', 'archive_url', 'content_url', 'import_date', 'status'])
                self.mysql_writer.update_articles(articles_df)
                time.sleep(3)

        except Exception as e:
            print("An error occurred:", e)

    
    def selectively_escape(self, text):
        # List of characters to escape
        problematic_chars = ['<', '>', '&', '"', "'", '\n', '\r', '\t']
        escape_map = {
            '<': '&lt;',
            '>': '&gt;',
            '&': '&amp;',
            '"': '&quot;',
            "'": '&#39;',
            '\n': '&#10;',
            '\r': '&#13;',
            '\t': '&#9;',
        }

        # Escape the problematic characters in the text
        for char in problematic_chars:
            text = text.replace(char, escape_map[char])

        return text
    
    
    def scrape_contents(self):
        try:
            df = self.mysql_writer.read_table_articles_wo_contents()

            for _, row in df.iterrows():
                content_url = row['content_url']
                print('----------------Scraping content_url:', content_url, '----------------')

                # Navigate to the content URL using Selenium
                self.driver.get(content_url)
                time.sleep(5)  # Allow time for the page to load

                # Wait for the iframe to be present and extract it using Selenium
                iframe_element = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "iframe"))
                )
                iframe_url = iframe_element.get_attribute("src").strip()
                print("iframe_url:", iframe_url)

                # Set headers to mimic a real browser request
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                "AppleWebKit/537.36 (KHTML, like Gecko) "
                                "Chrome/111.0.0.0 Safari/537.36",
                    "Referer": content_url  # Some servers require a valid referer
                }

                # Fetch the iframe content using requests with headers
                iframe_response = requests.get(iframe_url, headers=headers)
                if iframe_response.status_code != 200:
                    print(f"Failed to retrieve iframe content from {iframe_url} with status code {iframe_response.status_code}")
                    continue

                # Parse the iframe HTML content with BeautifulSoup
                try:
                    iframe_soup = BeautifulSoup(iframe_response.text, "lxml")
                except Exception:
                    print("lxml not available, using default HTML parser.")
                    iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
                iframe_text = iframe_soup.get_text(separator="\n", strip=True)

                # Escape any special characters (like <, >, &, etc.) in the iframe text
                # iframe_text_cleaned = self.selectively_escape(iframe_text)
                
                # Insert to database
                contents_df = pd.DataFrame([[content_url, iframe_url, iframe_text]], columns=['content_url', 'iframe_url', 'content'])
                self.mysql_writer.insert_contents(contents_df)

                time.sleep(3)

        except Exception as e:
            print("An error occurred during scraping contents:", e)

    
    def close(self):
        # Closes the Selenium WebDriver instance.
        self.driver.quit()

In [3]:
# ##### Main Execution: Initializing and Running the Scraper

import os
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Setup Chrome options for Selenium WebDriver
chrome_options = Options()
# chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-software-rasterizer")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/111.0.0.0 Safari/537.36")

# Initialize the WebDriver; chromedriver is looked up under
# <current working directory>/chromedriver-win64/chromedriver.exe
chrome_driver_path = os.path.join(os.getcwd(), "chromedriver-win64", "chromedriver.exe")
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Archive listing page of the journal
URL = "https://firstmonday.org/ojs/index.php/fm/issue/archive"

# MySQL connection details. Each value can be overridden with an environment
# variable so real credentials need not be stored in the notebook; the
# fallbacks are the values this notebook previously hard-coded.
DB_HOST = os.environ.get("FM_DB_HOST", "localhost")
DB_USER = os.environ.get("FM_DB_USER", "root")
DB_PASSWORD = os.environ.get("FM_DB_PASSWORD", "root")
DB_NAME = os.environ.get("FM_DB_NAME", "fmdb")

# Initialize the MySQLWriter instance. If that fails, quit the browser that
# was just started instead of leaving it running.
try:
    mysql_writer = MySQLWriter(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
except Exception:
    driver.quit()
    raise

# Create an instance of FirstMondayBot
bot = FirstMondayBot(driver, mysql_writer)

try:
    # Scrape archives' URLs and store them in the 'archives' table
    BASE_FORUM_URL = "https://firstmonday.org/ojs/index.php/fm/issue/archive/"
    MAX_PAGES = 50  # Max page 50

    # Pipeline steps 1-4 are disabled below; uncomment the ones to run.

    # # print("1. Inserting archive URLs...")
    # bot.scrape_archives_urls (BASE_FORUM_URL, MAX_PAGES)

    # # Scrape archives and store them in the 'archives' table
    # print("2. Updating archives...")
    # bot.scrape_archives()

    # # print("3. Inserting article URLs...")
    # bot.scrape_articles_urls()

    # # print("4. Updating articles...")
    # bot.scrape_articles()

    # print("5. Inserting html contents...")
    bot.scrape_contents()

except Exception as e:
    print("An error occurred during scraping:", e)

finally:
    # Close the WebDriver and the MySQL connection. The nested finally makes
    # sure the connection is closed even if quitting the browser raises.
    try:
        bot.close()
    finally:
        mysql_writer.close_connection()

Table 'volumes' is ready.
Table 'archives' is ready.
Table 'articles' is ready.
Table 'contents' is ready.
Table 'authors' is ready.
Table 'authors_articles' is ready.
----------------Scraping content_url: https://firstmonday.org/ojs/index.php/fm/article/view/828/737 ----------------
iframe_url: https://firstmonday.org/ojs/index.php/fm/article/download/828/737?inline=1
lxml not available, using default HTML parser.
Inserted article URL: https://firstmonday.org/ojs/index.php/fm/article/view/828/737
MySQL connection closed.
