In [1]:
## Run selenium and chrome driver to scrape data from archiveofourown.org
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
## Setup chrome options
chrome_options = Options()
# Pass the flag explicitly: the ``Options.headless`` setter was deprecated and
# later removed in Selenium 4, so assigning to it no longer hides the GUI.
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--window-size=1920,1200")

# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Get the first page of the tag's work listing
pagenum = 1
link = "https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page=" + str(pagenum)
browser.get(link)

# Get the total number of pages to scrape.
# NOTE(review): this reads the 13th <li> of the first matching <ol> and parses
# its text as an integer; presumably that item is the last page number of the
# pagination bar -- confirm against the live markup, since the position is
# hard-coded.
maxpagenum = int(browser.find_element(By.XPATH, '//ol[1]/li[13]').text.strip())

def _work_stat(work, class_name, default=0):
    """
    Reads one optional statistic from a work listing.
    :param work: WebElement for a single work entry
    :param class_name: CSS class of the statistic (e.g. 'kudos')
    :param default: value returned when the statistic is not present
    :return: stripped text of the second matching element, or ``default``
    """
    matches = work.find_elements(By.CLASS_NAME, class_name)
    # Index 1 is used throughout; presumably index 0 is the label and index 1
    # the value -- verify against the page markup.
    if len(matches) < 2:
        return default
    return matches[1].text.strip()


def get_title(browser):
    """
    Extracts data from each work listed on a page.

    For every work the link of the first anchor in its ``h4`` heading is
    collected together with language, words, kudos, comments, bookmarks and
    hits. Kudos, hits, comments and bookmarks fall back to ``0`` when the work
    does not show them; language and words are required and raise
    ``IndexError`` when absent.

    :param browser: Chrome Webdriver object
    :return: Pandas DataFrame
    """
    rows = []
    # Find all the fanfic works on the page and extract one row per work
    for work in browser.find_elements(By.XPATH, '//ol[2]/li'):
        heading = work.find_element(By.TAG_NAME, 'h4')
        title_anchor = heading.find_elements(By.TAG_NAME, 'a')[0]
        rows.append({
            'link': title_anchor.get_attribute("href"),
            'language': work.find_elements(By.CLASS_NAME, 'language')[1].text.strip(),
            'words': work.find_elements(By.CLASS_NAME, 'words')[1].text.strip(),
            'kudos': _work_stat(work, 'kudos'),
            'comments': _work_stat(work, 'comments'),
            'bookmarks': _work_stat(work, 'bookmarks'),
            'hits': _work_stat(work, 'hits'),
        })
    return pd.DataFrame(rows)

# Collect one DataFrame per listing page and concatenate once at the end.
# ``DataFrame.append`` was removed in pandas 2.0 and copied the whole frame on
# every call; ``pd.concat`` is the supported replacement.
page_frames = []
try:
    # Get data from the first page (already loaded in the browser)
    page_frames.append(get_title(browser))

    # Iterate through remaining pages
    for pagenum in range(2, maxpagenum + 1):
        time.sleep(10)  # pause between page loads
        print(f'procesing page {pagenum}')
        link = "https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page=" + str(pagenum)
        browser.get(link)
        page_frames.append(get_title(browser))

    # Wait for 10 seconds
    time.sleep(10)
finally:
    # Always shut the browser down, even if a page failed to scrape, so the
    # Chrome process is not left running.
    browser.quit()

data_list = pd.concat(page_frames, ignore_index=True)

# Remove empty cells and create a new DataFrame with missing data:
# data_list1 holds the rows that have at least one empty cell,
# data_list2 holds the rows whose link does not appear in data_list1.
data_list1 = data_list.replace('', np.nan)
data_list1 = data_list1[data_list1.isna().any(axis=1)]
data_list2 = data_list[~data_list['link'].isin(data_list1['link'])]

def _dd_text(soup, css_class, default=0):
    """
    Reads the text of the first ``<dd>`` element carrying ``css_class``.
    :param soup: parsed page (BeautifulSoup object)
    :param css_class: class attribute to look for
    :param default: value returned when no such element exists
    :return: element text, or ``default``
    """
    tag = soup.find('dd', attrs={'class': css_class})
    return default if tag is None else tag.get_text()


def get_missing(df, *, delay=10, timeout=30):
    """
    Extracts data for works that had missing cells in the original DataFrame.

    Each work page is fetched with ``?view_adult=true`` appended to its link
    and parsed for language, words, kudos, comments, bookmarks and hits.
    Kudos, comments, bookmarks and hits fall back to ``0`` when the page has
    no such ``<dd>`` element; language and words are required and raise
    ``AttributeError`` when absent.

    :param df: Pandas DataFrame with a 'link' column
    :param delay: seconds to sleep after each request
    :param timeout: seconds before a request is abandoned; without a timeout
        ``requests`` can block indefinitely on an unresponsive server
    :return: Pandas DataFrame
    """
    rows = []
    for work_url in df['link']:
        newlink = work_url + '?view_adult=true'
        print(newlink)
        response = requests.get(
            newlink,
            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
            timeout=timeout,
        )
        soup = BeautifulSoup(response.text, 'html.parser')
        row = {
            'link': work_url,
            'language': soup.find('dd', attrs={'class': 'language'}).get_text().replace('\n', '').strip(),
            'words': soup.find('dd', attrs={'class': 'words'}).get_text(),
            'kudos': _dd_text(soup, 'kudos'),
            'comments': _dd_text(soup, 'comments'),
            'bookmarks': _dd_text(soup, 'bookmarks'),
            'hits': _dd_text(soup, 'hits'),
        }
        print(row)
        rows.append(row)
        time.sleep(delay)  # pause between requests
    return pd.DataFrame(rows)
# Re-scrape the incomplete rows from their individual work pages
data_list3 = get_missing(data_list1)

# Combine the complete rows with the re-scraped ones. ``DataFrame.append``
# was removed in pandas 2.0; ``pd.concat`` is the supported replacement and,
# like the original call, keeps each frame's own index.
data_listfinal = pd.concat([data_list2, data_list3])

# Bare expression selecting rows that still contain NaN; it has no effect
# when run as a script and is only useful when evaluated interactively.
data_listfinal[data_listfinal.isna().any(axis=1)]

data_listfinal.to_csv('lockwood_part2.csv', index=False)