In [73]:
## Use Selenium with ChromeDriver (plus requests/BeautifulSoup) to scrape work data from archiveofourown.org
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [74]:
## Set up Chrome options
chrome_options = Options()
# Run Chrome without a GUI. The `Options.headless` property setter was
# deprecated and later removed in Selenium 4.x, so the flag is passed
# directly; `--headless` is also accepted by older Selenium releases.
chrome_options.add_argument("--headless")
# chrome_options.add_argument("--window-size=1920,1200")

In [75]:
# Location of the chromedriver binary, relative to the current user's home
# directory. Adjust to match your own setup.
homedir = os.path.expanduser("~")
chromedriver_path = f"{homedir}/ao3lockwood-co/chromedriver"
webdriver_service = Service(chromedriver_path)

In [76]:
# Start a Chrome session using the driver service and options configured above
browser = webdriver.Chrome(
    options=chrome_options,
    service=webdriver_service,
)

In [77]:
# Open the requested page of the tag's works listing
pagenum = 1
link = f"https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page={pagenum}"
browser.get(link)

# Read the text of the 13th <li> of the first matching <ol> and parse it as
# the highest page number.
# NOTE(review): this XPath is position-dependent; presumably it targets the
# last numbered entry of the pagination bar -- confirm it still holds when the
# listing has fewer pages.
last_page_item = browser.find_element(By.XPATH, '//ol[1]/li[13]')
maxpagenum = int(last_page_item.text.strip())

In [78]:
def get_links(browser):
    """Collect the link of every work listed on the page loaded in `browser`.

    Parameters
    ----------
    browser : selenium WebDriver
        A driver that has already navigated to a works listing page. Only
        `find_elements` is called on it.

    Returns
    -------
    list of str
        The `href` of the first anchor inside each work's ``<h4>`` heading,
        in page order. Works whose heading contains no anchor, or whose
        anchor has no `href`, are skipped.
    """
    # Each work on the listing page is an <li> of the second <ol>
    works = browser.find_elements(By.XPATH, '//ol[2]/li')
    data = []
    for work in works:
        h4 = work.find_element(By.TAG_NAME, 'h4')
        anchors = h4.find_elements(By.TAG_NAME, 'a')
        if not anchors:
            # Nothing to link to; skip rather than raise IndexError
            continue
        # The first anchor of the heading is taken as the work's link
        href = anchors[0].get_attribute("href")
        if href:
            data.append(href)
    return data
    

In [79]:
# Get the work links from the page that is currently loaded
try:
    links_list = get_links(browser)
    # Wait for 10 seconds
    time.sleep(10)
finally:
    # Always shut the browser down, even if link extraction fails, so no
    # Chrome/chromedriver process is left running.
    browser.quit()

In [None]:
def _find_text(soup, name, attrs, default):
    """Return the raw text of the first matching tag, or `default` if there is none."""
    tag = soup.find(name, attrs=attrs)
    return default if tag is None else tag.get_text()


def _squash(text):
    """Remove newlines and surrounding whitespace; pass non-string defaults through."""
    return text.replace('\n', '').strip() if isinstance(text, str) else text


def _tag_texts(soup, css_class):
    """Return the stripped text of every `a.tag` link inside the `dd` with class `css_class`."""
    container = soup.find('dd', attrs={'class': css_class})
    if container is None:
        return []
    return [a.get_text().strip() for a in container.find_all('a', attrs={'class': 'tag'})]


def get_data(links_list, delay=10, timeout=30):
    """Download each work page and extract its metadata into a DataFrame.

    Parameters
    ----------
    links_list : sequence of str
        Work URLs; ``?view_adult=true`` is appended to each before fetching.
    delay : float, default 10
        Seconds to sleep after each request.
    timeout : float, default 30
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One row per input link, in input order. Fields that cannot be found
        on a page are filled with a default: ``"Anonymous"`` for the author,
        ``0`` for kudos/comments/bookmarks/hits, ``'None'`` / ``[]`` for the
        relationship, character and tag fields, and NaN for everything else.
        A failed request produces a row made entirely of these defaults
        instead of aborting the run.
    """
    data = []
    total = len(links_list)
    for counter, x in enumerate(links_list, start=1):
        newlink = x + '?view_adult=true'
        print(f"getting missing data {counter}/{total}")
        try:
            source = requests.get(
                newlink,
                headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
                timeout=timeout,
            ).text
        except requests.RequestException as err:
            # Keep going: an empty document yields a row of defaults for this link
            print(f"request failed for {newlink}: {err}")
            source = ''
        soup = BeautifulSoup(source, 'html.parser')

        relationship_list = _tag_texts(soup, 'relationship tags')
        # The first listed relationship is treated as the main one
        mainrelationship = relationship_list[0] if relationship_list else 'None'

        summary = _find_text(soup, 'div', {'class': 'summary module'}, None)
        if summary is None:
            summary = np.nan
        else:
            summary = summary.replace('\n', ' ').replace('Summary:', '').strip()

        row = {'link': x,
               'title': _squash(_find_text(soup, 'h2', {'class': 'title heading'}, np.nan)),
               'author': _find_text(soup, 'a', {'rel': 'author'}, "Anonymous"),
               # NOTE: this column is filled from the page's "published" field
               'updatedate': _find_text(soup, 'dd', {'class': 'published'}, np.nan),
               'chapters': _find_text(soup, 'dd', {'class': 'chapters'}, np.nan),
               'language': _squash(_find_text(soup, 'dd', {'class': 'language'}, np.nan)),
               'words': _find_text(soup, 'dd', {'class': 'words'}, np.nan),
               'kudos': _find_text(soup, 'dd', {'class': 'kudos'}, 0),
               'comments': _find_text(soup, 'dd', {'class': 'comments'}, 0),
               'bookmarks': _find_text(soup, 'dd', {'class': 'bookmarks'}, 0),
               'hits': _find_text(soup, 'dd', {'class': 'hits'}, 0),
               'warning': _squash(_find_text(soup, 'dd', {'class': 'warning tags'}, np.nan)),
               'mainship': mainrelationship,
               'relationship': relationship_list,
               'characters': _tag_texts(soup, 'character tags'),
               'tags': _tag_texts(soup, 'freeform tags'),
               'summary': summary,
               'rating': _squash(_find_text(soup, 'dd', {'class': 'rating tags'}, np.nan)),
               }
        data.append(row)
        # Pause between requests
        time.sleep(delay)
    return pd.DataFrame(data)

In [None]:
final = get_data(links_list)
# Split the "chapters" text on "/" into `chapter` and `chapter_max`
final[['chapter', 'chapter_max']] = final.chapters.str.split("/", expand=True)
# A work counts as completed when both parts are equal; the comparison is
# done column-wise instead of with a row-by-row apply.
final['completion'] = np.where(final['chapter'] == final['chapter_max'], 'completed', 'incomplete')
# Write the result (the original `final.to_csv.to_csv(...)` raised AttributeError)
final.to_csv('ao3_lockwood_and_co.csv', index=False)