In [2]:
## Run selenium and chrome driver to scrape data from archiveofourown.org
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [3]:
# Setup chrome options
chrome_options = Options()
# Request headless mode through a command-line switch. The `Options.headless`
# setter was deprecated and later removed in Selenium 4; on releases without it
# the assignment silently becomes a plain attribute and Chrome opens with a GUI.
chrome_options.add_argument("--headless")  # Ensure GUI is off

# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(os.path.join(homedir, "ao3lockwood-co", "chromedriver"))

# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Get the first page of the works listing for the tag
pagenum = 1
link = f"https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page={pagenum}"
browser.get(link)

# Read the total number of listing pages from the page itself.
# NOTE(review): this assumes the 13th <li> of the first <ol> holds the last
# page number; the lookup is position-dependent, so confirm it against the
# live page whenever the listing layout or the number of pages changes.
maxpagenum = int(browser.find_element(By.XPATH, '//ol[1]/li[13]').text.strip())


In [4]:
def get_links(browser):
    """Collect one URL per listed item from the page currently loaded in ``browser``.

    Args:
        browser: a Selenium WebDriver (or compatible object) that has already
            navigated to a listing page.

    Returns:
        A list with the ``href`` of the first anchor found inside the ``<h4>``
        of every ``<li>`` under the second ``<ol>`` of the page, in page order.
    """
    listing = browser.find_elements(By.XPATH, '//ol[2]/li')
    # For each entry: locate its <h4>, take the first <a> inside it, read its href.
    return [
        entry.find_element(By.TAG_NAME, 'h4')
        .find_elements(By.TAG_NAME, 'a')[0]
        .get_attribute("href")
        for entry in listing
    ]
def process_pages(browser, maxpagenum, base_url=None, delay=10):
    """Collect the work links from every listing page, from page 1 to ``maxpagenum``.

    The browser is expected to already show page 1; pages 2..``maxpagenum`` are
    loaded here one after another.

    Args:
        browser: Selenium WebDriver currently displaying page 1 of the listing.
        maxpagenum: number of the last listing page to visit (inclusive).
        base_url: listing URL without the ``?page=`` query. Defaults to the
            Lockwood & Co. works listing on archiveofourown.org.
        delay: seconds to pause before each page load, to keep the request
            rate low. Defaults to 10.

    Returns:
        A flat list with the links gathered by ``get_links`` on every page,
        in page order.
    """
    if base_url is None:
        base_url = "https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works"

    # Log when the run started (day, month, year, hour, minute).
    dt_string = datetime.now().strftime("%d%m%Y_%H%M")
    print("date and time =", dt_string)

    # Page 1 is already loaded in the browser, so harvest it before navigating.
    data_list = get_links(browser)
    print('Page 1 has been processed')

    # Visit the remaining pages and append their links to the list
    for pagenum in range(2, maxpagenum + 1):
        time.sleep(delay)
        print(f'Processing page {pagenum}/{maxpagenum}')
        browser.get(f"{base_url}?page={pagenum}")
        data_list.extend(get_links(browser))
        print(len(data_list))

    return data_list

# Walk every listing page and gather the work links.
try:
    links_list = process_pages(browser, maxpagenum)
finally:
    # Always shut the browser down, even when scraping fails part-way, so no
    # orphaned Chrome/chromedriver process is left behind. All links are
    # already held in memory at this point, so no extra wait is needed first.
    browser.quit()

# Persist the collected links, one per row, under a single 'links' column.
link = pd.DataFrame(links_list, columns=['links'])
link.to_csv('links.csv', index=False)

date and time = 11052023_1621
Page 1 has been processed
Processing page 2/74
40
Processing page 3/74
60
Processing page 4/74
80
Processing page 5/74
100
Processing page 6/74
120
Processing page 7/74
140
Processing page 8/74
160
Processing page 9/74
180
Processing page 10/74
200
Processing page 11/74
220
Processing page 12/74
240
Processing page 13/74
260
Processing page 14/74
280
Processing page 15/74
300
Processing page 16/74
320
Processing page 17/74
340
Processing page 18/74
360
Processing page 19/74
380
Processing page 20/74
400
Processing page 21/74
420
Processing page 22/74
440
Processing page 23/74
460
Processing page 24/74
480
Processing page 25/74
500
Processing page 26/74
520
Processing page 27/74
540
Processing page 28/74
560
Processing page 29/74
580
Processing page 30/74
600
Processing page 31/74
620
Processing page 32/74
640
Processing page 33/74
660
Processing page 34/74
680
Processing page 35/74
700
Processing page 36/74
720
Processing page 37/74
740
Processing page 38/

In [5]:
def _field_text(soup, tag, css_class, default=None):
    """Return the text of the first ``tag`` element whose class is ``css_class``, or ``default`` if there is none."""
    node = soup.find(tag, attrs={'class': css_class})
    return default if node is None else node.get_text()


def _all_field_texts(soup, css_class):
    """Return the stripped text of every ``dd`` element whose class is ``css_class``."""
    return [node.get_text().strip() for node in soup.find_all('dd', attrs={'class': css_class})]


def _parse_work(soup, link):
    """Build one result row from a parsed work page, or return ``None`` if the page lacks a mandatory field."""
    title = _field_text(soup, 'h2', 'title heading')
    published = _field_text(soup, 'dd', 'published')
    chapters = _field_text(soup, 'dd', 'chapters')
    language = _field_text(soup, 'dd', 'language')
    words = _field_text(soup, 'dd', 'words')
    warning = _field_text(soup, 'dd', 'warning tags')
    rating_tag = _field_text(soup, 'dd', 'rating tags')

    # These fields have no fallback value; if one is absent the response was
    # not a readable work page, so report that instead of raising.
    if None in (title, published, chapters, language, words, warning, rating_tag):
        return None

    author_node = soup.find('a', attrs={'rel': 'author'})
    summary = _field_text(soup, 'div', 'summary module')
    # NOTE(review): each list holds the full text of every matching <dd>; if a
    # page has a single <dd> per category, 'mainship' is that whole block of
    # text rather than one individual tag - confirm against a live page.
    ships_list = _all_field_texts(soup, 'relationship tags')

    return {
        'link': link,
        'title': title.replace('\n', '').strip(),
        'author': author_node.get_text() if author_node is not None else "Anonymous",
        'published': published,
        # Without a status entry, the publication date doubles as the update date.
        'updatedate': _field_text(soup, 'dd', 'status', default=published),
        'chapters': chapters,
        'language': language.replace('\n', '').strip(),
        'words': words,
        'kudos': _field_text(soup, 'dd', 'kudos', default=0),
        'comments': _field_text(soup, 'dd', 'comments', default=0),
        'bookmarks': _field_text(soup, 'dd', 'bookmarks', default=0),
        'hits': _field_text(soup, 'dd', 'hits', default=0),
        'warning': warning.replace('\n', '').strip(),
        'mainship': ships_list[0] if ships_list else 'None',
        'relationship': ships_list,
        'characters': _all_field_texts(soup, 'character tags'),
        'tags': _all_field_texts(soup, 'freeform tags'),
        'summary': np.nan if summary is None else summary.replace('\n', ' ').replace('Summary:', '').strip(),
        'rating': rating_tag.replace('\n', '').strip()
    }


def get_data(links_list, delay=10, timeout=30):
    """Download every work page in ``links_list`` and collect its metadata.

    Each link is requested with ``?view_adult=true`` appended. Links whose
    request fails or times out, and links whose response lacks a mandatory
    field, are skipped, collected, and printed once the loop has finished.

    Args:
        links_list: sized sequence of work URLs.
        delay: seconds to pause between two consecutive requests. Defaults to 10.
        timeout: seconds to wait for a response before giving up on a link.
            Defaults to 30.

    Returns:
        A pandas DataFrame with one row per successfully parsed work and the
        columns link, title, author, published, updatedate, chapters,
        language, words, kudos, comments, bookmarks, hits, warning, mainship,
        relationship, characters, tags, summary and rating.
    """
    data = []
    slow_links = []  # Links that could not be fetched or parsed
    total = len(links_list)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}

    # enumerate gives the running position directly; looking it up with
    # list.index() on every iteration is quadratic and wrong for duplicates.
    for position, x in enumerate(links_list, start=1):
        if position > 1:
            # Pause between requests, including after a failed one.
            time.sleep(delay)

        newlink = f"{x}?view_adult=true"
        print(f"getting missing data {position}/{total}")
        try:
            # Without a timeout a stalled connection would block forever and
            # the handler below could never fire for a slow link.
            source = requests.get(newlink, headers=headers, timeout=timeout).text
        except requests.exceptions.RequestException:
            print(f"Link {x} is taking too long to access. Adding to slow_links list.")
            slow_links.append(x)
            continue

        row = _parse_work(BeautifulSoup(source, 'html.parser'), x)
        if row is None:
            print(f"Link {x} did not return a readable work page. Adding to slow_links list.")
            slow_links.append(x)
            continue
        data.append(row)

    for l in slow_links:
        print(l)

    return pd.DataFrame(data)

# Fetch the metadata of every collected work into a single DataFrame.
# This is the slow step: get_data issues one request per link and pauses between them.
final = get_data(links_list)

getting missing data 1/1473
getting missing data 2/1473
getting missing data 3/1473
getting missing data 4/1473
getting missing data 5/1473
getting missing data 6/1473
getting missing data 7/1473
getting missing data 8/1473
getting missing data 9/1473
getting missing data 10/1473
getting missing data 11/1473
getting missing data 12/1473
getting missing data 13/1473
getting missing data 14/1473
getting missing data 15/1473
getting missing data 16/1473
getting missing data 17/1473
getting missing data 18/1473
getting missing data 19/1473
getting missing data 20/1473
getting missing data 21/1473
getting missing data 22/1473
getting missing data 23/1473
getting missing data 24/1473
getting missing data 25/1473
getting missing data 26/1473
getting missing data 27/1473
getting missing data 28/1473
getting missing data 29/1473
getting missing data 30/1473
getting missing data 31/1473
getting missing data 32/1473
getting missing data 33/1473
getting missing data 34/1473
getting missing data 35

KeyboardInterrupt: 

In [None]:
# Split the chapters column ("written/planned") into chapter and chapter_max.
# n=1 caps the split at two parts so the two-column assignment cannot fail on
# an unexpected extra "/".
final[['chapter','chapter_max']] = final.chapters.str.split("/", n=1, expand=True)
# A work counts as completed only when both parts are equal; compare the two
# columns in one vectorized step instead of a row-by-row apply.
final['completion'] = np.where(final['chapter'] == final['chapter_max'], 'completed', 'incomplete')


In [None]:
# Write the finished table to disk without the index column.
# The name carries a fixed date/time stamp, so every run writes to this same file.
filename = 'ao3_lockwood_and_co_ao_11052023_1335.csv'
final.to_csv(filename, index=False)