In [3]:
## Run selenium and chrome driver to scrape data from cloudbytes.dev
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:

## Setup chrome options
chrome_options = Options()
chrome_options.headless = True # Ensure GUI is off
chrome_options.add_argument("--window-size=1920,1200")

# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Get page
pagenum=1
link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
browser.get(link)
maxpagenum=int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())

def get_title(browser):
    """
    Extracts information for each work on a given page and returns a Pandas DataFrame.
    """
    # Find all the fanfic works on the page
    works = browser.find_elements(By.XPATH, '//ol[2]/li')
    
    # Iterate through each work and extract author and datetime
    data=[]
    for work in works:
        WebDriverWait(browser, 10)
        h4 = work.find_element(By.TAG_NAME,'h4')
        a = h4.find_elements(By.TAG_NAME, 'a')
        title = a[0].text.strip()
        try:
            author = a[1].text.strip()
        except:
            author = None
        links = a[0].get_attribute("href")
        datetime = work.find_element(By.CLASS_NAME, 'datetime').text.strip()
        chapters = work.find_elements(By.CLASS_NAME,'chapters')[1].text.strip()
        row = {'link':links, 
               'title':title,
               'author':author, 
               'updatedate':datetime,
               'chapters':chapters,
               }
        #print(row)
        data.append(row)
    return pd.DataFrame(data)

# Create an empty DataFrame to hold the data
data_list = pd.DataFrame()

# Iterate through each page and append the data to the DataFrame
for p in range(2,maxpagenum+1):
    pagenum=p
    time.sleep(10)
    print(f'procesing page {pagenum}')
    link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
    browser.get(link)
    data_list=data_list.append(get_title(browser), ignore_index=True)
    print(data_list)

#Wait for 10 seconds
time.sleep(10)
browser.quit()

# Split the chapter column into chapter and chapter_max, and create a completion column
data_list[['chapter','chapter_max']] = data_list.chapters.str.split("/", expand=True)
data_list['completion'] = data_list.apply(lambda row: 'completed' if row['chapter']==row['chapter_max'] else 'incomplete', axis=1)

# Save the DataFrame to a CSV file
data_list.to_csv('lockwood_part1.csv', index=False)

In [4]:
data_list=pd.read_csv('lockwood_part1.csv')

In [5]:
data_list1=data_list.replace('', np.nan)
data_list1=data_list1[data_list1.isna().any(axis=1)]
data_list1

Unnamed: 0,link,title,author,updatedate,chapters,chapter,chapter_max,completion
16,https://archiveofourown.org/works/46606960,Firsts,MagicPeach,19 Apr 2023,,,,incomplete
17,https://archiveofourown.org/works/46234456,,,,,,,incomplete
18,https://archiveofourown.org/works/44688367,,,,,,,incomplete
19,https://archiveofourown.org/works/46464568,,,,,,,incomplete
124,https://archiveofourown.org/works/46403539,Bite yours till it bleeds,,10 Apr 2023,1/1,1.0,1.0,completed
772,https://archiveofourown.org/works/44991853,prove it.,,13 Feb 2023,1/1,1.0,1.0,completed
822,https://archiveofourown.org/works/44793961,no game,,05 Feb 2023,1/1,1.0,1.0,completed
985,https://archiveofourown.org/works/29639628,Pierced Pretty,,22 Feb 2021,1/1,1.0,1.0,completed
994,https://archiveofourown.org/works/28339671,A christmas case,,26 Dec 2020,1/1,1.0,1.0,completed
1028,https://archiveofourown.org/works/20861537,And They Were Locked In,,07 Oct 2020,2/2,2.0,2.0,completed


In [7]:
data_list2 = data_list[~data_list['link'].isin(data_list1['link'])]
data_list2

Unnamed: 0,link,title,author,updatedate,chapters,chapter,chapter_max,completion
0,https://archiveofourown.org/works/45308677,"Rising, Rising",barronsfever (historyofbellarke),20 Apr 2023,14/14,14.0,14,completed
1,https://archiveofourown.org/works/46519282,Perfect,MarsStarPower,20 Apr 2023,7/?,7.0,?,incomplete
2,https://archiveofourown.org/works/46616701,yesterday we were just children,JennaTalbot,19 Apr 2023,1/7,1.0,7,incomplete
3,https://archiveofourown.org/works/46616434,waves that will never rest,ev0lution,19 Apr 2023,1/2,1.0,2,incomplete
4,https://archiveofourown.org/works/46616116,Just a dream?,Ren_Nakamura,20 Apr 2023,1/1,1.0,1,completed
...,...,...,...,...,...,...,...,...
1308,https://archiveofourown.org/works/3709479,That Green Gentleman (Things Have Changed),Amelia (BelowTheText),09 Apr 2015,1/1,1.0,1,completed
1309,https://archiveofourown.org/works/2282274,Ex Malo Bonum,lady_mab,01 Dec 2014,4/4,4.0,4,completed
1310,https://archiveofourown.org/works/1267453,and the world was gone,lady_mab,04 Mar 2014,7/7,7.0,7,completed
1311,https://archiveofourown.org/works/1169828,The Passage of Time,lady_mab,05 Feb 2014,1/1,1.0,1,completed


In [9]:
data_list1.columns

Index(['link', 'title', 'author', 'updatedate', 'chapters', 'chapter',
       'chapter_max', 'completion'],
      dtype='object')

In [18]:
def get_missing1(df):
    data=[]
    for x in df['link']:
        newlink=x+'?view_adult=true'
        print(newlink)
        source = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        soup = BeautifulSoup(source,'html.parser')
        title=soup.find('h2', attrs={'class':'title heading'}).get_text().replace('\n','').strip()
        try:
            author=soup.find('a', attrs={'rel':'author'}).get_text()
        except:
            author=np.nan
        datetime=soup.find('dd', attrs={'class':'published'}).get_text()
        chapters=soup.find('dd', attrs={'class':'chapters'}).get_text()
        row = {'link':x, 
               'title':title,
               'author':author, 
               'updatedate':datetime,
               'chapters':chapters
               }
        print(row)
        data.append(row)
        time.sleep(10)
    return pd.DataFrame(data)

In [19]:
data_list3=get_missing1(data_list1)

https://archiveofourown.org/works/46606960?view_adult=true
{'link': 'https://archiveofourown.org/works/46606960', 'title': 'Firsts', 'author': 'MagicPeach', 'updatedate': '2023-04-19', 'chapters': '1/1'}
https://archiveofourown.org/works/46234456?view_adult=true
{'link': 'https://archiveofourown.org/works/46234456', 'title': "You've Got Mail", 'author': 'spinnaker1509', 'updatedate': '2023-04-04', 'chapters': '5/9'}
https://archiveofourown.org/works/44688367?view_adult=true
{'link': 'https://archiveofourown.org/works/44688367', 'title': 'the bones of our past', 'author': 'moon2pluto', 'updatedate': '2023-01-31', 'chapters': '19/23'}
https://archiveofourown.org/works/46464568?view_adult=true
{'link': 'https://archiveofourown.org/works/46464568', 'title': 'Surrender My Heart', 'author': 'LillianKeng90', 'updatedate': '2023-04-13', 'chapters': '2/10'}
https://archiveofourown.org/works/46403539?view_adult=true
{'link': 'https://archiveofourown.org/works/46403539', 'title': 'Bite yours till

In [20]:
# Split the chapter column into chapter and chapter_max, and create a completion column
data_list3[['chapter','chapter_max']] = data_list3.chapters.str.split("/", expand=True)
data_list3['completion'] = data_list3.apply(lambda row: 'completed' if row['chapter']==row['chapter_max'] else 'incomplete', axis=1)
data_list3

Unnamed: 0,link,title,author,updatedate,chapters,chapter,chapter_max,completion
0,https://archiveofourown.org/works/46606960,Firsts,MagicPeach,2023-04-19,1/1,1,1,completed
1,https://archiveofourown.org/works/46234456,You've Got Mail,spinnaker1509,2023-04-04,5/9,5,9,incomplete
2,https://archiveofourown.org/works/44688367,the bones of our past,moon2pluto,2023-01-31,19/23,19,23,incomplete
3,https://archiveofourown.org/works/46464568,Surrender My Heart,LillianKeng90,2023-04-13,2/10,2,10,incomplete
4,https://archiveofourown.org/works/46403539,Bite yours till it bleeds,,2023-04-10,1/1,1,1,completed
5,https://archiveofourown.org/works/44991853,prove it.,,2023-02-13,1/1,1,1,completed
6,https://archiveofourown.org/works/44793961,no game,,2023-02-05,1/1,1,1,completed
7,https://archiveofourown.org/works/29639628,Pierced Pretty,,2021-02-22,1/1,1,1,completed
8,https://archiveofourown.org/works/28339671,A christmas case,,2020-12-26,1/1,1,1,completed
9,https://archiveofourown.org/works/20861537,And They Were Locked In,,2019-10-02,2/2,2,2,completed


In [21]:
data_listfinal=data_list2.append(data_list3)
data_listfinal[data_listfinal.isna().any(axis=1)]

Unnamed: 0,link,title,author,updatedate,chapters,chapter,chapter_max,completion
4,https://archiveofourown.org/works/46403539,Bite yours till it bleeds,,2023-04-10,1/1,1,1,completed
5,https://archiveofourown.org/works/44991853,prove it.,,2023-02-13,1/1,1,1,completed
6,https://archiveofourown.org/works/44793961,no game,,2023-02-05,1/1,1,1,completed
7,https://archiveofourown.org/works/29639628,Pierced Pretty,,2021-02-22,1/1,1,1,completed
8,https://archiveofourown.org/works/28339671,A christmas case,,2020-12-26,1/1,1,1,completed
9,https://archiveofourown.org/works/20861537,And They Were Locked In,,2019-10-02,2/2,2,2,completed
10,https://archiveofourown.org/works/25148770,Namaste,,2020-07-08,1/1,1,1,completed
11,https://archiveofourown.org/works/24737563,Down by the Riverside,,2020-06-15,12/12,12,12,completed
12,https://archiveofourown.org/works/24522589,Hearts Don’t Break Around Here,,2020-06-03,1/1,1,1,completed
13,https://archiveofourown.org/works/18352004,Curiosity,,2019-04-04,1/1,1,1,completed


In [22]:
# Save the DataFrame to a CSV file
data_listfinal.to_csv('lockwood_part1.csv', index=False)