In [1]:
## Run selenium and chrome driver to scrape data from cloudbytes.dev
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
## Setup chrome options
chrome_options = Options()
chrome_options.headless = True # Ensure GUI is off
# chrome_options.add_argument("--window-size=1920,1200")

In [3]:
# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

In [4]:
# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

In [5]:
# Get page
pagenum=1
link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
#link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page=1"
browser.get(link)

maxpagenum=int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())

In [6]:
def get_links(browser):
    works = browser.find_elements(By.XPATH, '//ol[2]/li')
    # Iterate through each work and extract author and datetime
    data=[]
    for work in works:
        h4 = work.find_element(By.TAG_NAME,'h4')
        a = h4.find_elements(By.TAG_NAME, 'a')
        links = a[0].get_attribute("href")
        data.append(links)
    return data
    

In [7]:
def process_pages(browser, maxpagenum):
    # datetime object containing current date and time
    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M")
    print("date and time =", dt_string)

    # Create an empty list to hold the data
    data_list = get_links(browser)
    print('Page 1 has been processed')

    # Iterate through each page and append the data to the list
    for p in range(2, maxpagenum + 1):
        pagenum = p
        time.sleep(10)
        print(f'Processing page {pagenum}/{maxpagenum}')
        link = "https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page=" + str(pagenum)
        browser.get(link)
        data_list += get_links(browser)
        print(len(data_list))

    return data_list, dt_string

links_list, dt_string = process_pages(browser, maxpagenum)
# Wait for 10 seconds
time.sleep(10)
browser.quit()
data = pd.DataFrame(columns=['link','title','author','published','updatedate','chapters','language','words','kudos','comments','bookmarks','hits','warning','mainship','relationship','characters','tags','summary','rating','series'])
data['link'] = links_list
data.to_csv('links.csv', index=False)

date and time = 25052023_0017
Page 1 has been processed
Processing page 2/79
40
Processing page 3/79
60
Processing page 4/79
80
Processing page 5/79
100
Processing page 6/79
120
Processing page 7/79
140
Processing page 8/79
160
Processing page 9/79
180
Processing page 10/79
200
Processing page 11/79
220
Processing page 12/79
240
Processing page 13/79
260
Processing page 14/79
280
Processing page 15/79
300
Processing page 16/79
320
Processing page 17/79
340
Processing page 18/79
360
Processing page 19/79
380
Processing page 20/79
400
Processing page 21/79
420
Processing page 22/79
440
Processing page 23/79
460
Processing page 24/79
480
Processing page 25/79
500
Processing page 26/79
520
Processing page 27/79
540
Processing page 28/79
560
Processing page 29/79
580
Processing page 30/79
600
Processing page 31/79
620
Processing page 32/79
640
Processing page 33/79
660
Processing page 34/79
680
Processing page 35/79
700
Processing page 36/79
720
Processing page 37/79
740
Processing page 38/

In [8]:
data = pd.read_csv('links.csv')

In [9]:
def get_data(data):
    counter=0
    slow_links = [] # List to store links that are taking too long to access
    for x in range(len(data['link'])):
        
        start_time = time.time()
        if pd.isnull(data.loc[x,'title']):
            print(f"getting missing data {x+1}/{len(data['link'])}")
            try:
                newlink=data['link'][x]+'?view_adult=true'
                page_start_time=time.time()
                source = requests.get(newlink, headers={
                              'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
                elapsed_time = time.time() - page_start_time
                if elapsed_time > 10:
                    print(f"Link {data['link'][x]} is taking too long to access. Adding to slow_links list.")
                    slow_links.append(data['link'][x])
                    continue
            except requests.exceptions.RequestException:
                print(f"Link {data['link'][x]} is taking too long to access. Adding to slow_links list.")
                slow_links.append(data['link'][x])
                continue
            soup = BeautifulSoup(source,'html.parser')
            try:
                data.loc[x,'title']=soup.find('h2', attrs={'class':'title heading'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'title']=np.nan
            try:
                data.loc[x,'author']=soup.find('a', attrs={'rel':'author'}).get_text()
            except:
                data.loc[x,'author']="Anonymous"
            try:
                data.loc[x,'published']=soup.find('dd', attrs={'class':'published'}).get_text()
            except:
                data.loc[x,'published']=np.nan
            try:
                data.loc[x,'updatedate'] = soup.find('dd', attrs={'class':'status'}).get_text()
            except:
                data.loc[x,'updatedate']=data['published'][x]
            
            try:
                data.loc[x,'chapters']=soup.find('dd', attrs={'class':'chapters'}).get_text()
            except:
                data.loc[x,'chapters']=np.nan
            
            try:
                data.loc[x,'language']=soup.find('dd', attrs={'class':'language'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'language']=np.nan
            
            try:
                data.loc[x,'words']=soup.find('dd', attrs={'class':'words'}).get_text()
            except:
                data.loc[x,'words']=np.nan
            try:
                data.loc[x,'kudos']=soup.find('dd', attrs={'class':'kudos'}).get_text()
            except:
                data.loc[x,'kudos']=0
            try:
                data.loc[x,'comments']=soup.find('dd', attrs={'class':'comments'}).get_text()
            except:
                data.loc[x,'comments']=0
            try:
                data.loc[x,'bookmarks']=soup.find('dd', attrs={'class':'bookmarks'}).get_text()
            except:
                data.loc[x,'bookmarks']=0
            try:
                data.loc[x,'hits']=soup.find('dd', attrs={'class':'hits'}).get_text()
            except:
                data.loc[x,'hits']=0
            
            try:
                data.loc[x,'warning']=soup.find('dd', attrs={'class':'warning tags'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'warning']=0
            try:
                ships = soup.find('dd', attrs={'class':'relationship tags'})
                ships_list=ships.find_all('a', attrs={'class':'tag'})
                data.loc[x,'mainship'] = ships_list[0].get_text().strip()
                relationship_list = []
                for r in ships_list:
                    relationship_list.append(r.get_text().strip())
                data.loc[x,'relationship'] = ', '.join(character_list)
            except:
                data.loc[x,'mainship']='None'
                data.loc[x,'relationship'] = ''
            try:
                char = soup.find('dd', attrs={'class':'character tags'})
                char_list=char.find_all('a', attrs={'class':'tag'})
                character_list = []
                for c in char_list:
                    character_list.append(c.get_text().strip())
                data.loc[x,'characters'] = ', '.join(character_list)
            except:
                data.loc[x,'characters']=''
            try:
                freeform = soup.find('dd', attrs={'class':'freeform tags'})
                freefom_list=freeform.find_all('a', attrs={'class':'tag'})
                tags_list = []
                for t in freefom_list:
                    tags_list.append(t.get_text().strip())
                data.loc[x,'tags'] = ', '.join(character_list)
            except:
                data.loc[x,'tags'] = ''
            try:
                position = soup.find('span', attrs={'class':'position'})
                position_list = position.find('a')
                series_list = []
                for p in position_list:
                    series_list.append(p.get_text().strip())
                data.loc[x,'series'] = ', '.join(character_list)
            except:
                data.loc[x,'series'] = 'not a series'
            try:
                data.loc[x,'summary']=soup.find('div', attrs={'class':'summary module'}).get_text().replace('\n', ' ').replace('Summary:','').strip()
            except:
                data.loc[x,'summary']=np.nan
            
            try:
                data.loc[x,'rating']=soup.find('dd', attrs={'class':'rating tags'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'rating']=np.nan
            print(data.iloc[x])
            time.sleep(10)
        elapsed_total_time = time.time() - start_time
        if elapsed_total_time > 120*60:
            for l in slow_links:
                print(l)
            return data
    for l in slow_links:
        print(l)
    return pd.DataFrame(data)

In [10]:
data=get_data(data)

getting missing data 1/1570
link                   https://archiveofourown.org/works/47404399
title                                             A Small Problem
author                                       Lidzloves_literature
published                                              2023-05-24
updatedate                                             2023-05-24
chapters                                                      1/1
language                                                  English
words                                                       2,061
kudos                                                           8
comments                                                      0.0
bookmarks                                                     0.0
hits                                                           76
mainship                                                     None
relationship                                                     
characters      Anthony Lockwood, Lucy Carlyle, 

In [11]:
data.isnull().sum() # Check for missing values

link             0
title            2
author           2
published        2
updatedate       2
chapters         2
language         2
words            2
kudos            2
comments         2
bookmarks        2
hits             2
mainship         2
relationship     2
characters       2
tags             2
summary         14
rating           2
series           2
dtype: int64

In [12]:
data

Unnamed: 0,link,title,author,published,updatedate,chapters,language,words,kudos,comments,bookmarks,hits,warning,mainship,relationship,characters,tags,summary,rating,series
0,https://archiveofourown.org/works/47404399,A Small Problem,Lidzloves_literature,2023-05-24,2023-05-24,1/1,English,2061,8,0.0,0.0,76,Creator Chose Not To Use Archive Warnings,,,"Anthony Lockwood, Lucy Carlyle, George Cubbins...","Anthony Lockwood, Lucy Carlyle, George Cubbins...",Lucy mysteriously woke up as a baby and Lockwo...,General Audiences,not a series
1,https://archiveofourown.org/works/44911999,Lockwood & Co. - The Crying Corridor,AwkwardWerewolf15,2023-02-10,2023-05-24,12/15,English,89870,920,492,189,20004,No Archive Warnings Apply,Lucy Carlyle/Anthony Lockwood,"Anthony Lockwood, Lucy Carlyle, George Cubbins...","Lucy Carlyle, Anthony Lockwood, George Cubbins...","Lucy Carlyle, Anthony Lockwood, George Cubbins...",As the dust settled over the case of Dr. Edmun...,Teen And Up Audiences,not a series
2,https://archiveofourown.org/works/47159842,Seaworthy,spinnaker1509,2023-05-14,2023-05-24,3/?,English,4316,91,42,13,503,No Archive Warnings Apply,Lucy Carlyle/Anthony Lockwood,"Lucy Carlyle, Anthony Lockwood, George Cubbins...","Lucy Carlyle, Anthony Lockwood, Quill Kipps, B...","Lucy Carlyle, Anthony Lockwood, Quill Kipps, B...",Lucy Carlyle is the sole survivor of a Kraken ...,Not Rated,not a series
3,https://archiveofourown.org/works/46680625,We Have Never Been Children,AriannaGrace,2023-04-23,2023-05-24,14/?,English,14772,143,81,14,3303,Creator Chose Not To Use Archive Warnings,Lucy Carlyle/Anthony Lockwood,"Lucy Carlyle, Anthony Lockwood, Quill Kipps, B...","Lucy Carlyle, Anthony Lockwood, George Cubbins...","Lucy Carlyle, Anthony Lockwood, George Cubbins...",Inspector Barnes wants her to go home. He gave...,Teen And Up Audiences,not a series
4,https://archiveofourown.org/works/46903384,Dirty Little Secret,Random_Nerd3,2023-05-02,2023-05-24,3/?,English,10045,55,81,4,1311,Rape/Non-Con,Lucy Carlyle/Anthony Lockwood,"Lucy Carlyle, Anthony Lockwood, George Cubbins...","Lucy Carlyle, Fetch Anthony Lockwood, Anthony ...","Lucy Carlyle, Fetch Anthony Lockwood, Anthony ...",Lucy really needs to stop accessorizing with s...,Explicit,"Lucy Carlyle, Fetch Anthony Lockwood, Anthony ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,https://archiveofourown.org/works/3709479,That Green Gentleman (Things Have Changed),Amelia (BelowTheText),2015-04-09,2015-04-09,1/1,English,4570,19,1,1,332,Creator Chose Not To Use Archive Warnings,,,"Anthony Lockwood, Lucy Carlyle, George Cubbins...",,I got the idea from P!atd's song of the same n...,Teen And Up Audiences,not a series
1566,https://archiveofourown.org/works/2282274,Ex Malo Bonum,lady_mab,2014-09-09,2014-12-02,4/4,English,10026,20,6,2,486,No Archive Warnings Apply,,,"Johannes Cabal, Leonie Barrow, Horst Cabal, Lu...","Johannes Cabal, Leonie Barrow, Horst Cabal, Lu...",New department of DEPRAC offers up a job board...,General Audiences,not a series
1567,https://archiveofourown.org/works/1267453,and the world was gone,lady_mab,2014-03-04,2014-03-05,7/7,English,7561,69,4,5,1266,No Archive Warnings ApplyMajor Character Death,,,"Lucy Carlyle, Anthony Lockwood, George Cubbins","Lucy Carlyle, Anthony Lockwood, George Cubbins",Childhood is an odd thing. \tIt is filled with...,Teen And Up Audiences,not a series
1568,https://archiveofourown.org/works/1169828,The Passage of Time,lady_mab,2014-02-05,2014-02-05,1/1,English,1102,114,9,5,1695,No Archive Warnings Apply,,,"Lucy Carlyle, Anthony Lockwood","Lucy Carlyle, Anthony Lockwood","I waited for him to try again, to dissuade me ...",General Audiences,not a series


In [13]:

# Split the chapter column into chapter and chapter_max, and create a completion column
data[['chapter','chapter_max']] = data.chapters.str.split("/", expand=True)
data['completion'] = data.apply(lambda row: 'completed' if row['chapter']==row['chapter_max'] else 'incomplete', axis=1)


In [14]:
filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
data.to_csv(filename, index=False)

In [15]:
filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
working_df = pd.read_csv(filename)

In [16]:
working_df['published'] = pd.to_datetime(working_df['published'])
working_df['updatedate'] = pd.to_datetime(working_df['updatedate'])
working_df['currentdate'] = max(working_df['updatedate'])
working_df['datediff_pub'] = (working_df['currentdate']-working_df['published'])/np.timedelta64(1,'D')
working_df['datediff'] = (working_df['currentdate']-working_df['updatedate'])/np.timedelta64(1,'D')

In [17]:
working_df['classification'] = working_df.apply(lambda row: 'oneshot' if row['chapter_max']=='1' else ('multichapter(complete)' if row['completion']=='completed' else ('multichapter(updating)' if row['datediff']<=60 else 'multichapter(dormant)')), axis=1)


In [18]:
working_df.columns

Index(['link', 'title', 'author', 'published', 'updatedate', 'chapters',
       'language', 'words', 'kudos', 'comments', 'bookmarks', 'hits',
       'rating', 'series', 'chapter', 'chapter_max', 'completion',
       'currentdate', 'datediff_pub', 'datediff', 'classification'],
      dtype='object')

In [19]:
for row in working_df['characters']:
    print(row)

Anthony Lockwood, Lucy Carlyle, George Cubbins | George Karim
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, The Skull (Lockwood & Co.), Norrie White, Mary Carlyle, Montagu Barnes, Quill Kipps, Lucy Carlyle's Mother, Mr Jacobs, Original Characters
Lucy Carlyle, Anthony Lockwood, Quill Kipps, Bobby Vernon, Kat Godwin, Ned Shaw, Holly Munro, George Cubbins | George Karim, The Skull (Lockwood & Co.)
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Montagu Barnes, The Skull (Lockwood & Co.), Anthony Lockwood's tie
Lucy Carlyle, Fetch Anthony Lockwood, Anthony Lockwood, George Cubbins | George Karim, Montagu Barnes
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Montagu Barnes, Quill Kipps
Lucy Carlyle, Anthony Lockwood
Quill Kipps, Lucy Carlyle, Anthony Lockwood, The Skull (Lockwood & Co.), George Cubbins | George Karim, Holly Munro
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Flo Bones, The Skull (Lockwood & Co.), Quill Kipps,

In [20]:
def get_num_item(column):
    item=[]
    for row in column:
        try:
            row_item = row.replace("[","").replace("]","").replace("'","").replace('"','').split(",")
        except:
            row_item = ['']
        if row_item!=['']:
            item.append(len(row_item))
        else:
            item.append(0) 
    return item

In [21]:
author_df = working_df.groupby(['author'], as_index=False).agg({'updatedate':'max', 'published':'min'})
author_df = author_df.rename(columns={'updatedate':'lastauthorupdate','published':'firstauthorupdate'})

In [22]:
if 'firstauthorupdate_x' in working_df.columns:
    working_df=working_df.drop(columns=['firstauthorupdate_x','lastauthorupdate_x', 'lastauthorupdate_y','firstauthorupdate_y'])
    working_df=working_df.merge(author_df, how='left', on='author')
else:
    working_df=working_df.merge(author_df, how='left', on='author')

In [23]:
working_df['author_lastupdate_diff'] = (working_df['currentdate']-working_df['lastauthorupdate'])/np.timedelta64(1,'D')
working_df['daysactive'] = (working_df['lastauthorupdate']-working_df['firstauthorupdate'])/np.timedelta64(1,'D')
working_df['daysincefirtupload'] = (working_df['currentdate']-working_df['firstauthorupdate'])/np.timedelta64(1,'D')
working_df['author_activity'] = working_df['author_lastupdate_diff'].apply(lambda x: 'active' if x<=60 else 'inactive')

In [24]:
working_df[['lastauthorupdate','firstauthorupdate','daysactive']].sort_values(by=['daysactive'], ascending=False)

Unnamed: 0,lastauthorupdate,firstauthorupdate,daysactive
1557,2023-02-04,2014-02-05,3286.0
1542,2023-02-04,2014-02-05,3286.0
1541,2023-02-04,2014-02-05,3286.0
1568,2023-02-04,2014-02-05,3286.0
1566,2023-02-04,2014-02-05,3286.0
...,...,...,...
849,2023-03-05,2023-03-05,0.0
837,2023-03-05,2023-03-05,0.0
1569,2013-12-16,2013-12-16,0.0
477,NaT,NaT,


In [25]:
working_df['num_relationship']=get_num_item(working_df['relationship'])
working_df['num_characters']=get_num_item(working_df['characters'])
working_df['num_tags']=get_num_item(working_df['tags'])

In [26]:
def get_df_item(id_column,item_column, name_col):
    item_list=[]
    for x in range(len(id_column)):
        try:
            row_item = item_column[x].replace("[","").replace("]","").replace("'","").replace('"','').split(",")
        except:
            row_item = ['']
        for item in row_item:
            item=item.strip()
            if '&' not in item:
                item_list.append([id_column[x],item])
    return pd.DataFrame(item_list, columns = ['title', name_col])

In [27]:
char_df = get_df_item(working_df['title'], working_df['characters'], 'charactername')
character = pd.read_csv('characters.csv')
char_df =char_df.merge(character, how='left', on='charactername')
char_df['character'] = char_df['character'].fillna(char_df['charactername'])
char_df=char_df.drop(columns='charactername')
char_df

Unnamed: 0,title,character
0,A Small Problem,Anthony Lockwood
1,A Small Problem,Lucy Carlyle
2,A Small Problem,George Cubbins | George Karim
3,Lockwood & Co. - The Crying Corridor,Lucy Carlyle
4,Lockwood & Co. - The Crying Corridor,Anthony Lockwood
...,...,...
6287,The Passage of Time,Lucy Carlyle
6288,The Passage of Time,Anthony Lockwood
6289,To Dust,George Cubbins | George Karim
6290,To Dust,Lucy Carlyle


In [28]:
relationship_df = get_df_item(working_df['title'], working_df['relationship'], 'shiptag')
relationship = pd.read_csv('relationships.csv')
relationship_df =relationship_df.merge(relationship, how='left', on='shiptag')
relationship_df['ship'] = relationship_df['ship'].fillna(relationship_df['shiptag'])
relationship_df=relationship_df.drop(columns='shiptag')
relationship_df

Unnamed: 0,title,relationship_desc,ship
0,A Small Problem,,
1,Lockwood & Co. - The Crying Corridor,,Anthony Lockwood
2,Lockwood & Co. - The Crying Corridor,,Lucy Carlyle
3,Lockwood & Co. - The Crying Corridor,,George Cubbins | George Karim
4,Seaworthy,,Lucy Carlyle
...,...,...,...
6088,Ex Malo Bonum,,
6089,and the world was gone,,
6090,The Passage of Time,,
6091,To Dust,,Lucy Carlyle


In [29]:
tags_df= get_df_item(working_df['title'], working_df['tags'], 'tag_item')
tags_df

Unnamed: 0,title,tag_item
0,A Small Problem,Anthony Lockwood
1,A Small Problem,Lucy Carlyle
2,A Small Problem,George Cubbins | George Karim
3,Lockwood & Co. - The Crying Corridor,Lucy Carlyle
4,Lockwood & Co. - The Crying Corridor,Anthony Lockwood
...,...,...
6306,The Passage of Time,Lucy Carlyle
6307,The Passage of Time,Anthony Lockwood
6308,To Dust,George Cubbins
6309,To Dust,Lucy Carlyle


In [30]:
char_rel_tag = char_df.merge(relationship_df, how='outer', on='title')
char_rel_tag = char_rel_tag.merge(tags_df, how='outer', on='title')
char_rel_tag

Unnamed: 0,title,character,relationship_desc,ship,tag_item
0,A Small Problem,Anthony Lockwood,,,Anthony Lockwood
1,A Small Problem,Anthony Lockwood,,,Lucy Carlyle
2,A Small Problem,Anthony Lockwood,,,George Cubbins | George Karim
3,A Small Problem,Lucy Carlyle,,,Anthony Lockwood
4,A Small Problem,Lucy Carlyle,,,Lucy Carlyle
...,...,...,...,...,...
139979,To Be Alive,,,George Cubbins | George Karim,
139980,Деякі теорії,,,Anthony Lockwood,
139981,Деякі теорії,,,George Cubbins,
139982,Деякі теорії,,,George Cubbins Mother,


In [31]:
char_rel_tag.to_csv('character_relationship_tags.csv', index=False)

In [32]:
working_df.to_csv(filename, index=False)