In [71]:
## Run selenium and chrome driver to scrape data from cloudbytes.dev
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [62]:
## Setup chrome options
chrome_options = Options()
chrome_options.headless = True # Ensure GUI is off
# chrome_options.add_argument("--window-size=1920,1200")

In [63]:
# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

In [64]:
# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

In [65]:
# Get page
pagenum=1
link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
#link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page=1"
browser.get(link)

maxpagenum=int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())

In [66]:
def get_links(browser):
    works = browser.find_elements(By.XPATH, '//ol[2]/li')
    # Iterate through each work and extract author and datetime
    data=[]
    for work in works:
        h4 = work.find_element(By.TAG_NAME,'h4')
        a = h4.find_elements(By.TAG_NAME, 'a')
        links = a[0].get_attribute("href")
        data.append(links)
    return data
    

In [67]:
def process_pages(browser, maxpagenum):
    # datetime object containing current date and time
    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M")
    print("date and time =", dt_string)

    # Create an empty list to hold the data
    data_list = get_links(browser)
    print('Page 1 has been processed')

    # Iterate through each page and append the data to the list
    for p in range(2, maxpagenum + 1):
        pagenum = p
        time.sleep(10)
        print(f'Processing page {pagenum}/{maxpagenum}')
        link = "https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page=" + str(pagenum)
        browser.get(link)
        data_list += get_links(browser)
        print(len(data_list))

    return data_list, dt_string

links_list, dt_string = process_pages(browser, maxpagenum)
# Wait for 10 seconds
time.sleep(10)
browser.quit()
data = pd.DataFrame(columns=['link','title','author','published','updatedate','chapters','language','words','kudos','comments','bookmarks','hits','warning','mainship','relationship','characters','tags','summary','rating','series'])
data['link'] = links_list
data.to_csv('links.csv', index=False)

date and time = 12052023_1200
Page 1 has been processed
Processing page 2/74
40
Processing page 3/74
60
Processing page 4/74
80
Processing page 5/74
100
Processing page 6/74
120
Processing page 7/74
140
Processing page 8/74
160
Processing page 9/74
180
Processing page 10/74
200
Processing page 11/74
220
Processing page 12/74
240
Processing page 13/74
260
Processing page 14/74
280
Processing page 15/74
300
Processing page 16/74
320
Processing page 17/74
340
Processing page 18/74
360
Processing page 19/74
380
Processing page 20/74
400
Processing page 21/74
420
Processing page 22/74
440
Processing page 23/74
460
Processing page 24/74
480
Processing page 25/74
500
Processing page 26/74
520
Processing page 27/74
540
Processing page 28/74
560
Processing page 29/74
580
Processing page 30/74
600
Processing page 31/74
620
Processing page 32/74
640
Processing page 33/74
660
Processing page 34/74
680
Processing page 35/74
700
Processing page 36/74
720
Processing page 37/74
740
Processing page 38/

In [72]:
data = pd.read_csv('links.csv')

In [80]:
def get_data(data):
    counter=0
    slow_links = [] # List to store links that are taking too long to access
    for x in range(len(data['link'])):
        
        start_time = time.time()
        if pd.isnull(data.loc[x,'title']):
            print(f"getting missing data {x+1}/{len(data['link'])}")
            try:
                newlink=data['link'][x]+'?view_adult=true'
                page_start_time=time.time()
                source = requests.get(newlink, headers={
                              'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
                elapsed_time = time.time() - page_start_time
                if elapsed_time > 10:
                    print(f"Link {data['link'][x]} is taking too long to access. Adding to slow_links list.")
                    slow_links.append(data['link'][x])
                    continue
            except requests.exceptions.RequestException:
                print(f"Link {data['link'][x]} is taking too long to access. Adding to slow_links list.")
                slow_links.append(data['link'][x])
                continue
            soup = BeautifulSoup(source,'html.parser')
            try:
                data.loc[x,'title']=soup.find('h2', attrs={'class':'title heading'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'title']=np.nan
            try:
                data.loc[x,'author']=soup.find('a', attrs={'rel':'author'}).get_text()
            except:
                data.loc[x,'author']="Anonymous"
            try:
                data.loc[x,'published']=soup.find('dd', attrs={'class':'published'}).get_text()
            except:
                data.loc[x,'published']=np.nan
            try:
                data.loc[x,'updatedate'] = soup.find('dd', attrs={'class':'status'}).get_text()
            except:
                data.loc[x,'updatedate']=data['published'][x]
            
            try:
                data.loc[x,'chapters']=soup.find('dd', attrs={'class':'chapters'}).get_text()
            except:
                data.loc[x,'chapters']=np.nan
            
            try:
                data.loc[x,'language']=soup.find('dd', attrs={'class':'language'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'language']=np.nan
            
            try:
                data.loc[x,'words']=soup.find('dd', attrs={'class':'words'}).get_text()
            except:
                data.loc[x,'words']=np.nan
            try:
                data.loc[x,'kudos']=soup.find('dd', attrs={'class':'kudos'}).get_text()
            except:
                data.loc[x,'kudos']=0
            try:
                data.loc[x,'comments']=soup.find('dd', attrs={'class':'comments'}).get_text()
            except:
                data.loc[x,'comments']=0
            try:
                data.loc[x,'bookmarks']=soup.find('dd', attrs={'class':'bookmarks'}).get_text()
            except:
                data.loc[x,'bookmarks']=0
            try:
                data.loc[x,'hits']=soup.find('dd', attrs={'class':'hits'}).get_text()
            except:
                data.loc[x,'hits']=0
            
            try:
                data.loc[x,'warning']=soup.find('dd', attrs={'class':'warning tags'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'warning']=0
            try:
                ships = soup.find('dd', attrs={'class':'relationship tags'})
                ships_list=ships.find_all('a', attrs={'class':'tag'})
                data.loc[x,'mainship'] = ships_list[0].get_text().strip()
                relationship_list = []
                for r in ships_list:
                    relationship_list.append(r.get_text().strip())
                data.loc[x,'relationship'] = ', '.join(character_list)
            except:
                data.loc[x,'mainship']='None'
                data.loc[x,'relationship'] = ''
            try:
                char = soup.find('dd', attrs={'class':'character tags'})
                char_list=char.find_all('a', attrs={'class':'tag'})
                character_list = []
                for c in char_list:
                    character_list.append(c.get_text().strip())
                data.loc[x,'characters'] = ', '.join(character_list)
            except:
                data.loc[x,'characters']=''
            try:
                freeform = soup.find('dd', attrs={'class':'freeform tags'})
                freefom_list=freeform.find_all('a', attrs={'class':'tag'})
                tags_list = []
                for t in freefom_list:
                    tags_list.append(t.get_text().strip())
                data.loc[x,'tags'] = ', '.join(character_list)
            except:
                data.loc[x,'tags'] = ''
            try:
                position = soup.find('span', attrs={'class':'position'})
                position_list = position.find('a')
                series_list = []
                for p in position_list:
                    series_list.append(p.get_text().strip())
                data.loc[x,'series'] = ', '.join(character_list)
            except:
                data.loc[x,'series'] = 'not a series'
            try:
                data.loc[x,'summary']=soup.find('div', attrs={'class':'summary module'}).get_text().replace('\n', ' ').replace('Summary:','').strip()
            except:
                data.loc[x,'summary']=np.nan
            
            try:
                data.loc[x,'rating']=soup.find('dd', attrs={'class':'rating tags'}).get_text().replace('\n','').strip()
            except:
                data.loc[x,'rating']=np.nan
            print(data.iloc[x])
            time.sleep(10)
        elapsed_total_time = time.time() - start_time
        if elapsed_total_time > 120*60:
            for l in slow_links:
                print(l)
            return data
    for l in slow_links:
        print(l)
    return pd.DataFrame(data)

In [86]:
data=get_data(data)

getting missing data 1349/1479
link                   https://archiveofourown.org/works/15630885
title                                   A meeting in the Archives
author            Flames and Fairy Tales (Flames_and_Fairy_Tales)
published                                              2018-08-09
updatedate                                             2018-08-09
chapters                                                      1/1
language                                                  English
words                                                       1,715
kudos                                                          75
comments                                                        9
bookmarks                                                       8
hits                                                          465
mainship                                                     None
relationship                                                     
characters                        George Cubb

In [87]:
data.isnull().sum() # Check for missing values

link             0
title            0
author           0
published        0
updatedate       0
chapters         0
language         0
words            0
kudos            0
comments         0
bookmarks        0
hits             0
mainship         0
relationship     0
characters       0
tags             0
summary         11
rating           0
series           0
chapter          3
chapter_max      3
completion       0
dtype: int64

In [88]:

# Split the chapter column into chapter and chapter_max, and create a completion column
data[['chapter','chapter_max']] = data.chapters.str.split("/", expand=True)
data['completion'] = data.apply(lambda row: 'completed' if row['chapter']==row['chapter_max'] else 'incomplete', axis=1)


In [89]:
filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
data.to_csv(filename, index=False)

In [91]:
filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
working_df = pd.read_csv(filename)

In [92]:
working_df['published'] = pd.to_datetime(working_df['published'])
working_df['updatedate'] = pd.to_datetime(working_df['updatedate'])
working_df['currentdate'] = max(working_df['updatedate'])
working_df['datediff_pub'] = (working_df['currentdate']-working_df['published'])/np.timedelta64(1,'D')
working_df['datediff'] = (working_df['currentdate']-working_df['updatedate'])/np.timedelta64(1,'D')

In [93]:
working_df['classification'] = working_df.apply(lambda row: 'oneshot' if row['chapter_max']=='1' else ('multichapter(complete)' if row['completion']=='completed' else ('multichapter(updating)' if row['datediff']<=60 else 'multichapter(dormant)')), axis=1)


In [94]:
working_df.columns

Index(['link', 'title', 'author', 'published', 'updatedate', 'chapters',
       'language', 'words', 'kudos', 'comments', 'bookmarks', 'hits',
       'rating', 'series', 'chapter', 'chapter_max', 'completion',
       'currentdate', 'datediff_pub', 'datediff', 'classification'],
      dtype='object')

In [107]:
for row in working_df['characters']:
    print(row)

Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Montagu Barnes, Flo Bones, Quill Kipps, The Skull (Lockwood & Co.)
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Holly Munro, Quill Kipps, Montagu Barnes
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Holly Munro, Not Enough Kipps
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim
Lucy Carlyle, George Cubbins | George Karim, Anthony Lockwood
Lucy Carlyle, Lucy Carlyle's Mother, Anthony Lockwood, George Cubbins | George Karim, Percy Jackson, Annabeth Chase (Percy Jackson), Chiron, Montagu Barnes
George Cubbins | George Karim
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Quill Kipps, Original Child Character(s)
Lucy Carlyle, Anthony Lockwood, George Cubbins | George Karim, Montagu Barnes, Quill Kipps, Kat Godwin
Lucy Carlyle, George Cubbins | George Karim, Anthony Lockwood
Anthony Lockwood, Lucy Carlyle, George Cubbins | George Karim, Quill Kipps, Holly Munro
Lucy C

In [110]:
def get_num_item(column):
    item=[]
    for row in column:
        try:
            row_item = row.replace("[","").replace("]","").replace("'","").replace('"','').split(",")
        except:
            row_item = ['']
        if row_item!=['']:
            item.append(len(row_item))
        else:
            item.append(0) 
    return item

In [98]:
author_df = working_df.groupby(['author'], as_index=False).agg({'updatedate':'max', 'published':'min'})
author_df = author_df.rename(columns={'updatedate':'lastauthorupdate','published':'firstauthorupdate'})

In [99]:
if 'firstauthorupdate_x' in working_df.columns:
    working_df=working_df.drop(columns=['firstauthorupdate_x','lastauthorupdate_x', 'lastauthorupdate_y','firstauthorupdate_y'])
    working_df=working_df.merge(author_df, how='left', on='author')
else:
    working_df=working_df.merge(author_df, how='left', on='author')

In [100]:
working_df['author_lastupdate_diff'] = (working_df['currentdate']-working_df['lastauthorupdate'])/np.timedelta64(1,'D')
working_df['daysactive'] = (working_df['lastauthorupdate']-working_df['firstauthorupdate'])/np.timedelta64(1,'D')
working_df['daysincefirtupload'] = (working_df['currentdate']-working_df['firstauthorupdate'])/np.timedelta64(1,'D')
working_df['author_activity'] = working_df['author_lastupdate_diff'].apply(lambda x: 'active' if x<=60 else 'inactive')

In [101]:
working_df[['lastauthorupdate','firstauthorupdate','daysactive']].sort_values(by=['daysactive'], ascending=False)

Unnamed: 0,lastauthorupdate,firstauthorupdate,daysactive
1413,2023-02-04,2014-02-05,3286.0
1466,2023-02-04,2014-02-05,3286.0
1412,2023-02-04,2014-02-05,3286.0
1450,2023-02-04,2014-02-05,3286.0
1451,2023-02-04,2014-02-05,3286.0
...,...,...,...
697,2023-03-09,2023-03-09,0.0
922,2023-02-14,2023-02-14,0.0
1196,2020-09-24,2020-09-24,0.0
1192,2020-10-15,2020-10-15,0.0


In [111]:
working_df['num_relationship']=get_num_item(working_df['relationship'])
working_df['num_characters']=get_num_item(working_df['characters'])
working_df['num_tags']=get_num_item(working_df['tags'])

In [114]:
def get_df_item(id_column,item_column, name_col):
    item_list=[]
    for x in range(len(id_column)):
        try:
            row_item = item_column[x].replace("[","").replace("]","").replace("'","").replace('"','').split(",")
        except:
            row_item = ['']
        for item in row_item:
            item=item.strip()
            if '&' not in item:
                item_list.append([id_column[x],item])
    return pd.DataFrame(item_list, columns = ['title', name_col])

In [115]:
char_df = get_df_item(working_df['title'], working_df['characters'], 'charactername')
character = pd.read_csv('characters.csv')
char_df =char_df.merge(character, how='left', on='charactername')
char_df['character'] = char_df['character'].fillna(char_df['charactername'])
char_df=char_df.drop(columns='charactername')
char_df

Unnamed: 0,title,character
0,Of Curses and Temptations,Lucy Carlyle
1,Of Curses and Temptations,Anthony Lockwood
2,Of Curses and Temptations,George Cubbins | George Karim
3,Of Curses and Temptations,Montague Barnes
4,Of Curses and Temptations,Flo Bones
...,...,...
5912,The Passage of Time,Lucy Carlyle
5913,The Passage of Time,Anthony Lockwood
5914,To Dust,George Cubbins | George Karim
5915,To Dust,Lucy Carlyle


In [116]:
relationship_df = get_df_item(working_df['title'], working_df['relationship'], 'shiptag')
relationship = pd.read_csv('relationships.csv')
relationship_df =relationship_df.merge(relationship, how='left', on='shiptag')
relationship_df['ship'] = relationship_df['ship'].fillna(relationship_df['shiptag'])
relationship_df=relationship_df.drop(columns='shiptag')
relationship_df

Unnamed: 0,title,relationship_desc,ship
0,Of Curses and Temptations,,
1,"Because everything is the same until, very sud...",,Lucy Carlyle
2,"Because everything is the same until, very sud...",,Anthony Lockwood
3,"Because everything is the same until, very sud...",,George Cubbins | George Karim
4,"Because everything is the same until, very sud...",,Montagu Barnes
...,...,...,...
5688,Ex Malo Bonum,,
5689,and the world was gone,,
5690,The Passage of Time,,
5691,To Dust,,Lucy Carlyle


In [117]:
tags_df= get_df_item(working_df['title'], working_df['tags'], 'tag_item')
tags_df

Unnamed: 0,title,tag_item
0,Of Curses and Temptations,Lucy Carlyle
1,Of Curses and Temptations,Anthony Lockwood
2,Of Curses and Temptations,George Cubbins | George Karim
3,Of Curses and Temptations,Montagu Barnes
4,Of Curses and Temptations,Flo Bones
...,...,...
5932,The Passage of Time,Lucy Carlyle
5933,The Passage of Time,Anthony Lockwood
5934,To Dust,George Cubbins
5935,To Dust,Lucy Carlyle


In [118]:
char_rel_tag = char_df.merge(relationship_df, how='outer', on='title')
char_rel_tag = char_rel_tag.merge(tags_df, how='outer', on='title')
char_rel_tag

Unnamed: 0,title,character,relationship_desc,ship,tag_item
0,Of Curses and Temptations,Lucy Carlyle,,,Lucy Carlyle
1,Of Curses and Temptations,Lucy Carlyle,,,Anthony Lockwood
2,Of Curses and Temptations,Lucy Carlyle,,,George Cubbins | George Karim
3,Of Curses and Temptations,Lucy Carlyle,,,Montagu Barnes
4,Of Curses and Temptations,Lucy Carlyle,,,Flo Bones
...,...,...,...,...,...
133463,To Be Alive,,,George Cubbins | George Karim,
133464,Деякі теорії,,,Anthony Lockwood,
133465,Деякі теорії,,,George Cubbins,
133466,Деякі теорії,,,George Cubbins Mother,


In [119]:
char_rel_tag.to_csv('character_relationship_tags.csv', index=False)

In [120]:
working_df.to_csv(filename, index=False)