# Import the necessary packages

In [135]:
## Use Selenium with ChromeDriver (listing pages) and requests + BeautifulSoup (work pages) to scrape archiveofourown.org
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Set up the Chrome browser

In [72]:
## Set up Chrome options
chrome_options = Options()
# Ensure the GUI is off. The `Options.headless` setter was deprecated in Selenium 4.8
# and removed in later releases (where assigning it has no effect), so the Chrome
# flag is passed directly instead.
chrome_options.add_argument("--headless")
# chrome_options.add_argument("--window-size=1920,1200")

In [73]:
# Point Selenium at the chromedriver binary kept in the project folder under the
# user's home directory (adjust the folder to match your own configuration)
homedir = os.path.expanduser("~")
chromedriver_path = homedir + "/ao3lockwood-co/chromedriver"
webdriver_service = Service(chromedriver_path)

In [74]:
# Start a Chrome session driven by `webdriver_service`, applying `chrome_options`
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Functions used for web scraping

In [75]:
def get_links(browser):
    """Collect the work URLs from the page currently loaded in `browser`.

    Every `<li>` under the page's second `<ol>` is treated as one work; the
    `href` of the first `<a>` inside that item's `<h4>` is taken as its link.

    Parameters
    ----------
    browser : selenium WebDriver
        Driver that has already navigated to a listing page.

    Returns
    -------
    list
        One `href` value per list item, in page order.
    """
    work_items = browser.find_elements(By.XPATH, '//ol[2]/li')
    return [
        item.find_element(By.TAG_NAME, 'h4')
            .find_elements(By.TAG_NAME, 'a')[0]
            .get_attribute("href")
        for item in work_items
    ]
    

In [76]:
def process_pages(browser, maxpagenum):
    """Gather work links from listing pages 1..maxpagenum.

    Page 1 is read from whatever `browser` currently displays; pages 2 and up
    are loaded one by one, pausing 10 seconds before each request.

    Parameters
    ----------
    browser : selenium WebDriver
        Driver already showing page 1 of the listing.
    maxpagenum : int
        Number of the last listing page to visit.

    Returns
    -------
    tuple
        `(links, dt_string)` where `links` is the accumulated list of work URLs
        and `dt_string` is the start time formatted as ddmmYYYY_HHMM.
    """
    # Timestamp taken once at the start of the run
    dt_string = datetime.now().strftime("%d%m%Y_%H%M")
    print("date and time =", dt_string)

    # The first page is already loaded by the caller
    data_list = get_links(browser)
    print('Page 1 has been processed')

    listing_url = "https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="
    for pagenum in range(2, maxpagenum + 1):
        time.sleep(10)
        print(f'Processing page {pagenum}/{maxpagenum}')
        browser.get(listing_url + str(pagenum))
        data_list.extend(get_links(browser))
        print(len(data_list))

    return data_list, dt_string



In [92]:
def _node_text(soup, tag, css_class, default, clean=False):
    """Return the text of the first `tag` whose class attribute is `css_class`, or `default` if absent."""
    node = soup.find(tag, attrs={'class': css_class})
    if node is None:
        return default
    text = node.get_text()
    return text.replace('\n', '').strip() if clean else text


def _joined_tag_items(soup, css_class):
    """Return the `<li>` texts inside the `dd` with class `css_class`, joined by ', ' ('' if absent)."""
    node = soup.find('dd', attrs={'class': css_class})
    if node is None:
        return ''
    return ', '.join(li.get_text().strip() for li in node.find_all('li'))


def _parse_work(soup):
    """Extract every scraped field from a parsed work page as a {column: value} dict (insertion-ordered)."""
    author_node = soup.find('a', attrs={'rel': 'author'})
    summary_node = soup.find('div', attrs={'class': 'summary module'})
    published = _node_text(soup, 'dd', 'published', np.nan)
    if summary_node is None:
        summary = np.nan
    else:
        summary = summary_node.get_text().replace('\n', ' ').replace('Summary:', '').strip()
    return {
        'title': _node_text(soup, 'h2', 'title heading', np.nan, clean=True),
        'author': "Anonymous" if author_node is None else author_node.get_text(),
        'published': published,
        # A page without a 'status' entry falls back to its publication date
        'updatedate': _node_text(soup, 'dd', 'status', published),
        'chapters': _node_text(soup, 'dd', 'chapters', np.nan),
        'language': _node_text(soup, 'dd', 'language', np.nan, clean=True),
        'words': _node_text(soup, 'dd', 'words', np.nan),
        'kudos': _node_text(soup, 'dd', 'kudos', 0),
        'comments': _node_text(soup, 'dd', 'comments', 0),
        'bookmarks': _node_text(soup, 'dd', 'bookmarks', 0),
        'hits': _node_text(soup, 'dd', 'hits', 0),
        'warning': _node_text(soup, 'dd', 'warning tags', 0, clean=True),
        'relationship': _joined_tag_items(soup, 'relationship tags'),
        'characters': _joined_tag_items(soup, 'character tags'),
        'tags': _joined_tag_items(soup, 'freeform tags'),
        'series': _node_text(soup, 'span', 'position', 'not a series', clean=True),
        'summary': summary,
        'rating': _node_text(soup, 'dd', 'rating tags', np.nan, clean=True),
    }


def get_data(data, delay=10, slow_threshold=10, max_runtime=120 * 60):
    """Scrape the metadata of every work whose 'summary' is still missing.

    Rows that already have a summary are skipped, so the function can be called
    again on its own output to resume an interrupted or time-limited run.
    `data` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'link' and 'summary' columns and be indexed 0..n-1
        (rows are addressed with `data.loc[x, ...]` for x in range(n)).
    delay : float, default 10
        Seconds to sleep after each successfully scraped work.
    slow_threshold : float, default 10
        Request timeout in seconds; a work whose download takes longer than
        this is skipped and its link reported at the end.
    max_runtime : float, default 7200
        Total time budget in seconds for the whole call. Once exceeded, the
        loop stops and the partially filled frame is returned.

    Returns
    -------
    pandas.DataFrame
        `pd.DataFrame(data)` with the scraped columns filled in.
    """
    slow_links = []  # Links that errored out or took too long to download
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
    total = len(data['link'])
    # Started once, before the loop, so the budget covers the whole run
    # (resetting it per row would make the limit unreachable).
    run_start = time.time()
    for x in range(total):
        if time.time() - run_start > max_runtime:
            break
        if not pd.isnull(data.loc[x, 'summary']):
            continue
        print(f"getting missing data {x+1}/{total}")
        link = data['link'][x]
        page_start_time = time.time()
        try:
            # The timeout keeps a stalled connection from blocking the run forever;
            # requests raises a RequestException subclass when it expires.
            source = requests.get(link + '?view_adult=true', headers=headers,
                                  timeout=slow_threshold).text
        except requests.exceptions.RequestException:
            print(f"Link {link} is taking too long to access. Adding to slow_links list.")
            slow_links.append(link)
            continue
        if time.time() - page_start_time > slow_threshold:
            print(f"Link {link} is taking too long to access. Adding to slow_links list.")
            slow_links.append(link)
            continue
        soup = BeautifulSoup(source, 'html.parser')
        for column, value in _parse_work(soup).items():
            data.loc[x, column] = value
        print(data.iloc[x])
        time.sleep(delay)
    for l in slow_links:
        print(l)
    return pd.DataFrame(data)

In [157]:
def get_data2(data):
    """Re-scrape the relationship tags of rows whose 'relationship' is an empty string.

    Only rows that need it are downloaded; rows that already have a value are
    skipped without any network request. `data` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'link' and 'relationship' columns and be indexed 0..n-1.

    Returns
    -------
    pandas.DataFrame
        `pd.DataFrame(data)` after the retry pass. Rows whose page still has no
        relationship block (or could not be downloaded) keep ''.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
    for x in range(len(data['link'])):
        if data.loc[x, 'relationship'] != '':
            continue
        link = data['link'][x]
        print(link)
        rel_block = None
        try:
            source = requests.get(link + '?view_adult=true', headers=headers, timeout=30).text
            rel_block = BeautifulSoup(source, 'html.parser').find(
                'dd', attrs={'class': 'relationship tags'})
        except requests.exceptions.RequestException as err:
            # A network problem on one work should not abort the whole pass
            print(f'request error for {link}: {err}')
        if rel_block is None:
            print(f'{x} failed')
            data.loc[x, 'relationship'] = ''
        else:
            data.loc[x, 'relationship'] = ', '.join(
                rel.get_text().strip() for rel in rel_block.find_all('li'))
            print(x)
        # Pause after each page that was actually requested
        time.sleep(10)
    return pd.DataFrame(data)

# Get the fanfic links

In [78]:
# Load the first listing page
pagenum = 1
link = f"https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page={pagenum}"
browser.get(link)

# Read the page count from the 13th item of the first <ol>
# NOTE(review): presumably the last pagination entry — confirm the position is stable
last_page_item = browser.find_element(By.XPATH, '//ol[1]/li[13]')
maxpagenum = int(last_page_item.text.strip())

In [79]:
# Walk every listing page, collecting the work links and the run timestamp
links_list, dt_string = process_pages(browser, maxpagenum)
# Wait for 10 seconds, then close the browser
time.sleep(10)
browser.quit()
# Create the metadata table with one row per link (all other columns empty) and save it
work_columns = [
    'link', 'title', 'author', 'published', 'updatedate', 'chapters',
    'language', 'words', 'kudos', 'comments', 'bookmarks', 'hits',
    'warning', 'relationship', 'characters', 'tags', 'summary', 'rating',
    'series',
]
data = pd.DataFrame(columns=work_columns)
data['link'] = links_list
data.to_csv('links.csv', index=False)

date and time = 25052023_2238
Page 1 has been processed
Processing page 2/79
40
Processing page 3/79
60
Processing page 4/79
80
Processing page 5/79
100
Processing page 6/79
120
Processing page 7/79
140
Processing page 8/79
160
Processing page 9/79
180
Processing page 10/79
200
Processing page 11/79
220
Processing page 12/79
240
Processing page 13/79
260
Processing page 14/79
280
Processing page 15/79
300
Processing page 16/79
320
Processing page 17/79
340
Processing page 18/79
360
Processing page 19/79
380
Processing page 20/79
400
Processing page 21/79
420
Processing page 22/79
440
Processing page 23/79
460
Processing page 24/79
480
Processing page 25/79
500
Processing page 26/79
520
Processing page 27/79
540
Processing page 28/79
560
Processing page 29/79
580
Processing page 30/79
600
Processing page 31/79
620
Processing page 32/79
640
Processing page 33/79
660
Processing page 34/79
680
Processing page 35/79
700
Processing page 36/79
720
Processing page 37/79
740
Processing page 38/

In [80]:
# Reload the link table from 'links.csv' (the file written after the links were collected)
data = pd.read_csv('links.csv')

# Get the data of the fanfics

In [93]:
# Scrape the metadata of every row whose 'summary' is still missing (get_data edits `data` in place)
data=get_data(data)

getting missing data 142/1574
link                   https://archiveofourown.org/works/47155405
title                                   A Ghost in the Guest Room
author                                              tvngerine_zso
published                                              2023-05-13
updatedate                                             2023-05-13
chapters                                                      1/1
language                                                  English
words                                                       3,915
kudos                                                          17
comments                                                        0
bookmarks                                                       1
hits                                                          249
relationship                                Teen And Up Audiences
characters      Lucy Carlyle, Anthony Lockwood, George Cubbins...
tags            Locklyle, lockwood and co - Fr

In [143]:
# Retry the relationship scrape for works whose 'relationship' is still an empty string
data2 = get_data2(data)
# Report the rows that are still empty, reading from the frame that was just returned
for x in range(len(data2['link'])):
    if data2.loc[x, 'relationship'] == '':
        print(f'empty {x}')

76
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412

In [161]:
# Display the 'relationship' column of data2
data2['relationship']

0                           Lucy Carlyle/Anthony Lockwood
1                           Lucy Carlyle/Anthony Lockwood
2       Lucy Carlyle/Anthony Lockwood, Lucy Carlyle & ...
3                           Lucy Carlyle/Anthony Lockwood
4       Anthony Lockwood/Quill Kipps, Lucy Carlyle/The...
                              ...                        
1569                                                     
1570                                                     
1571                                                     
1572                                                     
1573    George Cubbins/Anthony Lockwood, Lucy Carlyle/...
Name: relationship, Length: 1574, dtype: object

In [159]:
# Run get_data2 again on its own earlier output; only rows whose 'relationship' is still '' are updated
data2=get_data2(data2)

https://archiveofourown.org/works/47301508
76 failed
https://archiveofourown.org/works/47258338
101 failed
https://archiveofourown.org/works/47237377
107 failed
https://archiveofourown.org/works/44963443
257 failed


KeyboardInterrupt: 

In [136]:
# Count the NaN entries in each column of `data`
data.isnull().sum() # Check for missing values

link             0
title            0
author           0
published        0
updatedate       0
chapters         0
language         0
words            0
kudos            0
comments         0
bookmarks        0
hits             0
relationship     0
characters       0
tags             0
summary         12
rating           0
series           0
chapter          0
chapter_max      0
completion       0
dtype: int64

# Save the File

In [96]:
# Name the output file after the timestamp returned by process_pages
filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
# Write the scraped table without the index column
data.to_csv(filename, index=False)

# Generate the other features

In [97]:
# Load the CSV just written into `working_df`, the frame the derived columns are added to
working_df = pd.read_csv(filename)

In [103]:

# Split the 'chapters' column (e.g. "1/1") into chapter and chapter_max, and create a completion column.
# Everything is derived from working_df itself: reading from `data` only works while `data`
# happens to carry 'chapter'/'chapter_max' columns left over from an earlier run.
working_df[['chapter','chapter_max']] = working_df['chapters'].str.split("/", expand=True)
# A work counts as completed when the two sides of the split are equal; missing values compare unequal
working_df['completion'] = np.where(working_df['chapter'] == working_df['chapter_max'], 'completed', 'incomplete')


In [99]:
# Parse the two date columns
working_df['published'] = pd.to_datetime(working_df['published'])
working_df['updatedate'] = pd.to_datetime(working_df['updatedate'])
# Reference date = newest update in the data set. Series.max() skips missing dates (NaT),
# whereas the builtin max() compares element by element and can return NaT or a wrong
# value when any date is missing.
working_df['currentdate'] = working_df['updatedate'].max()
# Age of each work in days, measured from publication and from its last update
working_df['datediff_pub'] = (working_df['currentdate']-working_df['published'])/np.timedelta64(1,'D')
working_df['datediff'] = (working_df['currentdate']-working_df['updatedate'])/np.timedelta64(1,'D')

In [104]:
def _classify_work(row):
    """Label one work from its chapter_max, completion and datediff values."""
    if row['chapter_max'] == '1':
        return 'oneshot'
    if row['completion'] == 'completed':
        return 'multichapter(complete)'
    # Unfinished multi-chapter work: 'updating' when last updated within 60 days of the reference date
    if row['datediff'] <= 60:
        return 'multichapter(updating)'
    return 'multichapter(dormant)'


working_df['classification'] = working_df.apply(_classify_work, axis=1)


In [105]:
# List the columns currently present on working_df
working_df.columns

Index(['link', 'title', 'author', 'published', 'updatedate', 'chapters',
       'language', 'words', 'kudos', 'comments', 'bookmarks', 'hits',
       'series', 'currentdate', 'datediff_pub', 'datediff', 'chapter',
       'chapter_max', 'completion', 'classification'],
      dtype='object')

In [106]:
def get_num_item(column):
    """Count the comma-separated entries in each value of `column`.

    Square brackets and single/double quotes are stripped before splitting on
    commas, so both plain "a, b" strings and stringified lists ("['a', 'b']")
    are handled.

    Parameters
    ----------
    column : iterable
        Values to count; typically a pandas Series of strings.

    Returns
    -------
    list of int
        One count per input value. Non-string values (e.g. NaN, None) and
        strings with nothing left after stripping count as 0.
    """
    counts = []
    for row in column:
        # Only strings can be split; anything else (NaN for a missing cell) has no items
        if not isinstance(row, str):
            counts.append(0)
            continue
        entries = row.replace("[", "").replace("]", "").replace("'", "").replace('"', '').split(",")
        counts.append(0 if entries == [''] else len(entries))
    return counts

In [107]:
# Per author: most recent update date and earliest publication date across all their works
author_df = working_df.groupby(['author'], as_index=False).agg(
    lastauthorupdate=('updatedate', 'max'),
    firstauthorupdate=('published', 'min'),
)

In [108]:
# Attach each author's first/last activity dates to every work.
# Any copies of those columns left by a previous run of this cell (plain, or with the
# _x/_y suffixes pandas adds on a name clash) are dropped first, so re-running the cell
# always yields exactly one 'lastauthorupdate' and one 'firstauthorupdate' column.
author_date_cols = ['lastauthorupdate', 'firstauthorupdate']
stale_cols = [col + suffix for col in author_date_cols for suffix in ('', '_x', '_y')]
working_df = working_df.drop(columns=stale_cols, errors='ignore').merge(author_df, how='left', on='author')

In [109]:
# Author-level activity features, all expressed in days
one_day = np.timedelta64(1, 'D')
author_last = working_df['lastauthorupdate']
author_first = working_df['firstauthorupdate']
reference_date = working_df['currentdate']
working_df['author_lastupdate_diff'] = (reference_date - author_last) / one_day
working_df['daysactive'] = (author_last - author_first) / one_day
working_df['daysincefirtupload'] = (reference_date - author_first) / one_day
# An author is 'active' when their latest update is at most 60 days before the reference date
working_df['author_activity'] = working_df['author_lastupdate_diff'].map(
    lambda gap: 'active' if gap <= 60 else 'inactive'
)

In [110]:
# Show the author date columns, sorted by 'daysactive' from largest to smallest
working_df[['lastauthorupdate','firstauthorupdate','daysactive']].sort_values(by=['daysactive'], ascending=False)

Unnamed: 0,lastauthorupdate,firstauthorupdate,daysactive
1087,2023-02-04,2014-02-05,3286.0
1570,2023-02-04,2014-02-05,3286.0
1507,2023-02-04,2014-02-05,3286.0
1571,2023-02-04,2014-02-05,3286.0
1508,2023-02-04,2014-02-05,3286.0
...,...,...,...
994,2023-02-18,2023-02-18,0.0
992,2023-02-18,2023-02-18,0.0
984,2023-02-18,2023-02-18,0.0
983,2023-02-19,2023-02-19,0.0


In [111]:
# Count the comma-separated entries in the relationship, characters and tags columns of each work
count_columns = (
    ('num_relationship', 'relationship'),
    ('num_characters', 'characters'),
    ('num_tags', 'tags'),
)
for count_col, source_col in count_columns:
    working_df[count_col] = get_num_item(working_df[source_col])

# Get data from the previous file

In [112]:
# Timestamp suffix of the previous output file (hard-coded)
prevdt_string = '12052023_1200'
# Built with the same file-name pattern as the current output file
prevfilename=f'ao3_lockwood_and_co_ao_{prevdt_string}.csv'
prev_df = pd.read_csv(prevfilename)

In [113]:
# Keep only the word count from the previous file, renamed so it can sit beside the current one
prev_df = prev_df.loc[:, ['link', 'words']].rename(columns={'words': 'prev_words'})
# Left-join on the work link; works missing from the previous file get prev_words = 0
working_df = (
    working_df
    .merge(prev_df, how='left', on='link')
    .assign(prev_words=lambda df: df['prev_words'].fillna(0))
)


In [114]:
# Count missing values in the current and previous word-count columns
working_df[['words','prev_words']].isnull().sum()

words         0
prev_words    0
dtype: int64

In [115]:
# List the columns currently present on working_df
working_df.columns

Index(['link', 'title', 'author', 'published', 'updatedate', 'chapters',
       'language', 'words', 'kudos', 'comments', 'bookmarks', 'hits',
       'series', 'currentdate', 'datediff_pub', 'datediff', 'chapter',
       'chapter_max', 'completion', 'classification', 'lastauthorupdate',
       'firstauthorupdate', 'author_lastupdate_diff', 'daysactive',
       'daysincefirtupload', 'author_activity', 'num_relationship',
       'num_characters', 'num_tags', 'prev_words'],
      dtype='object')

In [121]:
def get_df_item(id_column, item_column, name_col):
    """Expand comma-separated item strings into one (link, item) row per item.

    Square brackets and single/double quotes are stripped from each value
    before it is split on commas; every resulting item is whitespace-trimmed.
    Items containing '&' are left out. A value that is not a string (e.g. NaN)
    or is empty yields a single row with an empty item.

    The two inputs are paired by position, so they may be lists or pandas
    Series with any index. If their lengths differ, pairing stops at the
    shorter one.

    Parameters
    ----------
    id_column : iterable
        Identifier (work link) for each row.
    item_column : iterable
        Comma-separated item string for each row.
    name_col : str
        Name given to the item column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ['link', name_col], one row per kept item.
    """
    rows = []
    for work_id, raw in zip(id_column, item_column):
        if isinstance(raw, str):
            items = raw.replace("[", "").replace("]", "").replace("'", "").replace('"', '').split(",")
        else:
            # Missing cell: keep the work in the output with an empty item
            items = ['']
        for item in items:
            item = item.strip()
            if '&' not in item:
                rows.append([work_id, item])
    return pd.DataFrame(rows, columns=['link', name_col])

In [122]:
# One row per (work link, character name); names are then looked up in characters.csv,
# and any name without a match keeps its own text as the character
character = pd.read_csv('characters.csv')
char_df = (
    get_df_item(working_df['link'], working_df['characters'], 'charactername')
    .merge(character, how='left', on='charactername')
    .assign(character=lambda df: df['character'].fillna(df['charactername']))
    .drop(columns='charactername')
)
char_df

Unnamed: 0,link,character
0,https://archiveofourown.org/works/47116633,Lucy Carlyle
1,https://archiveofourown.org/works/47116633,Anthony Lockwood
2,https://archiveofourown.org/works/47116633,George Cubbins | George Karim
3,https://archiveofourown.org/works/47116633,Montague Barnes
4,https://archiveofourown.org/works/47116633,Flo Bones
...,...,...
6335,https://archiveofourown.org/works/1169828,Lucy Carlyle
6336,https://archiveofourown.org/works/1169828,Anthony Lockwood
6337,https://archiveofourown.org/works/1084801,George Cubbins | George Karim
6338,https://archiveofourown.org/works/1084801,Lucy Carlyle


In [162]:
# Build the (link, shiptag) table from data2, the frame returned by get_data2
# NOTE(review): another cell builds relationship_df from working_df instead; whichever runs last wins — confirm which source is intended
relationship_df = get_df_item(data2['link'], data2['relationship'], 'shiptag')
relationship_df

Unnamed: 0,link,shiptag
0,https://archiveofourown.org/works/47116633,Lucy Carlyle/Anthony Lockwood
1,https://archiveofourown.org/works/47361427,Lucy Carlyle/Anthony Lockwood
2,https://archiveofourown.org/works/47422114,Lucy Carlyle/Anthony Lockwood
3,https://archiveofourown.org/works/47421412,Lucy Carlyle/Anthony Lockwood
4,https://archiveofourown.org/works/47394868,Anthony Lockwood/Quill Kipps
...,...,...
1843,https://archiveofourown.org/works/1267453,
1844,https://archiveofourown.org/works/1169828,
1845,https://archiveofourown.org/works/1084801,George Cubbins/Anthony Lockwood
1846,https://archiveofourown.org/works/1084801,Lucy Carlyle/Anthony Lockwood


In [None]:
# Build the (link, shiptag) table from working_df rather than data2
# NOTE(review): this rebinds relationship_df and the cell shows no execution count — confirm whether it should run at all
relationship_df = get_df_item(working_df['link'], working_df['relationship'], 'shiptag')

In [163]:

# Look up each shiptag in relationships.csv; tags without a match keep their own text as the ship
relationship = pd.read_csv('relationships.csv')
relationship_df = (
    relationship_df
    .merge(relationship, how='left', on='shiptag')
    .assign(ship=lambda df: df['ship'].fillna(df['shiptag']))
    .drop(columns='shiptag')
)
relationship_df

Unnamed: 0,link,relationship_desc,ship
0,https://archiveofourown.org/works/47116633,pair,Anthony/Lucy
1,https://archiveofourown.org/works/47361427,pair,Anthony/Lucy
2,https://archiveofourown.org/works/47422114,pair,Anthony/Lucy
3,https://archiveofourown.org/works/47421412,pair,Anthony/Lucy
4,https://archiveofourown.org/works/47394868,pair,Anthony/Quill
...,...,...,...
1843,https://archiveofourown.org/works/1267453,,
1844,https://archiveofourown.org/works/1169828,,
1845,https://archiveofourown.org/works/1084801,pair,Anthony/George
1846,https://archiveofourown.org/works/1084801,pair,Anthony/Lucy


In [153]:
# One row per (work link, tag) from the 'tags' column; get_df_item leaves out items containing '&'
tags_df= get_df_item(working_df['link'], working_df['tags'], 'tag_item')
tags_df

Unnamed: 0,link,tag_item
0,https://archiveofourown.org/works/47116633,Body Swap
1,https://archiveofourown.org/works/47116633,Crack Treated Seriously
2,https://archiveofourown.org/works/47116633,Non Canon Magic
3,https://archiveofourown.org/works/47116633,weird curses
4,https://archiveofourown.org/works/47116633,Eventual Smut
...,...,...
14912,https://archiveofourown.org/works/1267453,nothing like a dw crossover
14913,https://archiveofourown.org/works/1267453,Not Really Character Death
14914,https://archiveofourown.org/works/1267453,Not quite a relationship
14915,https://archiveofourown.org/works/1169828,Drabble


In [164]:
# Outer-join the three per-link tables on 'link'; a link that appears several times
# in each table produces one row for every combination of its character, ship and tag
char_rel_tag = (
    char_df
    .merge(relationship_df, how='outer', on='link')
    .merge(tags_df, how='outer', on='link')
)
char_rel_tag

Unnamed: 0,link,character,relationship_desc,ship,tag_item
0,https://archiveofourown.org/works/47116633,Lucy Carlyle,pair,Anthony/Lucy,Body Swap
1,https://archiveofourown.org/works/47116633,Lucy Carlyle,pair,Anthony/Lucy,Crack Treated Seriously
2,https://archiveofourown.org/works/47116633,Lucy Carlyle,pair,Anthony/Lucy,Non Canon Magic
3,https://archiveofourown.org/works/47116633,Lucy Carlyle,pair,Anthony/Lucy,weird curses
4,https://archiveofourown.org/works/47116633,Lucy Carlyle,pair,Anthony/Lucy,Eventual Smut
...,...,...,...,...,...
107961,https://archiveofourown.org/works/39724113,,pair,Anthony/Lucy,Post-Canon
107962,https://archiveofourown.org/works/26124490,,,,Crack
107963,https://archiveofourown.org/works/26124490,,,,Humor
107964,https://archiveofourown.org/works/26124490,,,,Plotless mostly


In [165]:
# Save the combined character/ship/tag table without the index column
char_rel_tag.to_csv('character_relationship_tags.csv', index=False)

In [156]:
# Write working_df (with its derived columns) to `filename`, replacing the raw scrape saved there earlier
working_df.to_csv(filename, index=False)