In [None]:
## Imports for scraping archiveofourown.org with Selenium (Chrome driver), requests and BeautifulSoup
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [None]:
## Setup chrome options
chrome_options = Options()
# Ensure GUI is off. The `Options.headless` property setter was deprecated in
# Selenium 4 and removed in later releases, where assigning it has no effect,
# so the headless mode is requested through the browser flag instead.
chrome_options.add_argument("--headless=new")
# chrome_options.add_argument("--window-size=1920,1200")
# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")


In [None]:
# Start the Chrome session used by the scraping helpers below; it is built
# from the driver service and the options configured in the previous cell.
browser = webdriver.Chrome(
    options=chrome_options,
    service=webdriver_service,
)

In [None]:
def get_links(browser):
    """Return the work URLs found on the page currently loaded in ``browser``.

    Every element matched by the XPath ``//ol[2]/li`` is treated as one work.
    For each of them the ``href`` of the first ``<a>`` inside its ``<h4>`` is
    collected.

    Args:
        browser: Selenium driver (or element) exposing ``find_elements``.

    Returns:
        list: one ``href`` value per matched list item, in page order.
    """
    work_items = browser.find_elements(By.XPATH, '//ol[2]/li')
    return [
        item.find_element(By.TAG_NAME, 'h4')
            .find_elements(By.TAG_NAME, 'a')[0]
            .get_attribute("href")
        for item in work_items
    ]

In [None]:
def check_list_inclusion(list1, list2):
    """Tell whether every element of ``list1`` is also contained in ``list2``.

    Membership is tested with the ``in`` operator of ``list2``. An empty
    ``list1`` yields ``True``.
    """
    for candidate in list1:
        if candidate not in list2:
            return False
    return True

In [None]:
def process_tv_book(str, new_links, known_links=None, max_pages=19, delay=10):
    """Walk the AO3 listing pages of one fandom tag and collect unseen work links.

    The listing URLs request ``sort_column=created_at``. Paging stops as soon
    as a page contains no link that is not already in ``known_links`` (an
    empty page also counts), or after ``max_pages`` pages.

    Args:
        str: ``'tv'`` or ``'book'``; selects which tag listing is crawled.
            (The name shadows the builtin and is kept only for backward
            compatibility with existing callers.)
        new_links: list that is extended in place with the links found.
        known_links: iterable of links that are already stored. Its *values*
            are used, so a pandas Series works as expected. ``None`` means
            nothing is known yet.
        max_pages: highest page number to request.
        delay: seconds to wait before each page request.

    Returns:
        list: ``new_links`` (the same object), extended with unseen links.

    Raises:
        ValueError: if ``str`` is neither ``'tv'`` nor ``'book'``.
    """
    if str not in ('tv', 'book'):
        raise ValueError(f"unknown source {str!r}; expected 'tv' or 'book'")
    # A set of the values gives reliable membership tests; ``x in series``
    # would test the index labels of a pandas Series instead of its values.
    known = set(known_links) if known_links is not None else set()
    for p in range(1, max_pages + 1):
        print(f'Processing page {p} of {str}')
        if str=='tv':
            link = f'https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20(TV)/works?commit=Sort+and+Filter&exclude_work_search[fandom_ids][]=1250871&page={p}&work_search[complete]=&work_search[crossover]=&work_search[date_from]=&work_search[date_to]=&work_search[excluded_tag_names]=&work_search[language_id]=&work_search[other_tag_names]=&work_search[query]=&work_search[sort_column]=created_at&work_search[words_from]=&work_search[words_to]='
        else:
            link = f'https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?commit=Sort+and+Filter&page={p}&work_search%5Bcomplete%5D=&work_search%5Bcrossover%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Blanguage_id%5D=&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Bsort_column%5D=created_at&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D='
        # Throttle before every request to keep the load on the site low.
        time.sleep(delay)
        browser.get(link)
        temp_links = get_links(browser)
        # all() over an empty page is True, so running past the last page
        # also ends the crawl.
        if all(found in known for found in temp_links):
            break
        # Keep only links that are not stored yet so callers do not end up
        # with duplicate rows for already-known works.
        new_links += [found for found in temp_links if found not in known]
    return new_links


def process_pages(links):
    """Collect links of works that are not yet in ``links`` into an empty frame.

    Both fandom listings (TV and book) are crawled with ``process_tv_book``.
    Links found in both listings are kept once, in first-seen order.

    Args:
        links: iterable of already-known work links (e.g. the ``link``
            column of the previous snapshot).

    Returns:
        pandas.DataFrame: one row per new link; ``link`` is filled and all
        other metadata columns are empty.
    """
    new_links = []
    for fandom in ('tv', 'book'):
        new_links = process_tv_book(fandom, new_links, known_links=links)
    # dict.fromkeys removes duplicates while preserving order (set() would not).
    new_links = list(dict.fromkeys(new_links))
    data = pd.DataFrame(columns=['link','title','author','published','updatedate','chapters','language','words','kudos','comments','bookmarks','hits','warning','relationship','characters','tags','summary','rating','series'])
    data['link'] = new_links
    return data

In [None]:
def get_data(data, delay=10, timeout=10, max_runtime=120*60):
    """Download and parse the work page of every row whose ``summary`` is missing.

    ``data`` is modified in place and also returned. Rows that already have a
    summary are skipped. Links that fail to download, or that take longer
    than ``timeout`` seconds, are collected and printed at the end; their
    rows are left untouched.

    Args:
        data: DataFrame with a ``link`` column and the metadata columns
            written below. Row labels are assumed to be unique.
        delay: seconds to sleep after each parsed page.
        timeout: request timeout in seconds; also the threshold above which
            a successful download is still treated as too slow.
        max_runtime: overall time budget in seconds; once exceeded, the
            remaining rows are left for a later run.

    Returns:
        pandas.DataFrame: the same ``data`` object.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
    # (column, tag, attrs, collapse newlines and strip?, value when absent)
    text_fields = [
        ('title', 'h2', {'class': 'title heading'}, True, np.nan),
        ('author', 'a', {'rel': 'author'}, False, "Anonymous"),
        ('published', 'dd', {'class': 'published'}, False, np.nan),
        ('chapters', 'dd', {'class': 'chapters'}, False, np.nan),
        ('language', 'dd', {'class': 'language'}, True, np.nan),
        ('words', 'dd', {'class': 'words'}, False, np.nan),
        ('kudos', 'dd', {'class': 'kudos'}, False, 0),
        ('comments', 'dd', {'class': 'comments'}, False, 0),
        ('bookmarks', 'dd', {'class': 'bookmarks'}, False, 0),
        ('hits', 'dd', {'class': 'hits'}, False, 0),
        ('warning', 'dd', {'class': 'warning tags'}, True, 0),
        ('series', 'span', {'class': 'position'}, True, 'not a series'),
        ('rating', 'dd', {'class': 'rating tags'}, True, np.nan),
    ]
    # (column, class of the <dd> whose <li> children hold the individual tags)
    list_fields = [
        ('relationship', 'relationship tags'),
        ('characters', 'character tags'),
        ('tags', 'freeform tags'),
    ]
    slow_links = [] # List to store links that could not be fetched in time
    total = len(data.index)
    # The budget covers the whole run, so the clock starts once, before the loop.
    run_started = time.time()
    for position, row_label in enumerate(data.index, start=1):
        if time.time() - run_started > max_runtime:
            print("Time budget exhausted; stopping early.")
            break
        if not pd.isnull(data.loc[row_label, 'summary']):
            continue
        print(f"getting missing data {position}/{total}")
        work_link = data.loc[row_label, 'link']
        fetch_started = time.time()
        try:
            response = requests.get(work_link + '?view_adult=true',
                                    headers=request_headers, timeout=timeout)
            # Error pages (404, 429, ...) must not be parsed as a work page.
            response.raise_for_status()
            source = response.text
        except requests.exceptions.RequestException as exc:
            print(f"Link {work_link} could not be fetched ({exc}). Adding to slow_links list.")
            slow_links.append(work_link)
            continue
        if time.time() - fetch_started > timeout:
            print(f"Link {work_link} is taking too long to access. Adding to slow_links list.")
            slow_links.append(work_link)
            continue
        soup = BeautifulSoup(source,'html.parser')

        for column, tag, attrs, flatten, fallback in text_fields:
            node = soup.find(tag, attrs=attrs)
            if node is None:
                data.loc[row_label, column] = fallback
                continue
            text = node.get_text()
            if flatten:
                text = text.replace('\n','').strip()
            data.loc[row_label, column] = text

        # Without a status entry the update date falls back to the
        # publication date, which was written by the loop above.
        status = soup.find('dd', attrs={'class':'status'})
        if status is not None:
            data.loc[row_label, 'updatedate'] = status.get_text()
        else:
            data.loc[row_label, 'updatedate'] = data.loc[row_label, 'published']

        for column, css_class in list_fields:
            container = soup.find('dd', attrs={'class': css_class})
            if container is None:
                names = []
            else:
                names = [li.get_text().strip() for li in container.find_all('li')]
            data.loc[row_label, column] = ', '.join(names)

        # Written last: a row whose summary stays NaN is retried on the next run.
        summary = soup.find('div', attrs={'class':'summary module'})
        if summary is None:
            data.loc[row_label, 'summary'] = np.nan
        else:
            data.loc[row_label, 'summary'] = summary.get_text().replace('\n', ' ').replace('Summary:','').strip()

        print(data.loc[row_label])
        # Throttle between page downloads.
        time.sleep(delay)
    for l in slow_links:
        print(l)
    return data

In [None]:
def update_data(data, delay=10, timeout=10, max_runtime=120*60):
    """Re-download every work in ``data`` and refresh its changeable fields.

    Refreshed columns: ``updatedate``, ``chapters``, ``words``, ``kudos``,
    ``comments``, ``bookmarks``, ``hits``, ``warning``, ``tags`` and
    ``rating``. ``data`` is modified in place and also returned. Links that
    fail to download, or that take longer than ``timeout`` seconds, are
    collected and printed at the end; their rows keep the previous values.

    Args:
        data: DataFrame with ``link``, ``published`` and the columns listed
            above. Row labels are assumed to be unique.
        delay: seconds to sleep after each parsed page.
        timeout: request timeout in seconds; also the threshold above which
            a successful download is still treated as too slow.
        max_runtime: overall time budget in seconds; once exceeded, the
            remaining rows are left unchanged.

    Returns:
        pandas.DataFrame: the same ``data`` object.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
    # (column, class of the <dd>, collapse newlines and strip?, value when absent)
    text_fields = [
        ('chapters', 'chapters', False, np.nan),
        ('words', 'words', False, np.nan),
        ('kudos', 'kudos', False, 0),
        ('comments', 'comments', False, 0),
        ('bookmarks', 'bookmarks', False, 0),
        ('hits', 'hits', False, 0),
        ('warning', 'warning tags', True, 0),
        ('rating', 'rating tags', True, np.nan),
    ]
    slow_links = [] # List to store links that could not be fetched in time
    total = len(data.index)
    # The budget covers the whole run, so the clock starts once, before the loop.
    run_started = time.time()
    for position, row_label in enumerate(data.index, start=1):
        if time.time() - run_started > max_runtime:
            print("Time budget exhausted; stopping early.")
            break
        print(f"updating data {position}/{total}")
        work_link = data.loc[row_label, 'link']
        fetch_started = time.time()
        try:
            response = requests.get(work_link + '?view_adult=true',
                                    headers=request_headers, timeout=timeout)
            # Error pages (404, 429, ...) must not overwrite stored values
            # with the "missing element" defaults.
            response.raise_for_status()
            source = response.text
        except requests.exceptions.RequestException as exc:
            print(f"Link {work_link} could not be fetched ({exc}). Adding to slow_links list.")
            slow_links.append(work_link)
            continue
        if time.time() - fetch_started > timeout:
            print(f"Link {work_link} is taking too long to access. Adding to slow_links list.")
            slow_links.append(work_link)
            continue
        soup = BeautifulSoup(source,'html.parser')

        # Without a status entry the update date falls back to the stored
        # publication date.
        status = soup.find('dd', attrs={'class':'status'})
        if status is not None:
            data.loc[row_label, 'updatedate'] = status.get_text()
        else:
            data.loc[row_label, 'updatedate'] = data.loc[row_label, 'published']

        for column, css_class, flatten, fallback in text_fields:
            node = soup.find('dd', attrs={'class': css_class})
            if node is None:
                data.loc[row_label, column] = fallback
                continue
            text = node.get_text()
            if flatten:
                text = text.replace('\n','').strip()
            data.loc[row_label, column] = text

        # Assign once after collecting, so an empty tag list clears the
        # column instead of leaving the previous value in place.
        tag_container = soup.find('dd', attrs={'class':'freeform tags'})
        if tag_container is None:
            tag_list = []
        else:
            tag_list = [li.get_text().strip() for li in tag_container.find_all('li')]
        data.loc[row_label, 'tags'] = ', '.join(tag_list)

        print(data.loc[row_label])
        # Throttle after every page, not only when a field is missing.
        time.sleep(delay)
    for l in slow_links:
        print(l)
    return data

In [None]:
# Timestamp suffix of the previous snapshot that is loaded as the baseline.
dt_string = '20230525_2238'
filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
prev_df = pd.read_csv(f'AO3/{filename}')
# datetime object containing current date and time
now = datetime.now()
dt_string = now.strftime("%Y%m%d_%H%M")
print("date and time =", dt_string)
# update_data writes into the frame it receives, so it gets an explicit copy:
# prev_df keeps the old values (its word counts are used further down) and
# pandas has no ambiguity about writing into a selection of another frame.
working_df = prev_df[['link','title','author','published','updatedate','chapters','language','words','kudos','comments','bookmarks','hits','warning','relationship','characters','tags','summary','rating','series']].copy()
working_df = update_data(working_df)

In [None]:
# Find works that are not in the snapshot yet and download their metadata.
new_df = process_pages(working_df['link'])
new_df = get_data(new_df)
# Both frames are numbered from 0; renumber so row labels stay unique.
working_df = pd.concat([new_df, working_df], ignore_index=True)

In [None]:
filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
# Write into the same directory that is read back below. to_csv returns None
# when given a path, so its result must not be bound to working_df.
working_df.to_csv(f'AO3/{filename}', index=False)
# Reload so the following cells work on the values exactly as stored.
working_df = pd.read_csv(f'AO3/{filename}')

In [None]:
# Break "chapters" at "/" into the chapter and chapter_max columns, then label a
# work 'completed' when both parts are equal and 'incomplete' otherwise.
chapter_parts = working_df['chapters'].str.split("/", expand=True)
working_df[['chapter','chapter_max']] = chapter_parts
working_df['completion'] = working_df.apply(
    lambda row: 'incomplete' if row['chapter'] != row['chapter_max'] else 'completed',
    axis=1,
)

In [None]:
# Parse the date columns and measure, in days, how far each work's publication
# and last update lie behind the most recent update in the whole data set.
working_df['published'] = pd.to_datetime(working_df['published'])
working_df['updatedate'] = pd.to_datetime(working_df['updatedate'])
# Series.max() skips missing dates; the builtin max() compares element by
# element and can return NaT depending on where a missing value sits.
working_df['currentdate'] = working_df['updatedate'].max()
working_df['datediff_pub'] = (working_df['currentdate']-working_df['published'])/np.timedelta64(1,'D')
working_df['datediff'] = (working_df['currentdate']-working_df['updatedate'])/np.timedelta64(1,'D')

In [None]:
def _classify_work(row):
    """Return the classification label for one row of ``working_df``.

    Checked in order: a ``chapter_max`` of ``'1'`` is a oneshot, a completed
    work is a complete multichapter, a ``datediff`` of at most 60 counts as
    still updating, and everything else is dormant.
    """
    if row['chapter_max']=='1':
        return 'oneshot'
    if row['completion']=='completed':
        return 'multichapter(complete)'
    if row['datediff']<=60:
        return 'multichapter(updating)'
    return 'multichapter(dormant)'


working_df['classification'] = working_df.apply(_classify_work, axis=1)


In [None]:
def get_num_item(column):
    """Count the comma-separated entries in every value of ``column``.

    Square brackets and quote characters are removed before splitting at
    ``","``. Values that are not strings (for example NaN or None) and empty
    strings count as 0.

    Args:
        column: iterable of cell values.

    Returns:
        list[int]: one count per value, in input order.
    """
    counts = []
    for row in column:
        # Only strings can be split; anything else is a missing value.
        if not isinstance(row, str):
            counts.append(0)
            continue
        parts = row.replace("[","").replace("]","").replace("'","").replace('"','').split(",")
        # Splitting an empty string yields [''], which means "no entries".
        counts.append(0 if parts == [''] else len(parts))
    return counts

In [None]:
# One row per author: latest 'updatedate' and earliest 'published', renamed to
# lastauthorupdate / firstauthorupdate.
author_df = (
    working_df
    .groupby(['author'], as_index=False)
    .agg({'updatedate':'max', 'published':'min'})
    .rename(columns={'updatedate':'lastauthorupdate','published':'firstauthorupdate'})
)

In [None]:
# Attach the per-author dates to every work. Columns left behind by an
# earlier run of this cell (plain or with merge suffixes) are removed first,
# so re-running the cell always ends with exactly one 'firstauthorupdate' and
# one 'lastauthorupdate' column.
stale_author_columns = [
    'firstauthorupdate', 'lastauthorupdate',
    'firstauthorupdate_x', 'lastauthorupdate_x',
    'firstauthorupdate_y', 'lastauthorupdate_y',
]
working_df = (
    working_df
    .drop(columns=stale_author_columns, errors='ignore')
    .merge(author_df, how='left', on='author')
)

In [None]:
# Day-based author metrics; an author counts as 'active' when the last update
# is at most 60 days behind 'currentdate'.
one_day = np.timedelta64(1,'D')
working_df['author_lastupdate_diff'] = (working_df['currentdate'] - working_df['lastauthorupdate']) / one_day
working_df['daysactive'] = (working_df['lastauthorupdate'] - working_df['firstauthorupdate']) / one_day
working_df['daysincefirtupload'] = (working_df['currentdate'] - working_df['firstauthorupdate']) / one_day
working_df['author_activity'] = (
    working_df['author_lastupdate_diff']
    .le(60)
    .map({True: 'active', False: 'inactive'})
)

In [None]:
# Store the number of entries of each multi-value column in a num_* column.
for source_col, count_col in (
    ('relationship', 'num_relationship'),
    ('characters', 'num_characters'),
    ('tags', 'num_tags'),
):
    working_df[count_col] = get_num_item(working_df[source_col])

In [None]:
# Bring the word count of the previous snapshot in as 'prev_words'; works that
# have no match in the previous snapshot get 0.
prev_df = prev_df[['link', 'words']].rename(columns={'words':'prev_words'})
working_df = working_df.merge(prev_df, on='link', how='left')
working_df['prev_words'] = working_df['prev_words'].fillna(0)

In [None]:
def get_df_item(id_column,item_column, name_col):
    """Explode comma-separated cell values into one ``(link, item)`` row each.

    Square brackets and quote characters are removed before splitting at
    ``","``; every item is stripped. Items containing ``'&'`` are dropped. A
    value that is not a string (for example NaN) produces a single row with
    an empty item.

    The two inputs are paired by position, so any index is accepted
    (the previous ``column[x]`` lookups required labels 0..n-1).

    Args:
        id_column: iterable of identifiers, one per source row.
        item_column: iterable of cell values, aligned with ``id_column``.
        name_col: name of the item column in the result.

    Returns:
        pandas.DataFrame: columns ``['link', name_col]``.
    """
    item_list = []
    for row_id, raw_items in zip(id_column, item_column):
        if isinstance(raw_items, str):
            row_item = raw_items.replace("[","").replace("]","").replace("'","").replace('"','').split(",")
        else:
            # Missing value: keep the row with an empty item.
            row_item = ['']
        for item in row_item:
            item = item.strip()
            if '&' not in item:
                item_list.append([row_id, item])
    return pd.DataFrame(item_list, columns = ['link', name_col])

In [None]:
# One row per (link, character tag); tags found in AO3/characters.csv are
# replaced by its 'character' value, all others keep the raw tag.
character = pd.read_csv('AO3/characters.csv')
char_df = (
    get_df_item(working_df['link'], working_df['characters'], 'charactername')
    .merge(character, how='left', on='charactername')
)
char_df['character'] = char_df['character'].fillna(char_df['charactername'])
char_df = char_df.drop(columns='charactername')
char_df

In [None]:
# One row per (link, relationship tag); tags found in AO3/relationships.csv
# are replaced by its 'ship' value, all others keep the raw tag.
relationship = pd.read_csv('AO3/relationships.csv')
relationship_df = (
    get_df_item(working_df['link'], working_df['relationship'], 'shiptag')
    .merge(relationship, how='left', on='shiptag')
)
relationship_df['ship'] = relationship_df['ship'].fillna(relationship_df['shiptag'])
relationship_df = relationship_df.drop(columns='shiptag')
relationship_df

In [None]:
# One row per (link, freeform tag).
tags_df = get_df_item(
    id_column=working_df['link'],
    item_column=working_df['tags'],
    name_col='tag_item',
)


In [None]:
# Outer-join the character, relationship and tag tables on 'link'.
char_rel_tag = (
    char_df
    .merge(relationship_df, how='outer', on='link')
    .merge(tags_df, how='outer', on='link')
)
char_rel_tag

In [None]:
char_rel_tag.to_csv('AO3/character_relationship_tags.csv', index=False)
# read_csv has no `index` keyword (that option belongs to to_csv); passing it
# raises a TypeError, so the snapshot is reloaded without it.
working_df = pd.read_csv(f'AO3/{filename}')