In [2]:
import ast
import os
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

In [8]:
# Works listing for the scraped tag; callers append the page number to this URL.
BASE_URL = "https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="

# Setup chrome options
chrome_options = Options()
# Ensure GUI is off. The `Options.headless` setter is deprecated since
# Selenium 4.8 and removed in later 4.x releases; passing the command-line
# switch works on both old and new versions.
chrome_options.add_argument("--headless")

# The chromedriver binary is expected under ~/ao3lockwood-co/.
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

def get_links(page_number):
    """Collect the work URLs listed on one page of the tag's works listing.

    Args:
        page_number: Page index appended to ``BASE_URL``.

    Returns:
        A list with one ``href`` string per ``<li>`` matched by ``//ol[2]/li``
        (the first ``<a>`` inside each entry's ``<h4>``).
    """
    browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    try:
        browser.get(BASE_URL + str(page_number))
        print('hello')
        time.sleep(10)  # ensure page is loaded
        works = browser.find_elements(By.XPATH, '//ol[2]/li')
        return [
            work.find_element(By.TAG_NAME, 'h4').find_elements(By.TAG_NAME, 'a')[0].get_attribute("href")
            for work in works
        ]
    finally:
        # Always shut the Chrome process down, even when loading or parsing
        # raises; otherwise every failed page leaves a browser running.
        browser.quit()

def get_data(link, timeout=30):
    """Download a single work page and extract its metadata.

    Args:
        link: URL of the work; ``?view_adult=true`` is appended before the
            request is made.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A dict with one entry per field in ``attrs_map`` plus ``mainship``,
        ``relationship``, ``characters``, ``tags`` and ``series``, or ``None``
        when the request raises a ``requests`` exception.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
    link += '?view_adult=true'
    try:
        # requests waits indefinitely unless a timeout is given, so without it
        # a stalled connection would hang the scrape and the handler below
        # could never report a slow link.
        source = requests.get(link, headers=headers, timeout=timeout).text
        soup = BeautifulSoup(source, 'html.parser')
    except requests.exceptions.RequestException:
        print(f"Link {link} is taking too long to access. Adding to slow_links list.")
        return None

    # field name -> (tag name, attribute filter) used to locate it in the page
    attrs_map = {
        'title': ('h2', {'class':'title heading'}),
        'author': ('a', {'rel':'author'}),
        'published': ('dd', {'class':'published'}),
        'updatedate': ('dd', {'class':'status'}),
        'chapters': ('dd', {'class':'chapters'}),
        'language': ('dd', {'class':'language'}),
        'words': ('dd', {'class':'words'}),
        'kudos': ('dd', {'class':'kudos'}),
        'comments': ('dd', {'class':'comments'}),
        'bookmarks': ('dd', {'class':'bookmarks'}),
        'hits': ('dd', {'class':'hits'}),
        'warning': ('dd', {'class':'warning tags'}),
        'summary': ('div', {'class':'summary module'}),
        'rating': ('dd', {'class':'rating tags'}),
    }

    data = {}

    for key, value in attrs_map.items():
        try:
            if key in ["author", "updatedate", "summary"]:
                # These fields keep their internal newlines.
                data[key] = soup.find(value[0], value[1]).get_text().strip()
            else:
                data[key] = soup.find(value[0], value[1]).get_text().replace('\n','').strip()
        except AttributeError:
            # soup.find returned None: the element is absent from the page.
            if key == "author":
                data[key] = "Anonymous"
            elif key == "updatedate":
                # 'published' precedes 'updatedate' in attrs_map, so it has
                # already been filled in (possibly with its own default).
                data[key] = data["published"]
            elif key == "summary":
                data[key] = np.nan
            else:
                data[key] = 0

    data['mainship'], data['relationship'] = extract_relationships(soup)
    data['characters'] = extract_tags(soup, 'character tags')
    data['tags'] = extract_tags(soup, 'freeform tags')
    data['series'] = extract_series(soup)

    return data

def extract_relationships(soup):
    """Return the first relationship tag and the full list of relationship tags.

    Args:
        soup: Parsed work page; only ``find`` is called on it.

    Returns:
        ``(mainrelationship, relationship_list)``. When the page has no
        ``dd.relationship.tags`` element, or that element contains no tag
        links, the result is ``('None', [])``.
    """
    ships = soup.find('dd', attrs={'class':'relationship tags'})
    if ships is None:
        return 'None', []

    relationship_list = [r.get_text().strip() for r in ships.find_all('a', attrs={'class':'tag'})]
    if not relationship_list:
        return 'None', []

    # Explicit checks replace a bare `except:`, which also swallowed
    # KeyboardInterrupt and any genuine programming error.
    return relationship_list[0], relationship_list

def extract_tags(soup, class_name):
    """Return the stripped text of every tag link inside one ``<dd>`` section.

    Args:
        soup: Parsed work page; only ``find`` is called on it.
        class_name: Value of the ``class`` attribute identifying the section
            (callers in this file pass ``'character tags'`` and
            ``'freeform tags'``).

    Returns:
        A list of tag strings; empty when the section is missing.
    """
    tags_section = soup.find('dd', attrs={'class': class_name})
    if tags_section is None:
        # Explicit check instead of a bare `except:` so that interrupts and
        # real errors are no longer silently turned into an empty list.
        return []
    return [t.get_text().strip() for t in tags_section.find_all('a', attrs={'class':'tag'})]

def extract_series(soup):
    """Return the link texts found inside the first ``span.position`` element.

    Args:
        soup: Parsed work page; only ``find`` is called on it.

    Returns:
        A list of stripped link texts; empty when the page has no
        ``span.position`` element.
    """
    # NOTE(review): only the first span.position is inspected — confirm
    # whether a work can carry several of them.
    position = soup.find('span', attrs={'class':'position'})
    if position is None:
        # Explicit check instead of a bare `except:` so that interrupts and
        # real errors are no longer silently turned into an empty list.
        return []
    return [p.get_text().strip() for p in position.find_all('a')]

def main():
    """Scrape every page of the tag listing and save the works' metadata to CSV.

    Steps: read the page count from the first listing page, collect the work
    links page by page, download each work's metadata, derive the
    ``chapter`` / ``chapter_max`` / ``completion`` columns and write
    ``ao3_lockwood_and_co_ao_<ddmmYYYY_HHMM>.csv`` to the working directory.
    """
    browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    try:
        browser.get(BASE_URL+'1')
        # NOTE(review): assumes the 13th <li> of the first <ol> holds the last
        # page number — confirm against the current page layout.
        max_page_num = int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())
    finally:
        # Release the Chrome process even if the lookup above fails.
        browser.quit()

    print(f'Total pages: {max_page_num}')

    # datetime object containing current date and time
    now = datetime.now()

    # ddmmYYYY_HHMM, used as the output file's suffix
    dt_string = now.strftime("%d%m%Y_%H%M")
    print("date and time =", dt_string)

    # A single worker keeps the requests strictly sequential.
    with ThreadPoolExecutor(max_workers=1) as executor:
        link_pages = list(executor.map(get_links, range(1, max_page_num + 1)))
    links = [link for page in link_pages for link in page]
    print('links collected')

    with ThreadPoolExecutor(max_workers=1) as executor:
        results = list(executor.map(get_data, links))
    # get_data returns None for links it could not download; pd.DataFrame
    # cannot build rows from None, so drop those entries first.
    data = [record for record in results if record is not None]
    print('data collected')

    skipped = len(results) - len(data)
    if skipped:
        print(f'{skipped} link(s) could not be downloaded and were skipped')
    if not data:
        # Nothing to tabulate; the column operations below need a 'chapters' column.
        print('No works could be downloaded; nothing to save.')
        return

    final = pd.DataFrame(data)
    # Split the chapter column into chapter and chapter_max, and create a completion column
    final[['chapter','chapter_max']] = final.chapters.str.split("/", expand=True)
    final['completion'] = final.apply(lambda row: 'completed' if row['chapter']==row['chapter_max'] else 'incomplete', axis=1)
    filename=f'ao3_lockwood_and_co_ao_{dt_string}.csv'
    final.to_csv(filename, index=False)


# Run the full scrape when this cell/script is executed directly.
if __name__ == '__main__':
    main()

Total pages: 72
date and time = 05052023_1100
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello


KeyboardInterrupt: 

In [None]:
def get_unique_items(column):
    """Count how often each (whitespace-stripped) item occurs across all rows.

    Args:
        column: Iterable of rows, each row an iterable of strings.

    Returns:
        A dict mapping item -> occurrence count, with keys in sorted order.
    """
    counts = {}
    stripped_items = (raw.strip() for row in column for raw in row)
    for entry in stripped_items:
        counts[entry] = counts.get(entry, 0) + 1
    # Keys are unique, so sorting the (key, count) pairs orders by key alone.
    return dict(sorted(counts.items()))

def get_item_count(column):
    """Return the number of items in each row of ``column``.

    A row equal to ``['']`` (a single empty string) is counted as 0.
    """
    sizes = []
    for row in column:
        if row != ['']:
            sizes.append(len(row))
        else:
            sizes.append(0)
    return sizes

def get_df_item(title_column, item_column, name_col):
    """Build a long-format (title, item) table from per-title item lists.

    Args:
        title_column: Iterable of titles.
        item_column: Iterable of item lists, aligned with ``title_column``.
        name_col: Name given to the item column of the result.

    Returns:
        A DataFrame with columns ``['title', name_col]`` holding one row per
        stripped item; items containing ``'&'`` are left out.
    """
    pairs = [
        [title, cleaned]
        for title, row_items in zip(title_column, item_column)
        for cleaned in (raw.strip() for raw in row_items)
        if '&' not in cleaned
    ]
    return pd.DataFrame(pairs, columns=['title', name_col])

def _classify_work(row):
    """Label one work as oneshot / multichapter(complete|updating|dormant)."""
    if row['chapter_max'] == '1':
        return 'oneshot'
    if row['completion'] == 'completed':
        return 'multichapter(complete)'
    # An unfinished work counts as still updating if touched within 60 days
    # of the newest update date in the data set.
    if row['datediff'] <= 60:
        return 'multichapter(updating)'
    return 'multichapter(dormant)'


def process_dataframe(filename):
    """Add derived date, author-activity and count columns to a scraped CSV.

    The enriched table is written back to ``filename`` (the input file is
    overwritten) and the per-title character table is written to
    ``character_relationship_tags.csv`` in the working directory.

    Args:
        filename: Path of the CSV to read and overwrite. Its ``characters``,
            ``relationship`` and ``tags`` columns must hold Python list
            literals.
    """
    list_columns = ['characters', 'relationship', 'tags']
    df = pd.read_csv(
        filename,
        # literal_eval parses the stored list literals without executing
        # arbitrary code from the file, which plain eval would do.
        converters={col: ast.literal_eval for col in list_columns},
        # Keep chapter_max as text: when every value is numeric pandas would
        # infer a number dtype and the `== '1'` oneshot test would never match.
        dtype={'chapter_max': str},
    )
    df['published'] = pd.to_datetime(df['published'])
    df['updatedate'] = pd.to_datetime(df['updatedate'])
    # The newest update in the data set serves as "today" for all day counts.
    current_date = df['updatedate'].max()
    df['datediff_pub'] = (current_date - df['published']).dt.days
    df['datediff'] = (current_date - df['updatedate']).dt.days
    df['classification'] = df.apply(_classify_work, axis=1)

    author_df = df.groupby(['author'], as_index=False).agg({'updatedate': 'max', 'published': 'min'}).rename(columns={'updatedate': 'lastauthorupdate', 'published': 'firstauthorupdate'})
    # The output overwrites the input, so on a second run these columns already
    # exist; drop them so the merge does not produce _x/_y suffixed duplicates.
    df = df.drop(columns=['lastauthorupdate', 'firstauthorupdate'], errors='ignore')
    df = df.merge(author_df, how='left', on='author')
    df['author_lastupdate_diff'] = (current_date - df['lastauthorupdate']).dt.days
    df['daysactive'] = (df['lastauthorupdate'] - df['firstauthorupdate']).dt.days
    df['daysincefirtupload'] = (current_date - df['firstauthorupdate']).dt.days
    df['author_activity'] = df['author_lastupdate_diff'].apply(lambda x: 'active' if x <= 60 else 'inactive')

    df['num_relationship'] = get_item_count(df['relationship'])
    df['num_characters'] = get_item_count(df['characters'])
    df['num_tags'] = get_item_count(df['tags'])

    char_df = get_df_item(df['title'], df['characters'], 'charactername')
    # NOTE(review): relationship_df and tags_df are built but never saved or
    # returned; only char_df is written below — confirm the intended output.
    relationship_df = get_df_item(df['title'], df['relationship'], 'shiptag')
    tags_df = get_df_item(df['title'], df['tags'], 'tag_item')

    char_df.to_csv('character_relationship_tags.csv', index=False)
    df.to_csv(filename, index=False)

# Entry point for the post-processing step.
# NOTE(review): 'input.csv' is hard-coded, while main() saves its result as
# 'ao3_lockwood_and_co_ao_<ddmmYYYY_HHMM>.csv' — confirm which file is meant.
if __name__ == '__main__':
    process_dataframe('input.csv')