In [7]:
## Run selenium and chrome driver to scrape data from cloudbytes.dev
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import sqlite3
import chromedriver_autoinstaller
import re

In [8]:
## Setup chrome options
chrome_options = Options()
chrome_options.headless = True # Ensure GUI is off
chrome_options.add_argument("--window-size=1920,1200")
# Set path to chromedriver as per your configuration
#homedir = os.path.expanduser("~")
#webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")
chromedriver_autoinstaller.install()

# Initialize Chrome browser
browser = webdriver.Chrome(options=chrome_options)

In [9]:
# get the links from the database
conn = sqlite3.connect('output/ao3.db')
querry = "SELECT link FROM fanfic"
link_list = pd.read_sql_query(querry, conn)
print(link_list)
# get the list of authors from the database
querry = "SELECT * FROM authors"
author_list = pd.read_sql_query(querry, conn)
print(author_list)
conn.close()

                                            link
0     https://archiveofourown.org/works/44791411
1     https://archiveofourown.org/works/47484628
2     https://archiveofourown.org/works/45235999
3     https://archiveofourown.org/works/45056584
4     https://archiveofourown.org/works/47517634
...                                          ...
2327  https://archiveofourown.org/works/48203275
2328  https://archiveofourown.org/works/48856123
2329  https://archiveofourown.org/works/48862312
2330  https://archiveofourown.org/works/48854899
2331  https://archiveofourown.org/works/48872416

[2332 rows x 1 columns]
    author_id                     author  joined_ao3
0    17235478                 Chocy_Milk  2023-04-18
1    13655830                    Squid52  2022-03-10
2     1760866           literaryelegance  2016-01-08
3        6999            Cornerofmadness  2010-05-17
4    14646558                Telvannicon  2022-07-15
..        ...                        ...         ...
723   6561073   

In [10]:
def get_links(browser):
    # Find all the works using XPath
    works = browser.find_elements(By.XPATH, '//ol[2]/li')

    # Iterate through each work and extract author and datetime
    data = []
    for work in works:
        h4 = work.find_element(By.TAG_NAME, 'h4')
        a = h4.find_elements(By.TAG_NAME, 'a')
        # Get the href attribute of the first <a> tag
        link = a[0].get_attribute("href")
        data.append(link)
        #print(link)
    return data

In [11]:
def process_tv_book(category, links):
    temp_links = []
    if category == "books":
        for page in range(1, 23):
            print(f'processing page {page}')
            link = f'https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page={page}'
            time.sleep(10)
            browser.get(link)
            temp_links+=(get_links(browser))
    elif category == "tv":
        for page in range(1, 5):
            print(f'processing page {page}')
            link = f'https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20(TV)/works?commit=Sort+and+Filter&exclude_work_search%5Bfandom_ids%5D%5B%5D=1250871&page={page}&work_search%5Bcomplete%5D=&work_search%5Bcrossover%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Blanguage_id%5D=&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Bsort_column%5D=revised_at&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D='
            time.sleep(10)
            browser.get(link)
            temp_links+=(get_links(browser))
    update_links = list(set(temp_links) & set(links))
    print(f'found {len(update_links)} updated links')
    new_links = list(set(temp_links) - set(links))
    print(f'found {len(new_links)} new links')
    return update_links, new_links


In [12]:
tv_update_links, tv_new_links = process_tv_book("tv", link_list.link)

processing page 1
processing page 2
processing page 3
processing page 4
found 80 updated links
found 0 new links


In [13]:
book_update_links, book_new_links = process_tv_book("books", link_list.link)

processing page 1
processing page 2
processing page 3
processing page 4
processing page 5
processing page 6
processing page 7
processing page 8
processing page 9
processing page 10
processing page 11
processing page 12
processing page 13
processing page 14
processing page 15
processing page 16
processing page 17
processing page 18
processing page 19
processing page 20
processing page 21
processing page 22
found 434 updated links
found 6 new links


In [14]:
def get_seriestags(links, series, collections, pairings, characters, relationships, tags):
    for x in range(len(links)):
        print(f"getting fanfic {x+1}/{len(links)}")
        try:
            newlink=links.loc[x,'link']+'?view_adult=true'
            fanfic = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        except requests.exceptions.RequestException:
            print(f"fanfic {links.loc[x,'link']} is taking too long to access.")
            continue
        fanfic = BeautifulSoup(fanfic,'html.parser')
        try:
            series_names = fanfic.find('dd', attrs={'class':'series'})
            series_names = series_names.find_all('span', attrs={'class':'position'})
            new_rows = []
            for series_name in series_names:
                new_row = [links.loc[x,'fanfic_id'], series_name.find('a').get_text()]
                new_rows.append(new_row)
            series = series.append(pd.DataFrame(new_rows, columns=series.columns))
            series = series.drop_duplicates()
        except:
            print('not in a series')
        try:
            collection_names = fanfic.find('dd', attrs={'class':'collections'})
            collection_names = collection_names.find_all('a')
            new_rows = []
            for collection_name in collection_names:
                new_row = [links.loc[x,'fanfic_id'], collection_name.get_text()]
                new_rows.append(new_row)
            collections = collections.append(pd.DataFrame(new_rows, columns=collections.columns))
            collections = collections.drop_duplicates()
        except:
            print('not in a collection')
        try:
            pairing_names = fanfic.find('dd', attrs={'class':'category tags'})
            pairing_names = pairing_names.find_all('a', attrs={'class':'tag'})
            new_rows = []
            for pairing_name in pairing_names:
                new_row = [links.loc[x,'fanfic_id'], pairing_name.get_text()]
                new_rows.append(new_row)
            pairings = pairings.append(pd.DataFrame(new_rows, columns=characters.columns))
            pairings = pairings.drop_duplicates()
        except:
            print('no pairing tags')
        try:
            update_date = fanfic.find('dd', attrs={'class':'status'}).get_text()
        except:
            try:
                update_date = fanfic.find('dd', attrs={'class':'published'}).get_text()
            except:
                update_date = np.nan
                print('no update date')
        try:
            character_names = fanfic.find('dd', attrs={'class':'character tags'})
            character_names = character_names.find_all('a', attrs={'class':'tag'})
            new_rows = []
            for character_name in character_names:
                new_row = [links.loc[x,'fanfic_id'], update_date, character_name.get_text()]
                new_rows.append(new_row)
            characters = characters.append(pd.DataFrame(new_rows, columns=characters.columns))
            characters = characters.drop_duplicates()
        except:
            print('no character tags')
        try:
            relationship_names = fanfic.find('dd', attrs={'class':'relationship tags'})
            relationship_names = relationship_names.find_all('a', attrs={'class':'tag'})
            new_rows = []
            for relationship_name in relationship_names:
                new_row = [links.loc[x,'fanfic_id'], update_date, relationship_name.get_text()]
                new_rows.append(new_row)
            relationships = relationships.append(pd.DataFrame(new_rows, columns=relationships.columns))
            relationships = relationships.drop_duplicates()
        except:
            print('no relationship tags')
        try:
            tag_names = fanfic.find('dd', attrs={'class':'freeform tags'})
            tag_names = tag_names.find_all('a', attrs={'class':'tag'})
            new_rows = []
            for tag_name in tag_names:
                new_row = [links.loc[x,'fanfic_id'], update_date, tag_name.get_text()]
                new_rows.append(new_row)
            tags = tags.append(pd.DataFrame(new_rows, columns=tags.columns))
            tags = tags.drop_duplicates()
        except:
            print('no tags')
        time.sleep(10)
    return series, collections, pairings, characters, relationships, tags

In [15]:
def get_series(fanfic, fanfic_id, series):
    try:
        series_names = fanfic.find('dd', attrs={'class':'series'})
        series_names = series_names.find_all('span', attrs={'class':'position'})
        new_rows = []
        for series_name in series_names:
            new_row = [fanfic_id, series_name.find('a').get_text()]
            new_rows.append(new_row)
        series = series.append(pd.DataFrame(new_rows, columns=series.columns))
        series = series.drop_duplicates()
    except:
        print('not in a series')
    return series

In [16]:
def get_collections(fanfic, fanfic_id, collections):
    try:
        collection_names = fanfic.find('dd', attrs={'class':'collections'})
        collection_names = collection_names.find_all('a')
        new_rows = []
        for collection_name in collection_names:
            new_row = [fanfic_id, collection_name.get_text()]
            new_rows.append(new_row)
        collections = collections.append(pd.DataFrame(new_rows, columns=collections.columns))
        collections = collections.drop_duplicates()
    except:
        print('not in a collection')
    return collections

In [17]:
def get_pairings(fanfic, fanfic_id, pairings):
    try:
        pairing_names = fanfic.find('dd', attrs={'class':'category tags'})
        pairing_names = pairing_names.find_all('a', attrs={'class':'tag'})
        new_rows = []
        for pairing_name in pairing_names:
            new_row = [fanfic_id, pairing_name.get_text()]
            new_rows.append(new_row)
        pairings = pairings.append(pd.DataFrame(new_rows, columns=pairings.columns))
        pairings = pairings.drop_duplicates()
    except:
        print('no pairing tags')
    return pairings

In [18]:
def get_update_date(fanfic):
    try:
        update_date = fanfic.find('dd', attrs={'class':'status'}).get_text()
    except:
        try:
            update_date = fanfic.find('dd', attrs={'class':'published'}).get_text()
        except:
            update_date = np.nan
            print('no update date')
    return update_date

In [19]:
def get_characters(fanfic, fanfic_id, characters, update_date):
    try:
        character_names = fanfic.find('dd', attrs={'class':'character tags'})
        character_names = character_names.find_all('a', attrs={'class':'tag'})
        new_rows = []
        for character_name in character_names:
            new_row = [fanfic_id, update_date, character_name.get_text()]
            new_rows.append(new_row)
        characters = characters.append(pd.DataFrame(new_rows, columns=characters.columns))
        characters = characters.drop_duplicates()
    except:
        print('no character tags')
    return characters

In [20]:
def get_relationships(fanfic, fanfic_id, update_date, relationships):
    try:
        relationship_names = fanfic.find('dd', attrs={'class':'relationship tags'})
        relationship_names = relationship_names.find_all('a', attrs={'class':'tag'})
        new_rows = []
        for relationship_name in relationship_names:
            new_row = [fanfic_id, update_date, relationship_name.get_text()]
            new_rows.append(new_row)
        relationships = relationships.append(pd.DataFrame(new_rows, columns=relationships.columns))
        relationships = relationships.drop_duplicates()
    except:
        print('no relationship tags')
    return relationships

In [21]:
def get_tags(fanfic, fanfic_id, update_date, tags):
    try:
        tag_names = fanfic.find('dd', attrs={'class':'freeform tags'})
        tag_names = tag_names.find_all('a', attrs={'class':'tag'})
        new_rows = []
        for tag_name in tag_names:
            new_row = [fanfic_id, update_date, tag_name.get_text()]
            new_rows.append(new_row)
        tags = tags.append(pd.DataFrame(new_rows, columns=tags.columns))
        tags = tags.drop_duplicates()
    except:
        print('no tags')
    return tags

In [22]:
def get_authors(author_list, authors, fandoms):
    for x in range(len(authors)):
        print(f"getting author {x+1}/{len(authors)}")
        author_orig = authors.loc[x,'author']
        print(author_orig)
        if author_orig !=np.nan:
            author_orig = str(author_orig)
            matches = re.findall(r'\((.*?)\)', author_orig)
            if matches:
                author_orig = matches[0]
        else:
            continue
        
        try:
            newlink=f"https://archiveofourown.org/users/{author_orig}/profile"
            profile = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        except requests.exceptions.RequestException:
            print(f"author {authors['author'][x]} fandom is taking too long to access.")
            continue
        profile = BeautifulSoup(profile,'html.parser')
        #print(newlink)
        try:
            pseuds = profile.find('dl', attrs={'class':'meta'})
            
            pseuds = pseuds.find_all('dd')
            joined = pseuds[1].get_text()
            id = pseuds[2].get_text()
        except:
            joined = np.nan
            id = np.nan
        print(id, joined)
        authors.loc[x,'author_id'] = id
        authors.loc[x,'joined_ao3'] = joined
        time.sleep(10)
        try:
            newlink=f"https://archiveofourown.org/users/{author_orig}"
            user = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        except requests.exceptions.RequestException:
            print(f"author {authors['author'][x]} profile is taking too long to access.")
            continue
        user = BeautifulSoup(user,'html.parser')
        #print(newlink)
        try:
            index = user.find('ol', attrs={'class':'index group'})
            index = index.find_all('li')
            for i in range(len(index)):
                li = index[i].get_text().split('(')
                fandom = li[0].strip()
                num_fic = li[-1].split(')')[0]
                fandoms.loc[len(fandoms)] = [id, fandom, num_fic]
        except:
            print(f"author {authors['author'][x]} fandom is taking too long to access.")
        time.sleep(10)
    authors = authors.append(author_list)
    return authors, fandoms

In [23]:
def get_fanfic_data(story, fanfic, fanfic_id,  link):
    try:
        title = story.find('h2', attrs={'class':'title heading'}).get_text().replace('\n','').strip()
    except:
        title = np.nan
    try:
        author = story.find('a', attrs={'rel':'author'}).get_text()
    except:
        author = np.nan
    try:
        published = story.find('dd', attrs={'class':'published'}).get_text()
    except:
        published = np.nan
    try:
        language = story.find('dd', attrs={'class':'language'}).get_text().replace('\n','').strip()
    except:
        language = np.nan
    try:
        summary = story.find('div', attrs={'class':'summary module'}).get_text().replace('\n', ' ').replace('Summary:','').strip()
    except:
        summary = np.nan
    try:
        warning = story.find('dd', attrs={'class':'warning tags'}).get_text().replace('\n','').strip()
    except:
        warning = np.nan
    new_row = [fanfic_id, link, title, author, published, language, summary, warning]
    fanfic = fanfic.append(pd.DataFrame([new_row], columns=fanfic.columns))
    print(new_row)
    return fanfic
    

In [35]:
def get_updates(story, fanfic_id, update_date, updates):
    try:
        words=story.find('dd', attrs={'class': 'words'}).get_text()
    except:
        words=np.nan
    try:
        chapters=story.find('dd', attrs={'class': 'chapters'}).get_text()
        chapter, chapter_max = chapters.split('/')
    except:
        chapters=np.nan
        chapter=np.nan
        chapter_max=np.nan    
    try:
        rating= story.find('dd', attrs={'class':'rating tags'}).get_text().replace('\n','').strip()  
    except:
        rating=np.nan
    new_row = [fanfic_id, update_date, words, chapter, chapter_max, rating]
    updates = updates.append(pd.DataFrame([new_row], columns=updates.columns))
    print(new_row)
    return updates     

In [25]:
def get_user_engagement(story, fanfic_id, user_engagement):
    try:
        kudos=story.find('dd', attrs={'class': 'kudos'}).get_text()
    except:
        kudos=np.nan
    try:
        bookmarks=story.find('dd', attrs={'class': 'bookmarks'}).get_text()
    except:
        bookmarks=np.nan
    try:
        comments=story.find('dd', attrs={'class': 'comments'}).get_text()
    except:
        comments=np.nan
    try:
        hits=story.find('dd', attrs={'class': 'hits'}).get_text()
    except:
        hits=np.nan
    
    new_row = [fanfic_id, kudos, bookmarks, comments, hits]
    user_engagement = user_engagement.append(pd.DataFrame([new_row], columns=user_engagement.columns))
    print(new_row)

    return user_engagement

In [26]:
def get_new_data(new_links, author_list):
    fanfic = pd.DataFrame(columns=['fanfic_id','link','title','author','published','language','summary', 'warning'])
    series = pd.DataFrame(columns=['fanfic_id', 'series_name'])
    collections = pd.DataFrame(columns=['fanfic_id', 'collection_name'])
    pairings = pd.DataFrame(columns=['fanfic_id', 'pairing_name'])
    characters = pd.DataFrame(columns=['fanfic_id', 'update_date', 'character'])
    relationships = pd.DataFrame(columns=['fanfic_id', 'update_date', 'relationship'])
    tags = pd.DataFrame(columns=['fanfic_id', 'update_date', 'tag'])
    updates = pd.DataFrame(columns=['fanfic_id','update_date','words', 'chapter', 'chapter_max','rating'])
    for link in new_links:
        try:
            newlink=link+'?view_adult=true'
            print(newlink)
            story = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        except requests.exceptions.RequestException:
            print(f"fanfic {link} is taking too long to access.")
            continue
        story = BeautifulSoup(story,'html.parser')
        fanfic_id = link.replace('https://archiveofourown.org/works/','')
        fanfic = get_fanfic_data(story, fanfic, fanfic_id, link)
        #print(fanfic)
        series = get_series(story, fanfic_id, series)
        collections = get_collections(story, fanfic_id, collections)
        pairings = get_pairings(story, fanfic_id, pairings)
        update_date = get_update_date(story)
        #print(update_date)
        characters = get_characters(story, fanfic_id, characters, update_date)
        relationships = get_relationships(story, fanfic_id, update_date, relationships)
        tags = get_tags(story, fanfic_id, update_date, tags)
        updates = get_updates(story, fanfic_id, update_date, updates)
        #print(updates)
        time.sleep(10)
    print(set(fanfic.author))
    print(len(author_list))
    author_missing = list(set(fanfic.author) - set(author_list.author))
    authors = pd.DataFrame(columns= ['author_id','author','joined_ao3'])
    fandoms = pd.DataFrame(columns=['author_id', 'fandom', 'num_fic'])
    if len(author_missing) != 0:
        authors['author'] = author_missing
        authors, fandoms = get_authors(author_list, authors, fandoms)
        print(len(authors))
    else:
        authors = author_list
    authors['author_id'] = authors['author_id'].astype('str')
    authors['author'] = authors['author'].astype('str')
    fanfic['author'] = fanfic['author'].astype('str')
    fanfic = fanfic.merge(authors[['author_id', 'author']], how='left', on='author')
    fanfic = fanfic.drop(columns=['author'])
    updates['update_date'] = pd.to_datetime(updates['update_date'])
    updates['date_scraped'] = updates['update_date'].max()
    conn = sqlite3.connect('output/ao3.db')
    fanfic.to_sql('fanfic', conn, if_exists='append', index=False)
    series.to_sql('series', conn, if_exists='append', index=False)
    collections.to_sql('collections', conn, if_exists='append', index=False)
    pairings.to_sql('pairings', conn, if_exists='append', index=False)
    characters.to_sql('characters', conn, if_exists='append', index=False)
    relationships.to_sql('relationships', conn, if_exists='append', index=False)
    tags.to_sql('tags', conn, if_exists='append', index=False)
    updates.to_sql('updates', conn, if_exists='append', index=False)
    authors.to_sql('authors', conn, if_exists='replace', index=False)
    fandoms.to_sql('fandom', conn, if_exists='append', index=False)
    conn.close()
    return

In [27]:
get_new_data(tv_new_links, author_list)

set()
728


In [28]:
get_new_data(book_new_links, author_list)

https://archiveofourown.org/works/48896122?view_adult=true


not in a collection
['48896122', '2023-07-27', '2,968', '1', '2', 'Explicit']
https://archiveofourown.org/works/48885256?view_adult=true
['48885256', 'https://archiveofourown.org/works/48885256', 'Gone', 'amature_cession22', '2023-07-26', 'English', 'Lucy is kidnapped by someone with... very bad intentions. The question is will Lockwood and George be able to find her in time ?\xa0ORA serial killer finds it easy to hide between the ghosts in a uncertain time.', 'Graphic Depictions Of Violence']
not in a collection
no pairing tags
['48885256', '2023-07-26', '884', '1', '5', 'General Audiences']
https://archiveofourown.org/works/48888385?view_adult=true
not in a series
['48888385', '2023-07-26', '3,413', '1', '1', 'Explicit']
https://archiveofourown.org/works/48873058?view_adult=true
not in a series
not in a collection
['48873058', '2023-07-26', '2,540', '1', '12', 'Teen And Up Audiences']
https://archiveofourown.org/works/48885415?view_adult=true
not in a series
not in a collection
['488

In [29]:
def get_up(update_links):
    updates = pd.DataFrame(columns=['fanfic_id','update_date','words', 'chapter', 'chapter_max','rating'])
    date_now = datetime.now()
    prior_date = date_now - timedelta(days=20)
    prior_date = prior_date.strftime("%m-%d-%Y")
    for link in update_links:
        try:    
            newlink=link+'?view_adult=true'
            story = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        except requests.exceptions.RequestException:
            print(f"fanfic {link} is taking too long to acces.")
            continue
        story = BeautifulSoup(story,'html.parser')
        fanfic_id = link.replace('https://archiveofourown.org/works/','')
        update_date = get_update_date(story)
        if update_date > prior_date:
            updates = get_updates(story, fanfic_id, update_date, updates)
        else:
            break
        time.sleep(10)
    updates['update_date'] = pd.to_datetime(updates['update_date'])
    updates['date_scraped'] = updates['update_date'].max()
    updates['date_scraped'] = updates['update_date'].astype(str)
    updates['update_date'] = updates['update_date'].astype(str)
    print(updates)
    conn = sqlite3.connect('output/ao3.db')
    querry = "SELECT * FROM updates"
    updates=updates.append(pd.read_sql_query(querry, conn), ignore_index=True)
    updates = updates.drop_duplicates()
    updates.to_sql('updates', conn, if_exists='append', index=False)
    conn.close()
    return

In [30]:
len(tv_update_links)

80

In [31]:
get_up(tv_update_links)
get_up(book_update_links)

['47292367', '2023-05-21', '1,379', '1', '1', 'Not Rated']
['47850358', '2023-06-19', '986', '1', '1', 'Teen And Up Audiences']
['45303043', '2023-06-06', '8,371', '4', '?', 'Teen And Up Audiences']
['47908165', '2023-06-23', '6,091', '1', '1', 'Mature']
['45452680', '2023-06-27', '7,638', '4', '?', 'Teen And Up Audiences']
['48248254', '2023-06-30', '3,991', '1', '1', 'Teen And Up Audiences']
['47400718', '2023-07-16', '25,832', '4', '7', 'Mature']
['48586786', '2023-07-14', '6,066', '1', '1', 'Teen And Up Audiences']
['47390227', '2023-05-24', '10,088', '1', '1', 'Explicit']
['47989714', '2023-07-05', '2,677', '5', '5', 'General Audiences']
['48115486', '2023-06-24', '350', '1', '1', 'General Audiences']
['48362479', '2023-07-05', '2,062', '1', '1', 'Teen And Up Audiences']
['47963587', '2023-06-18', '2,431', '1', '1', 'Teen And Up Audiences']
['48038953', '2023-06-21', '2,027', '1', '1', 'Teen And Up Audiences']
['48224551', '2023-07-21', '9,397', '13', '13', 'Teen And Up Audiences'

In [32]:
def update_user_engagement(update_links, author_list):
    user_engagement = pd.DataFrame(columns=['fanfic_id','kudos', 'bookmarks', 'comments', 'hits'])
    date_now = datetime.now()

    updates = pd.DataFrame(columns=['fanfic_id','update_date','words', 'chapter', 'chapter_max','rating'])
    date_now = datetime.now()
    prior_date = date_now - timedelta(days=20)
    prior_date = prior_date.strftime("%m-%d-%Y")

    print(f'Number to scrape {len(update_links)}')
    counter = 1

    authors_new = []

    for link in update_links:
        print(f'getting {counter}/{len(update_links)}')
        counter+=1

        try:
            newlink=link+'?view_adult=true'
            story = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        except requests.exceptions.RequestException:
            print(f"fanfic {link} is taking too long to access.")
            continue
        print(newlink)
        story = BeautifulSoup(story,'html.parser')
        fanfic_id = link.replace('https://archiveofourown.org/works/','')
        user_engagement = get_user_engagement(story, fanfic_id, user_engagement)
        update_date = get_update_date(story)
        updates = get_updates(story, fanfic_id, update_date, updates)
        time.sleep(10)
    date_now = date_now.strftime("%Y-%m-%d")
    user_engagement['date_scraped'] = date_now
    user_engagement['date_scraped'] = user_engagement['date_scraped'].astype(str)
  
    conn = sqlite3.connect('output/ao3.db')
    querry = "SELECT * FROM user_engagement"
    user_engagement=user_engagement.append(pd.read_sql_query(querry, conn), ignore_index=True)
    user_engagement = user_engagement.drop_duplicates()
    user_engagement.to_sql('user_engagement', conn, if_exists='replace', index=False)
    conn.close()
    
    updates['update_date'] = pd.to_datetime(updates['update_date'])
    updates['date_scraped'] = updates['update_date'].max()
    updates['date_scraped'] = updates['update_date'].astype(str)
    updates['update_date'] = updates['update_date'].astype(str)
    print(updates)
    conn = sqlite3.connect('output/ao3.db')
    querry = "SELECT * FROM updates"
    updates=updates.append(pd.read_sql_query(querry, conn), ignore_index=True)
    updates = updates.drop_duplicates()
    updates.to_sql('updates', conn, if_exists='append', index=False)
    conn.close()
    return

In [33]:
author_list

Unnamed: 0,author_id,author,joined_ao3
0,17235478,Chocy_Milk,2023-04-18
1,13655830,Squid52,2022-03-10
2,1760866,literaryelegance,2016-01-08
3,6999,Cornerofmadness,2010-05-17
4,14646558,Telvannicon,2022-07-15
...,...,...,...
723,6561073,SweeterEnd,2020-01-07
724,574299,WindsAroundATreeLikeAVine,2014-07-26
725,14696172,Writing_random_stuff,2022-07-20
726,1711910,Amazinggrace0,2015-12-22


In [36]:
conn = sqlite3.connect('output/ao3.db')
querry = "SELECT link FROM fanfic"
link_list = pd.read_sql_query(querry, conn)
conn.close()
update_user_engagement(link_list.link, author_list)

Number to scrape 2338
getting 1/2338
https://archiveofourown.org/works/44791411?view_adult=true
['44791411', '168', '15', '25', '673']
['44791411', '2023-02-04', '1,394', '1', '1', 'Teen And Up Audiences']
getting 2/2338
https://archiveofourown.org/works/47484628?view_adult=true
['47484628', '84', '1', '5', '746']
['47484628', '2023-05-28', '1,503', '1', '1', 'General Audiences']
getting 3/2338
https://archiveofourown.org/works/45235999?view_adult=true
['45235999', '103', '12', '4', '1,938']
['45235999', '2023-02-22', '3,286', '1', '1', 'General Audiences']
getting 4/2338
https://archiveofourown.org/works/45056584?view_adult=true
['45056584', '261', '24', '10', '2,370']
['45056584', '2023-02-15', '4,601', '1', '1', 'Teen And Up Audiences']
getting 5/2338
https://archiveofourown.org/works/47517634?view_adult=true
['47517634', '26', '2', '5', '405']
['47517634', '2023-06-19', '9,768', '4', '?', 'Teen And Up Audiences']
getting 6/2338
https://archiveofourown.org/works/46912513?view_adult=

In [37]:
conn = sqlite3.connect('output/ao3.db')
querry = "SELECT * FROM fanfic"
print(len(pd.read_sql_query(querry, conn)))
conn.close()

2338
