In [21]:
## Run Selenium with ChromeDriver to scrape work listings from archiveofourown.org
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Web scraper part 1

In [22]:

## Setup chrome options
chrome_options = Options()
# Pass the flag explicitly: the `Options.headless` setter was deprecated in
# Selenium 4.8 and dropped in later releases, where assigning it has no effect.
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--window-size=1920,1200")

# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Get page
pagenum=1
link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
browser.get(link)

# Get the total number of pages to scrape.
# NOTE(review): relies on the 13th <li> of the first <ol> holding the last page
# number (presumably the pagination bar) -- verify if the listing layout changes.
maxpagenum=int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())

def get_title(browser):
    """
    Extract link, title, author, update date and chapter count for each work on the current page.

    :param browser: Selenium WebDriver that has already loaded a works listing page
    :return: Pandas DataFrame with one row per work and the columns
             link, title, author, updatedate, chapters
    """
    # Find all the fanfic works on the page
    works = browser.find_elements(By.XPATH, '//ol[2]/li')

    data = []
    for work in works:
        heading_links = work.find_element(By.TAG_NAME, 'h4').find_elements(By.TAG_NAME, 'a')
        # The first anchor in the heading is the work itself; a second anchor,
        # when present, is the author. Works without one are labelled "Anonymous".
        title_link = heading_links[0]
        if len(heading_links) > 1:
            author = heading_links[1].text.strip()
        else:
            author = "Anonymous"
        data.append({
            'link': title_link.get_attribute("href"),
            'title': title_link.text.strip(),
            'author': author,
            'updatedate': work.find_element(By.CLASS_NAME, 'datetime').text.strip(),
            # Index 1: the value is read from the second element carrying the 'chapters' class.
            'chapters': work.find_elements(By.CLASS_NAME, 'chapters')[1].text.strip(),
        })
    return pd.DataFrame(data)

# Scrape the first page (already loaded above), then every remaining page.
# Per-page frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the data on every page.
page_frames = []
rows_scraped = 0
try:
    page_frames.append(get_title(browser))
    rows_scraped += len(page_frames[-1])
    print('page 1 has been processed')
    # Iterate through each remaining page and collect its data
    for pagenum in range(2,maxpagenum+1):
        time.sleep(10)  # pause between page requests
        print(f'procesing page {pagenum}/{maxpagenum}')
        link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
        browser.get(link)
        page_frames.append(get_title(browser))
        rows_scraped += len(page_frames[-1])
        print(rows_scraped)
finally:
    # Always shut the browser down, even when a page fails to scrape,
    # and keep whatever rows were collected up to that point.
    browser.quit()
    if page_frames:
        data_list = pd.concat(page_frames, ignore_index=True)


page 1 has been processed
procesing page 2/66
40
procesing page 3/66
60
procesing page 4/66
80
procesing page 5/66
100
procesing page 6/66
120
procesing page 7/66
140
procesing page 8/66
160
procesing page 9/66
180
procesing page 10/66
200
procesing page 11/66
220
procesing page 12/66
240
procesing page 13/66
260
procesing page 14/66
280
procesing page 15/66
300
procesing page 16/66
320
procesing page 17/66
340
procesing page 18/66
360
procesing page 19/66
380
procesing page 20/66
400
procesing page 21/66
420
procesing page 22/66
440
procesing page 23/66
460
procesing page 24/66
480
procesing page 25/66
500
procesing page 26/66
520
procesing page 27/66
540
procesing page 28/66
560
procesing page 29/66
580
procesing page 30/66
600
procesing page 31/66
620
procesing page 32/66
640
procesing page 33/66
660
procesing page 34/66
680
procesing page 35/66
700
procesing page 36/66
720
procesing page 37/66
740
procesing page 38/66
760
procesing page 39/66
780
procesing page 40/66
800
procesing 

In [23]:

# Break the "current/total" chapters string into two columns, then label each
# work 'completed' when both parts are equal and 'incomplete' otherwise.
data_list[['chapter','chapter_max']] = data_list['chapters'].str.split("/", expand=True)

def _completion_label(row):
    """Return 'completed' when the row's chapter equals its chapter_max, else 'incomplete'."""
    if row['chapter']==row['chapter_max']:
        return 'completed'
    return 'incomplete'

data_list['completion'] = data_list.apply(_completion_label, axis=1)

# Keep only the rows that still hold a blank or missing value in any column.
data_list1 = data_list.replace('', np.nan).loc[lambda frame: frame.isna().any(axis=1)]
def get_missing1(df):
    """
    Re-fetch title, author, publish date and chapters from each work's own page.

    :param df: Pandas DataFrame with a 'link' column of work URLs
    :return: Pandas DataFrame with the columns link, title, author, updatedate, chapters
    :raises requests.HTTPError: if a work page responds with an error status
    """
    data = []
    total = len(df['link'])
    for counter, x in enumerate(df['link'], start=1):
        # presumably skips the adult-content confirmation page -- TODO confirm
        newlink = x + '?view_adult=true'
        print(f"getting missing data {counter}/{total}")
        response = requests.get(
            newlink,
            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Fail loudly on an error page instead of parsing it as if it were a work.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('h2', attrs={'class':'title heading'}).get_text().replace('\n','').strip()
        # Works without an author link are labelled "Anonymous".
        author_tag = soup.find('a', attrs={'rel':'author'})
        author = author_tag.get_text() if author_tag is not None else "Anonymous"
        # NOTE(review): this stores the 'published' date under 'updatedate' --
        # confirm that is the intended date for multi-chapter works.
        datetime = soup.find('dd', attrs={'class':'published'}).get_text()
        chapters = soup.find('dd', attrs={'class':'chapters'}).get_text()

        data.append({
            'link': x,
            'title': title,
            'author': author,
            'updatedate': datetime,
            'chapters': chapters,
        })
        time.sleep(10)  # pause between requests
    return pd.DataFrame(data)
if len(data_list1)==0:
    # Nothing was missing: the listing scrape is already complete.
    part1=data_list
else:
    # Rows that were complete in the listing scrape
    data_list2 = data_list[~data_list['link'].isin(data_list1['link'])]
    # Rows re-fetched from the individual work pages
    data_list3=get_missing1(data_list1)
    # Split the chapter column into chapter and chapter_max, and create a completion column
    data_list3[['chapter','chapter_max']] = data_list3.chapters.str.split("/", expand=True)
    data_list3['completion'] = data_list3.apply(lambda row: 'completed' if row['chapter']==row['chapter_max'] else 'incomplete', axis=1)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    part1=pd.concat([data_list2, data_list3])
# Show any rows that still contain missing values (expected to be empty).
print(part1[part1.isna().any(axis=1)])

Empty DataFrame
Columns: [link, title, author, updatedate, chapters, chapter, chapter_max, completion]
Index: []


# Web scraper part 2

In [24]:
## Setup chrome options
chrome_options = Options()
# Pass the flag explicitly: the `Options.headless` setter was deprecated in
# Selenium 4.8 and dropped in later releases, where assigning it has no effect.
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--window-size=1920,1200")

# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Get page
pagenum=1
link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
browser.get(link)

# Get the total number of pages to scrape.
# NOTE(review): relies on the 13th <li> of the first <ol> holding the last page
# number (presumably the pagination bar) -- verify if the listing layout changes.
maxpagenum=int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())

def get_user(browser):
    """
    Extract language, word count and engagement stats for each work listed on a page.

    :param browser: Chrome Webdriver object
    :return: Pandas DataFrame with the columns
             link, language, words, kudos, comments, bookmarks, hits
    """
    def stat_text(work, class_name):
        # The value is read from the second element carrying the class;
        # raises IndexError when fewer than two such elements exist.
        return work.find_elements(By.CLASS_NAME, class_name)[1].text.strip()

    def optional_stat(work, class_name):
        # Stats that a work may not display fall back to 0.
        try:
            return stat_text(work, class_name)
        except IndexError:
            return 0

    # Find all the fanfic works on the page
    works = browser.find_elements(By.XPATH, '//ol[2]/li')

    data = []
    for work in works:
        heading_links = work.find_element(By.TAG_NAME, 'h4').find_elements(By.TAG_NAME, 'a')
        data.append({
            'link': heading_links[0].get_attribute("href"),
            # language and words are required: a missing one raises IndexError
            'language': stat_text(work, 'language'),
            'words': stat_text(work, 'words'),
            'kudos': optional_stat(work, 'kudos'),
            'comments': optional_stat(work, 'comments'),
            'bookmarks': optional_stat(work, 'bookmarks'),
            'hits': optional_stat(work, 'hits'),
        })

    return pd.DataFrame(data)

# Scrape the first page (already loaded above), then every remaining page.
# Per-page frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the data on every page.
page_frames = []
rows_scraped = 0
try:
    page_frames.append(get_user(browser))
    rows_scraped += len(page_frames[-1])
    print('page 1 processed')

    # Iterate through remaining pages
    for pagenum in range(2,maxpagenum+1):
        time.sleep(10)  # pause between page requests
        print(f'procesing page {pagenum}/{maxpagenum}')
        link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
        browser.get(link)
        page_frames.append(get_user(browser))
        rows_scraped += len(page_frames[-1])
        print(rows_scraped)
finally:
    # Always shut the browser down, even when a page fails to scrape,
    # and keep whatever rows were collected up to that point.
    browser.quit()
    if page_frames:
        data_list = pd.concat(page_frames, ignore_index=True)



page 1 processed
procesing page 2/66
40
procesing page 3/66
60
procesing page 4/66
80
procesing page 5/66
100
procesing page 6/66
120
procesing page 7/66
140
procesing page 8/66
160
procesing page 9/66
180
procesing page 10/66
200
procesing page 11/66
220
procesing page 12/66
240
procesing page 13/66
260
procesing page 14/66
280
procesing page 15/66
300
procesing page 16/66
320
procesing page 17/66
340
procesing page 18/66
360
procesing page 19/66
380
procesing page 20/66
400
procesing page 21/66
420
procesing page 22/66
440
procesing page 23/66
460
procesing page 24/66
480
procesing page 25/66
500
procesing page 26/66
520
procesing page 27/66
540
procesing page 28/66
560
procesing page 29/66
580
procesing page 30/66
600
procesing page 31/66
620
procesing page 32/66
640
procesing page 33/66
660
procesing page 34/66
680
procesing page 35/66
700
procesing page 36/66
720
procesing page 37/66
740
procesing page 38/66
760
procesing page 39/66
780
procesing page 40/66
800
procesing page 41/6

In [25]:
# Treat blank strings as missing, then separate the rows that have at least
# one missing value (data_list1) from the rows whose link is not among them (data_list2).
data_list1 = data_list.replace('', np.nan).loc[lambda frame: frame.isna().any(axis=1)]
data_list2 = data_list.loc[lambda frame: ~frame['link'].isin(data_list1['link'])]

def get_missing2(df):
    """
    Extracts data for works that had missing cells in the original DataFrame.

    Each work page is fetched directly and its language, word count and
    engagement stats are read from the page.

    :param df: Pandas DataFrame with a 'link' column of work URLs
    :return: Pandas DataFrame with the columns
             link, language, words, kudos, comments, bookmarks, hits
    :raises requests.HTTPError: if a work page responds with an error status
    """
    def stat_or_zero(soup, class_name):
        # Stats that a work page does not show fall back to 0.
        tag = soup.find('dd', attrs={'class': class_name})
        return tag.get_text() if tag is not None else 0

    data = []
    total = len(df['link'])
    for counter, x in enumerate(df['link'], start=1):
        # presumably skips the adult-content confirmation page -- TODO confirm
        newlink = x + '?view_adult=true'
        print(f"getting missing data {counter}/{total}")
        response = requests.get(
            newlink,
            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Fail loudly on an error page instead of parsing it as if it were a work.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        data.append({
            'link': x,
            # language and words are required: a page without them raises AttributeError
            'language': soup.find('dd', attrs={'class':'language'}).get_text().replace('\n','').strip(),
            'words': soup.find('dd', attrs={'class':'words'}).get_text(),
            'kudos': stat_or_zero(soup, 'kudos'),
            'comments': stat_or_zero(soup, 'comments'),
            'bookmarks': stat_or_zero(soup, 'bookmarks'),
            'hits': stat_or_zero(soup, 'hits'),
        })
        time.sleep(10)  # pause between requests
    return pd.DataFrame(data)
if len(data_list1)==0:
    # Nothing was missing: the listing scrape is already complete.
    part2 = data_list
else:
    data_list3=get_missing2(data_list1)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    part2=pd.concat([data_list2, data_list3])
# Show any rows that still contain missing values.
print(part2[part2.isna().any(axis=1)])

getting missing data 1/249
getting missing data 2/249
getting missing data 3/249
getting missing data 4/249
getting missing data 5/249
getting missing data 6/249
getting missing data 7/249
getting missing data 8/249
getting missing data 9/249
getting missing data 10/249
getting missing data 11/249
getting missing data 12/249
getting missing data 13/249
getting missing data 14/249
getting missing data 15/249
getting missing data 16/249
getting missing data 17/249
getting missing data 18/249
getting missing data 19/249
getting missing data 20/249
getting missing data 21/249
getting missing data 22/249
getting missing data 23/249
getting missing data 24/249
getting missing data 25/249
getting missing data 26/249
getting missing data 27/249
getting missing data 28/249
getting missing data 29/249
getting missing data 30/249
getting missing data 31/249
getting missing data 32/249
getting missing data 33/249
getting missing data 34/249
getting missing data 35/249
getting missing data 36/249
g

# Web scraper part 3

In [27]:
## Setup chrome options
chrome_options = Options()
# Pass the flag explicitly: the `Options.headless` setter was deprecated in
# Selenium 4.8 and dropped in later releases, where assigning it has no effect.
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--window-size=1920,1200")

# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Get page
pagenum=1
link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
browser.get(link)

# Get the total number of pages to scrape.
# NOTE(review): relies on the 13th <li> of the first <ol> holding the last page
# number (presumably the pagination bar) -- verify if the listing layout changes.
maxpagenum=int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())

def get_text(browser):
    """
    Extract warning, relationships, characters, tags and summary for each work listed on a page.

    :param browser: Chrome Webdriver object
    :return: Pandas DataFrame with the columns
             link, warning, mainship, relationship, characters, tags, summary.
             mainship/relationship are NaN when a work lists no relationship,
             and summary is NaN when a work has no summary block.
    """
    def texts(elements):
        # Stripped visible text of every element, in page order.
        return [element.text.strip() for element in elements]

    # Find all the fanfic works on the page
    works = browser.find_elements(By.XPATH, '//ol[2]/li')

    data = []
    for work in works:
        heading_links = work.find_element(By.TAG_NAME, 'h4').find_elements(By.TAG_NAME, 'a')
        links = heading_links[0].get_attribute("href")
        warning = work.find_element(By.TAG_NAME, 'strong').text.strip()

        # find_elements returns an empty list instead of raising, so "no
        # relationship listed" is detected without catching exceptions.
        relationship_list = texts(work.find_elements(By.CLASS_NAME, 'relationships'))
        if relationship_list:
            mainrelationship = relationship_list[0]  # first relationship listed
        else:
            mainrelationship = np.nan
            relationship_list = np.nan

        character_list = texts(work.find_elements(By.CLASS_NAME, 'characters'))
        tags_list = texts(work.find_elements(By.CLASS_NAME, 'freeforms'))

        # A work without a summary block gets NaN rather than aborting the whole page.
        summary_blocks = work.find_elements(By.TAG_NAME, 'blockquote')
        if summary_blocks:
            summary = summary_blocks[0].text.replace('\n',' ').strip()
        else:
            summary = np.nan

        data.append({
            'link': links,
            'warning': warning,
            'mainship': mainrelationship,
            'relationship': relationship_list,
            'characters': character_list,
            'tags': tags_list,
            'summary': summary,
        })
    return pd.DataFrame(data)

# Scrape the first page (already loaded above), then every remaining page.
# Per-page frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the data on every page.
page_frames = []
rows_scraped = 0
try:
    page_frames.append(get_text(browser))
    rows_scraped += len(page_frames[-1])
    print('page 1 processed')
    # Iterate through remaining pages
    for pagenum in range(2,maxpagenum+1):
        time.sleep(10)  # pause between page requests
        print(f'procesing page {pagenum}/{maxpagenum}')
        link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
        browser.get(link)
        page_frames.append(get_text(browser))
        rows_scraped += len(page_frames[-1])
        print(rows_scraped)
finally:
    # Always shut the browser down, even when a page fails to scrape,
    # and keep whatever rows were collected up to that point.
    browser.quit()
    if page_frames:
        data_list = pd.concat(page_frames, ignore_index=True)



page 1 processed
procesing page 2/66
40
procesing page 3/66
60
procesing page 4/66
80
procesing page 5/66
100
procesing page 6/66
120
procesing page 7/66
140
procesing page 8/66
160
procesing page 9/66
180
procesing page 10/66
200
procesing page 11/66
220
procesing page 12/66
240
procesing page 13/66
260
procesing page 14/66
280
procesing page 15/66
300
procesing page 16/66
320
procesing page 17/66
340
procesing page 18/66
360
procesing page 19/66
380
procesing page 20/66
400
procesing page 21/66
420
procesing page 22/66
440
procesing page 23/66
460
procesing page 24/66
480
procesing page 25/66
500
procesing page 26/66
520
procesing page 27/66
540
procesing page 28/66
560
procesing page 29/66
580
procesing page 30/66
600
procesing page 31/66
620
procesing page 32/66
640
procesing page 33/66
660
procesing page 34/66
680
procesing page 35/66
700
procesing page 36/66
720
procesing page 37/66
740
procesing page 38/66
760
procesing page 39/66
780
procesing page 40/66
800
procesing page 41/6

In [30]:
# Treat blank strings as missing, then separate the rows that have at least
# one missing value (data_list1) from the rows whose link is not among them (data_list2).
data_list1 = data_list.replace('', np.nan).loc[lambda frame: frame.isna().any(axis=1)]

data_list2 = data_list.loc[lambda frame: ~frame['link'].isin(data_list1['link'])]

def get_missing3(df):
    """
    Re-fetch warning, relationships, characters, tags and summary from each work's own page.

    :param df: Pandas DataFrame with a 'link' column of work URLs
    :return: Pandas DataFrame with the columns
             link, warning, mainship, relationship, characters, tags, summary.
             Missing tag sections yield an empty list (and mainship 'None');
             a missing summary yields NaN.
    :raises requests.HTTPError: if a work page responds with an error status
    """
    def tag_texts(soup, dd_class):
        # Stripped text of every tag link inside the given <dd>; [] when the <dd> is absent.
        section = soup.find('dd', attrs={'class': dd_class})
        if section is None:
            return []
        return [tag.get_text().strip() for tag in section.find_all('a', attrs={'class':'tag'})]

    data = []
    total = len(df['link'])
    for counter, x in enumerate(df['link'], start=1):
        # presumably skips the adult-content confirmation page -- TODO confirm
        newlink = x + '?view_adult=true'
        print(f"getting missing data {counter}/{total}")
        print(newlink)
        response = requests.get(
            newlink,
            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Fail loudly on an error page instead of parsing it as if it were a work.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # The warning is required: a page without it raises AttributeError.
        warning = soup.find('dd', attrs={'class':'warning tags'}).get_text().replace('\n','').strip()

        relationship_list = tag_texts(soup, 'relationship tags')
        # First relationship listed, or the string 'None' when there is none.
        mainrelationship = relationship_list[0] if relationship_list else 'None'
        character_list = tag_texts(soup, 'character tags')
        tags_list = tag_texts(soup, 'freeform tags')

        summary_block = soup.find('div', attrs={'class':'summary module'})
        if summary_block is None:
            summary = np.nan
        else:
            summary = summary_block.get_text().replace('\n', ' ').replace('Summary:','').strip()

        data.append({
            'link': x,
            'warning': warning,
            'mainship': mainrelationship,
            'relationship': relationship_list,
            'characters': character_list,
            'tags': tags_list,
            'summary': summary,
        })
        time.sleep(10)  # pause between requests
    return pd.DataFrame(data)
if len(data_list1)==0:
    # Nothing was missing: the listing scrape is already complete.
    # This must populate part3; assigning part2 here would overwrite the
    # part-2 table and leave part3 undefined for the print and merge below.
    part3 = data_list
else:
    data_list3=get_missing3(data_list1)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    part3=pd.concat([data_list2, data_list3])
# Show any rows that still contain missing values.
print(part3[part3.isna().any(axis=1)])

getting missing data 1/866
https://archiveofourown.org/works/46616701?view_adult=true
getting missing data 2/866
https://archiveofourown.org/works/46545298?view_adult=true
getting missing data 3/866
https://archiveofourown.org/works/46633966?view_adult=true
getting missing data 4/866
https://archiveofourown.org/works/46632634?view_adult=true
getting missing data 5/866
https://archiveofourown.org/works/46234456?view_adult=true
getting missing data 6/866
https://archiveofourown.org/works/46568050?view_adult=true
getting missing data 7/866
https://archiveofourown.org/works/46546858?view_adult=true
getting missing data 8/866
https://archiveofourown.org/works/46588315?view_adult=true
getting missing data 9/866
https://archiveofourown.org/works/46626169?view_adult=true
getting missing data 10/866
https://archiveofourown.org/works/45308677?view_adult=true
getting missing data 11/866
https://archiveofourown.org/works/46616434?view_adult=true
getting missing data 12/866
https://archiveofourown.

In [32]:
# Outer-join the three partial tables on the work link so that a work present
# in any of them is kept, then write the combined table to CSV.
final = (
    part1
    .merge(part2, how='outer', on='link')
    .merge(part3, how='outer', on='link')
)
final.to_csv('ao3_lockwood_and_co.csv', index=False)

In [None]:
# Fetch the rating of every work from its own page and add it as a 'rating' column.
rating=[]
total = len(final['link'])
# Index by the column name 'link'; the bare name `link` is the last URL string
# left over from the page loops, not a column.
for counter, x in enumerate(final['link'], start=1):
    # presumably skips the adult-content confirmation page -- TODO confirm
    newlink=x+'?view_adult=true'
    print(f"getting missing data {counter}/{total}")
    print(newlink)
    response = requests.get(
        newlink,
        headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
        timeout=30,  # do not hang forever on a stalled connection
    )
    # Fail loudly on an error page instead of parsing it as if it were a work.
    response.raise_for_status()
    soup = BeautifulSoup(response.text,'html.parser')
    rating_tag=soup.find('dd', attrs={'class':'rating tags'}).get_text().replace('\n','').strip()
    rating.append(rating_tag)
    time.sleep(10)  # pause between requests

# Assign once, after the loop, when the list has one entry per row.
final['rating'] = rating
final.to_csv('ao3_lockwood_and_co.csv', index=False)