In [545]:
## Run selenium and chrome driver to scrape data from cloudbytes.dev
import time
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [507]:
## Setup chrome options
chrome_options = Options()
chrome_options.headless = True # Ensure GUI is off
chrome_options.add_argument("--window-size=1920,1200")

In [508]:
# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/ao3lockwood-co/chromedriver")

In [509]:
# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

In [510]:
# Get page
pagenum=1
link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
browser.get(link)
maxpagenum=int(browser.find_element(By.XPATH,'//ol[1]/li[13]').text.strip())

In [511]:
def get_links(browser):
    # Find all the fanfic works on the page
    works = browser.find_elements(By.XPATH, '//ol[2]/li')
    # Iterate through each work and extract author and datetime
    data=[]
    for work in works:
        h4 = work.find_element(By.TAG_NAME,'h4')
        a = h4.find_elements(By.TAG_NAME, 'a')
        title = a[0].text.strip()
        author = a[1].text.strip()
        datetime = work.find_element(By.CLASS_NAME, 'datetime').text.strip()
        warning=work.find_element(By.TAG_NAME,'strong').text.strip()
        relationship_list = []
        relationship = work.find_elements(By.CLASS_NAME,'relationships')
        for r in relationship:
            relationship_list.append(r.text.strip())
        character_list = []
        characters = work.find_elements(By.CLASS_NAME, 'characters')
        for char in characters:
            character_list.append(char.text.strip())
        tags_list = []
        tags = work.find_elements(By.CLASS_NAME,'freeforms')
        for tag in tags:
            tags_list.append(tag.text.strip())
        summary = work.find_element(By.TAG_NAME, 'blockquote').text.strip()
        language = work.find_elements(By.CLASS_NAME,'language')[1].text.strip()
        words = work.find_elements(By.CLASS_NAME,'words')[1].text.strip()
        chapters = work.find_elements(By.CLASS_NAME,'chapters')[1].text.strip()
        try:
            kudos = work.find_elements(By.CLASS_NAME,'kudos')[1].text.strip()
        except:
            kudos = 0
        try:
            hits = work.find_elements(By.CLASS_NAME,'hits')[1].text.strip()
        except:
            hits = 0
        try:
            comments = work.find_elements(By.CLASS_NAME,'comments')[1].text.strip()
        except:
            comments = 0
        try:
            bookmarks = work.find_elements(By.CLASS_NAME,'bookmarks')[1].text.strip()
        except:
            bookmarks = 0
        links = a[0].get_attribute("href")
        row = {'link':links, 'title':title, 'author':author, 'updatedate':datetime, 'warning':warning,
                     'relationship':relationship_list, 'characters':character_list, 'tags':tags_list, 'summary':summary,
                     'language':language, 'words':words,'chapters':chapters, 'hits':hits, 'kudos':kudos, 'comments':comments,
                     'bookmarks':bookmarks}
        #print(row)
        data.append(row)
    return data
    

In [512]:
def get_title(browser):
    # Find all the fanfic works on the page
    works = browser.find_elements(By.XPATH, '//ol[2]/li')
    # Iterate through each work and extract author and datetime
    data=[]
    for work in works:
        WebDriverWait(browser, 10)
        h4 = work.find_element(By.TAG_NAME,'h4')
        a = h4.find_elements(By.TAG_NAME, 'a')
        links = a[0].get_attribute("href")
        language = work.find_elements(By.CLASS_NAME,'language')[1].text.strip()
        words = work.find_elements(By.CLASS_NAME,'words')[1].text.strip()
        try:
            kudos = work.find_elements(By.CLASS_NAME,'kudos')[1].text.strip()
        except:
            kudos = 0
        try:
            hits = work.find_elements(By.CLASS_NAME,'hits')[1].text.strip()
        except:
            hits = 0
        try:
            comments = work.find_elements(By.CLASS_NAME,'comments')[1].text.strip()
        except:
            comments = 0
        try:
            bookmarks = work.find_elements(By.CLASS_NAME,'bookmarks')[1].text.strip()
        except:
            bookmarks = 0
        row = {'link':links, 
               'language':language,
               'words':words,
               'kudos':kudos, 
               'comments':comments,
               'bookmarks':bookmarks,
               'hits':hits
               }
        #print(row)
        data.append(row)
    return pd.DataFrame(data)

In [513]:
data_list=get_title(browser)
print(data_list)

                                          link language    words kudos  \
0   https://archiveofourown.org/works/45308677  English    4,255   392   
1   https://archiveofourown.org/works/46519282  English   17,318   135   
2   https://archiveofourown.org/works/46616701  English    1,626    21   
3   https://archiveofourown.org/works/46616434  English    6,948    53   
4   https://archiveofourown.org/works/46616116  English      234     0   
5   https://archiveofourown.org/works/46481659  English   17,463   146   
6   https://archiveofourown.org/works/46608505  English    2,489    49   
7   https://archiveofourown.org/works/45793852  English  105,297   140   
8   https://archiveofourown.org/works/46612807  English    1,362    79   
9   https://archiveofourown.org/works/46612975  English    1,839    13   
10  https://archiveofourown.org/works/46612264  English    2,218    31   
11  https://archiveofourown.org/works/46469533  English    5,184   100   
12  https://archiveofourown.org/works/

In [514]:

# for page 2 onwards:
for p in range(2,maxpagenum+1):
    pagenum=p
    time.sleep(10)
    print(f'procesing page {pagenum}')
    link="https://archiveofourown.org/tags/Lockwood%20*a*%20Co*d*%20-%20Jonathan%20Stroud/works?page="+str(pagenum)
    browser.get(link)
    data_list=data_list.append(get_title(browser), ignore_index=True)
    print(data_list)


procesing page 2
                                          link language    words kudos  \
0   https://archiveofourown.org/works/45308677  English    4,255   392   
1   https://archiveofourown.org/works/46519282  English   17,318   135   
2   https://archiveofourown.org/works/46616701  English    1,626    21   
3   https://archiveofourown.org/works/46616434  English    6,948    53   
4   https://archiveofourown.org/works/46616116  English      234     0   
5   https://archiveofourown.org/works/46481659  English   17,463   146   
6   https://archiveofourown.org/works/46608505  English    2,489    49   
7   https://archiveofourown.org/works/45793852  English  105,297   140   
8   https://archiveofourown.org/works/46612807  English    1,362    79   
9   https://archiveofourown.org/works/46612975  English    1,839    13   
10  https://archiveofourown.org/works/46612264  English    2,218    31   
11  https://archiveofourown.org/works/46469533  English    5,184   100   
12  https://archiveof

In [515]:
data_list

Unnamed: 0,link,language,words,kudos,comments,bookmarks,hits
0,https://archiveofourown.org/works/45308677,English,4255,392,122,25,5544
1,https://archiveofourown.org/works/46519282,English,17318,135,85,16,1808
2,https://archiveofourown.org/works/46616701,English,1626,21,10,3,244
3,https://archiveofourown.org/works/46616434,English,6948,53,5,6,335
4,https://archiveofourown.org/works/46616116,English,234,0,0,0,29
...,...,...,...,...,...,...,...
1308,https://archiveofourown.org/works/3709479,English,4570,18,1,0,300
1309,https://archiveofourown.org/works/2282274,English,10026,20,6,2,486
1310,https://archiveofourown.org/works/1267453,English,7561,67,4,5,1249
1311,https://archiveofourown.org/works/1169828,English,1102,109,9,5,1663


In [516]:
#Wait for 10 seconds
time.sleep(10)
browser.quit()

In [543]:
data_list1=data_list.replace('', np.nan)
data_list1=data_list1[data_list1.isna().any(axis=1)]

In [555]:
data_list1

Unnamed: 0,link,language,words,kudos,comments,bookmarks,hits
12,https://archiveofourown.org/works/45743218,English,5238,105,28,,958
13,https://archiveofourown.org/works/46450045,,,,,,
14,https://archiveofourown.org/works/46609009,,,,,0,
15,https://archiveofourown.org/works/46227607,,,,,,
16,https://archiveofourown.org/works/46606960,,,,,0,
...,...,...,...,...,...,...,...
1277,https://archiveofourown.org/works/8107966,,,,,,
1278,https://archiveofourown.org/works/7898905,,,,,0,
1279,https://archiveofourown.org/works/7849768,,,,,,
1298,https://archiveofourown.org/works/5443742,English,9196,45,,,


In [544]:
data_list2 = data_list[~data_list['link'].isin(data_list1['link'])]
data_list2


Unnamed: 0,link,language,words,kudos,comments,bookmarks,hits
0,https://archiveofourown.org/works/45308677,English,4255,392,122,25,5544
1,https://archiveofourown.org/works/46519282,English,17318,135,85,16,1808
2,https://archiveofourown.org/works/46616701,English,1626,21,10,3,244
3,https://archiveofourown.org/works/46616434,English,6948,53,5,6,335
4,https://archiveofourown.org/works/46616116,English,234,0,0,0,29
...,...,...,...,...,...,...,...
1308,https://archiveofourown.org/works/3709479,English,4570,18,1,0,300
1309,https://archiveofourown.org/works/2282274,English,10026,20,6,2,486
1310,https://archiveofourown.org/works/1267453,English,7561,67,4,5,1249
1311,https://archiveofourown.org/works/1169828,English,1102,109,9,5,1663


In [563]:
def get_missing(df):
    data=[]
    for x in df['link']:
        newlink=x+'?view_adult=true'
        print(newlink)
        source = requests.get(newlink, headers={
                          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}).text
        soup = BeautifulSoup(source,'html.parser')
        language=soup.find('dd', attrs={'class':'language'}).get_text().replace('\n','').strip()
        words=soup.find('dd', attrs={'class':'words'}).get_text()
        try:
            kudos=soup.find('dd', attrs={'class':'kudos'}).get_text()
        except:
            kudos=0
        try:
            comments=soup.find('dd', attrs={'class':'comments'}).get_text()
        except:
            comments=0
        try:
            bookmarks=soup.find('dd', attrs={'class':'bookmarks'}).get_text()
        except:
            bookmarks=0
        try:
            hits=soup.find('dd', attrs={'class':'hits'}).get_text()
        except:
            hits=0
        row = {'link':x, 
               'language':language,
               'words':words,
               'kudos':kudos, 
               'comments':comments,
               'bookmarks':bookmarks,
               'hits':hits
               }
        print(row)
        data.append(row)
        time.sleep(10)
    return pd.DataFrame(data)

In [564]:
data_list3=get_missing(data_list1)

https://archiveofourown.org/works/45743218?view_adult=true
{'link': 'https://archiveofourown.org/works/45743218', 'language': 'English', 'words': '5,238', 'kudos': '105', 'comments': '29', 'bookmarks': '8', 'hits': '964'}
https://archiveofourown.org/works/46450045?view_adult=true
{'link': 'https://archiveofourown.org/works/46450045', 'language': 'English', 'words': '9,559', 'kudos': '188', 'comments': '73', 'bookmarks': '14', 'hits': '1,965'}
https://archiveofourown.org/works/46609009?view_adult=true
{'link': 'https://archiveofourown.org/works/46609009', 'language': 'English', 'words': '2,663', 'kudos': '18', 'comments': '3', 'bookmarks': 0, 'hits': '310'}
https://archiveofourown.org/works/46227607?view_adult=true
{'link': 'https://archiveofourown.org/works/46227607', 'language': 'English', 'words': '9,564', 'kudos': '179', 'comments': '50', 'bookmarks': '16', 'hits': '1,856'}
https://archiveofourown.org/works/46606960?view_adult=true
{'link': 'https://archiveofourown.org/works/4660696

In [572]:
data_list3=data_list3.replace('', np.nan)
print(data_list3)

                                           link language  words kudos  \
0    https://archiveofourown.org/works/45743218  English  5,238   105   
1    https://archiveofourown.org/works/46450045  English  9,559   188   
2    https://archiveofourown.org/works/46609009  English  2,663    18   
3    https://archiveofourown.org/works/46227607  English  9,564   179   
4    https://archiveofourown.org/works/46606960  English  1,703    59   
..                                          ...      ...    ...   ...   
227   https://archiveofourown.org/works/8107966  English  1,882    91   
228   https://archiveofourown.org/works/7898905  English  1,665    19   
229   https://archiveofourown.org/works/7849768  English  3,917    37   
230   https://archiveofourown.org/works/5443742  English  9,196    45   
231   https://archiveofourown.org/works/4858055  English  4,604   624   

    comments bookmarks   hits  
0         29         8    964  
1         73        14  1,965  
2          3         0    3

In [567]:
data_list2.append(data_list3)

Unnamed: 0,link,language,words,kudos,comments,bookmarks,hits
0,https://archiveofourown.org/works/45308677,English,4255,392,122,25,5544
1,https://archiveofourown.org/works/46519282,English,17318,135,85,16,1808
2,https://archiveofourown.org/works/46616701,English,1626,21,10,3,244
3,https://archiveofourown.org/works/46616434,English,6948,53,5,6,335
4,https://archiveofourown.org/works/46616116,English,234,0,0,0,29
...,...,...,...,...,...,...,...
227,https://archiveofourown.org/works/8107966,English,1882,91,12,8,764
228,https://archiveofourown.org/works/7898905,English,1665,19,4,0,247
229,https://archiveofourown.org/works/7849768,English,3917,37,13,5,403
230,https://archiveofourown.org/works/5443742,English,9196,45,11,5,809


In [568]:
data_listfinal=data_list2.append(data_list3)
data_listfinal

Unnamed: 0,link,language,words,kudos,comments,bookmarks,hits
0,https://archiveofourown.org/works/45308677,English,4255,392,122,25,5544
1,https://archiveofourown.org/works/46519282,English,17318,135,85,16,1808
2,https://archiveofourown.org/works/46616701,English,1626,21,10,3,244
3,https://archiveofourown.org/works/46616434,English,6948,53,5,6,335
4,https://archiveofourown.org/works/46616116,English,234,0,0,0,29
...,...,...,...,...,...,...,...
227,https://archiveofourown.org/works/8107966,English,1882,91,12,8,764
228,https://archiveofourown.org/works/7898905,English,1665,19,4,0,247
229,https://archiveofourown.org/works/7849768,English,3917,37,13,5,403
230,https://archiveofourown.org/works/5443742,English,9196,45,11,5,809


In [569]:
data_listfinal[data_listfinal.isna().any(axis=1)]

Unnamed: 0,link,language,words,kudos,comments,bookmarks,hits


In [573]:
data_listfinal.to_csv('lockwood_part2.csv', index=False)