<a href="https://colab.research.google.com/github/arsilva02/librarianproject/blob/main/WebScrapingAndFeatureGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering within the DF

In [154]:
import pandas as pd
df = pd.read_csv('data/storygraphExport.csv')

In [3]:
#lets look at the contributers section
df[~df['Contributors'].isna()]['Contributors']

Unnamed: 0,Contributors
12,Neil Smith (Translator)
15,Frances Riddle (Translator)
17,Lucy Scott (Translator)
20,Jeffrey Angles (Translator)
22,"Stephanie Ortega (Translator), Sophie Haydock ..."
...,...
700,"William Hutson (Contributor), Daveed Diggs (Co..."
712,Cassandra Medcalf (Narrator)
713,Gregory Rabassa (Translator)
714,Carol Brown Janeway


In [164]:
#if a "translator" is found in the contributor section, i will mark it as translated
def translation(df):
    df['Translated'] = df['Contributors'].str.contains(r'\(Translator\)', na=False)
    return df

df = translation(df)

Unnamed: 0,Title,Authors,Contributors,ISBN/UID,Format,Read Status,Date Added,Last Date Read,Dates Read,Read Count,...,Review,Content Warnings,Content Warning Description,Tags,Owned?,pages,pub_year,tags,series,Translated
0,You've Lost a Lot of Blood,Eric LaRocca,,9781088025758,paperback,to-read,2024/05/06,,,0.0,...,,,,,No,209.0,2022.0,"[fiction, horror, lgbtqia+, challenging, dark,...",False,False
1,Long After We Are Gone,Terah Shelton Harris,,9781728265773,paperback,to-read,2024/09/06,,,0.0,...,,,,,Yes,412.0,2024.0,"[fiction, literary, emotional, reflective, sad...",False,False
2,Shuggie Bain,Douglas Stuart,,9780802148049,hardcover,to-read,2024/08/27,,,0.0,...,,,,,No,448.0,2020.0,"[fiction, historical, lgbtqia+, literary, dark...",False,False
3,Helpmeet,Naben Ruthnum,,9781988964386,paperback,to-read,2024/10/01,,,0.0,...,,,,,No,89.0,2022.0,"[fiction, horror, dark, mysterious, fast-paced]",False,False
4,Angelfall,Susan Ee,,9780761463276,paperback,to-read,2024/04/01,,,0.0,...,,,,,No,284.0,2011.0,"[fiction, fantasy, young adult, adventurous, d...",True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,Eleanore of Avignon,Elizabeth DeLozier,,9780593475034,hardcover,to-read,2024/09/23,,,0.0,...,,,,,No,320.0,2024.0,"[fiction, historical, emotional, sad, tense, m...",False,False
724,The Crane Husband,Kelly Barnhill,,9781250850973,hardcover,to-read,2024/09/05,,,0.0,...,,,,,No,128.0,2023.0,"[fiction, fantasy, horror, dark, emotional, my...",False,False
725,Artificial Intelligence: An Illustrated Histor...,Clifford A. Pickover,,9781454955788,paperback,,,,,,...,,,,,Yes,240.0,2019.0,"[nonfiction, history, science, informative, me...",False,False
726,The Road,Cormac McCarthy,,9780307265432,hardcover,,,,,,...,,,,,Yes,241.0,2006.0,"[fiction, dystopian, literary, dark, emotional...",False,False


# StoryGraph Web Scraper

In [156]:
import requests
from bs4 import BeautifulSoup
import math
import time
import re
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'



In [157]:
def scraper(df):
    master = 'app.thestorygraph.com'
    search_master = '/browse?search_term='
    results = []  # Store results for each row

    for index, row in df.iterrows():
        title = row['Title']
        author = row['Authors'].replace(", ", " ")
        isbn = row['ISBN/UID']
        data = {}

        if pd.isna(isbn):
            search = f"{title.replace(' ', '%20')}%20{author.replace(' ', '%20')}"
        else:
            search = isbn

        search_url = f"https://{master}{search_master}{search}"
        print(title)

        try:
            response = requests.get(search_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            href = soup.find("a", href=re.compile(r"/books/"))["href"]

            book_url = f"https://{master}{href}"
            response = requests.get(book_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            text = soup.find(class_='text-sm font-light text-darkestGrey dark:text-grey mt-1').text.strip() #changed to match working example

            try:
                pages_match = re.search(r"(\d+)\s+pages", text)
                year_match = re.search(r"first pub (\d{4})", text)

                pages = int(pages_match.group(1)) if pages_match else float('nan')
                pub_year = int(year_match.group(1)) if year_match else float('nan')

                data['pages'] = pages
                data['pub_year'] = pub_year

            except (AttributeError, ValueError):
                data['pages'], data['pub_year'] = float('nan'), float('nan')
            except Exception as e:
                print(f"An unexpected error occurred (pages/year): {e}")
                data['pages'], data['pub_year'] = float('nan'), float('nan')

            try:
                tags = soup.find(class_="mt-2 my-1 md:w-10/12").text.strip().split('\n')
                data['tags'] = tags
            except AttributeError:
                data['tags'] = []
            except Exception as e:
                print(f"An unexpected error occurred (tags): {e}")
                data['tags'] = []
            series_element = soup.find(class_='font-semibold hover:text-cyan-700 dark:hover:text-cyan-500 text-lg')
            if series_element:
              series = True
            else:
              series = False
            data['series'] = series

        except (requests.exceptions.RequestException, AttributeError, TypeError, KeyError) as e:
            print(f"Error processing {search_url}: {e}")
            data['pages'], data['pub_year'], data['tags'] = float('nan'), float('nan'), []
        except Exception as e:
            print(f"An unexpected error occurred(general): {e}")
            data['pages'], data['pub_year'], data['tags'] = float('nan'), float('nan'), []

        results.append(data)

    # Create new columns from the results
    df['pages'] = [result.get('pages', float('nan')) for result in results]
    df['pub_year'] = [result.get('pub_year', float('nan')) for result in results]
    df['tags'] = [result.get('tags', []) for result in results]
    df['series'] = [result.get('series', False) for result in results]

    return df

scraper(df)

You've Lost a Lot of Blood
Long After We Are Gone
Shuggie Bain
Helpmeet
Angelfall
Come as You Are: The Surprising New Science That Will Transform Your Sex Life
She Gets the Girl
Certainty
The Sun Also Rises
The End We Start from
Adelaide
Fluids
Anxious People
Fahrenheit 451
The Nightingale
Time of the Flies
At Certain Points We Touch
On a Woman's Madness
I Am Legend
Knee-Deep in Cinders
Killing Kanoko: Selected Poems of Hiromi Ito
Big Swiss
Red House Alley
All Systems Red
The Dangers of Smoking in Bed
Blood Meridian, or The Evening Redness in the West
Commonwealth
New Mistakes
Eugene Onegin
Redwood Court
Orbital
FEVER
Mayflies
All Friends Are Necessary
A Manual for How to Love Us
Notes on Her Color
Women Who Run with the Wolves: Myths and Stories of the Wild Woman Archetype
Atonement
The Survivor Stands 
Severance
The Way of Kings
Within Arm's Reach
The Sorrows of Others
Model Home
A Manual for Cleaning Women 
Fingersmith
Cicada Summer
Record of a Night Too Brief
Mistress of Lies
I Sex

Unnamed: 0,Title,Authors,Contributors,ISBN/UID,Format,Read Status,Date Added,Last Date Read,Dates Read,Read Count,...,Star Rating,Review,Content Warnings,Content Warning Description,Tags,Owned?,pages,pub_year,tags,series
0,You've Lost a Lot of Blood,Eric LaRocca,,9781088025758,paperback,to-read,2024/05/06,,,0.0,...,,,,,,No,209.0,2022.0,"[fiction, horror, lgbtqia+, challenging, dark,...",False
1,Long After We Are Gone,Terah Shelton Harris,,9781728265773,paperback,to-read,2024/09/06,,,0.0,...,,,,,,Yes,412.0,2024.0,"[fiction, literary, emotional, reflective, sad...",False
2,Shuggie Bain,Douglas Stuart,,9780802148049,hardcover,to-read,2024/08/27,,,0.0,...,,,,,,No,448.0,2020.0,"[fiction, historical, lgbtqia+, literary, dark...",False
3,Helpmeet,Naben Ruthnum,,9781988964386,paperback,to-read,2024/10/01,,,0.0,...,,,,,,No,89.0,2022.0,"[fiction, horror, dark, mysterious, fast-paced]",False
4,Angelfall,Susan Ee,,9780761463276,paperback,to-read,2024/04/01,,,0.0,...,,,,,,No,284.0,2011.0,"[fiction, fantasy, young adult, adventurous, d...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,Eleanore of Avignon,Elizabeth DeLozier,,9780593475034,hardcover,to-read,2024/09/23,,,0.0,...,,,,,,No,320.0,2024.0,"[fiction, historical, emotional, sad, tense, m...",False
724,The Crane Husband,Kelly Barnhill,,9781250850973,hardcover,to-read,2024/09/05,,,0.0,...,,,,,,No,128.0,2023.0,"[fiction, fantasy, horror, dark, emotional, my...",False
725,Artificial Intelligence: An Illustrated Histor...,Clifford A. Pickover,,9781454955788,paperback,,,,,,...,,,,,,Yes,240.0,2019.0,"[nonfiction, history, science, informative, me...",False
726,The Road,Cormac McCarthy,,9780307265432,hardcover,,,,,,...,,,,,,Yes,241.0,2006.0,"[fiction, dystopian, literary, dark, emotional...",False


# Goodreads Web Scraper

In [176]:
pip install selenium

Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [431]:
#imports (will be used as we go)
import sys
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')  # Add the ChromeDriver path to the system path for execution.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import time

In [432]:
# Create chrome instance and configure
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('user-agent=name')
wd = webdriver.Chrome(options=chrome_options)


In [420]:
test = df[7:10]

In [433]:
def gr_scraper(df):
  i = 0
  master = 'https://goodreads.com'
  search_master = '/search'
  element_locator = (By.ID, "search_query_main")
  results = []  # Store results for each row

  for index, row in df.iterrows():
      i += 1
      title = row['Title']
      author = row['Authors'].replace(", ", " ")
      isbn = row['ISBN/UID']
      data = {}

      if pd.isna(isbn):
          search = title + " " + author
      else:
          search = isbn
      print(i)

      try:
          wd.get(master+search_master)
          search_bar = wd.find_element(By.ID, "search_query_main")
          try:
                      button = wd.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/button")
                      button.click()
          except Exception as e:
                pass

          search_bar.clear()
          search_bar.send_keys(search)

          search_button = wd.find_element(By.CSS_SELECTOR, "input.searchBox__button")

          try:
                  button = wd.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/button")
                  button.click()
                  print("Close button clicked successfully.")
          except Exception as e:
            pass

          search_button.click()

          try:
            link = wd.find_element(By.CSS_SELECTOR, "a.bookTitle")
            link.click()
          except Exception as e:
            pass
          try:
            blurb = wd.find_elements(By.CSS_SELECTOR, "span.Formatted")[0].text
            ratings = wd.find_element(By.XPATH,"/html/body/div[1]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[2]/a/div[2]/div/span[1]").text
            ratings = int(re.sub(r"[^\d]", "", ratings))

            data['blurb'] = blurb
            data['ratings'] = ratings
          except Exception as e:
            print('Error occured', e)
            print('Error on blurb/ratings')
            data['blurb'], data['ratings'] = float('nan'), float('nan')
      except Exception as e:
        print('Error occured', e)


      results.append(data)

    # Create new columns from the results
  df['blurb'] = [result.get('blurb', float('nan')) for result in results]
  df['ratings'] = [result.get('ratings', float('nan')) for result in results]

  return df

In [434]:
df = gr_scraper(df)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
Error occured list index out of range
Error on blurb/ratings
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
Error occured list index out of range
Error on blurb/ratings
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
Error occured list index out of range
Error on blurb/ratings
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
Error occured list index out of range
Error on blurb/ratings
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216


KeyboardInterrupt: 

In [435]:
df

Unnamed: 0,Title,Authors,Contributors,ISBN/UID,Format,Read Status,Date Added,Last Date Read,Dates Read,Read Count,...,Review,Content Warnings,Content Warning Description,Tags,Owned?,pages,pub_year,tags,series,Translated
0,You've Lost a Lot of Blood,Eric LaRocca,,9781088025758,paperback,to-read,2024/05/06,,,0.0,...,,,,,No,209.0,2022.0,"[fiction, horror, lgbtqia+, challenging, dark,...",False,False
1,Long After We Are Gone,Terah Shelton Harris,,9781728265773,paperback,to-read,2024/09/06,,,0.0,...,,,,,Yes,412.0,2024.0,"[fiction, literary, emotional, reflective, sad...",False,False
2,Shuggie Bain,Douglas Stuart,,9780802148049,hardcover,to-read,2024/08/27,,,0.0,...,,,,,No,448.0,2020.0,"[fiction, historical, lgbtqia+, literary, dark...",False,False
3,Helpmeet,Naben Ruthnum,,9781988964386,paperback,to-read,2024/10/01,,,0.0,...,,,,,No,89.0,2022.0,"[fiction, horror, dark, mysterious, fast-paced]",False,False
4,Angelfall,Susan Ee,,9780761463276,paperback,to-read,2024/04/01,,,0.0,...,,,,,No,284.0,2011.0,"[fiction, fantasy, young adult, adventurous, d...",True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,Eleanore of Avignon,Elizabeth DeLozier,,9780593475034,hardcover,to-read,2024/09/23,,,0.0,...,,,,,No,320.0,2024.0,"[fiction, historical, emotional, sad, tense, m...",False,False
724,The Crane Husband,Kelly Barnhill,,9781250850973,hardcover,to-read,2024/09/05,,,0.0,...,,,,,No,128.0,2023.0,"[fiction, fantasy, horror, dark, emotional, my...",False,False
725,Artificial Intelligence: An Illustrated Histor...,Clifford A. Pickover,,9781454955788,paperback,,,,,,...,,,,,Yes,240.0,2019.0,"[nonfiction, history, science, informative, me...",False,False
726,The Road,Cormac McCarthy,,9780307265432,hardcover,,,,,,...,,,,,Yes,241.0,2006.0,"[fiction, dystopian, literary, dark, emotional...",False,False


In [162]:

df.to_csv('scrapedDirty.csv', index=False)

print("CSV file saved successfully!")

CSV file saved successfully!


[]