## Web Scraping

This brings us to the first leg of our journey:

[**This is where our data is:**](https://www.goodreads.com/list/show/5.Best_Books_of_the_Decade_2000s?page=1)

![Goodreads icon](images/Goodreads-icon.png)


**Importing Libraries**

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as Soup
from requests import get
# Reaching this line means every import above succeeded (nothing is installed here).
print("Libraries Installed")

Libraries Installed


**Creating BeautifulSoup Object**

In [None]:
# Download Goodreads list #5 and parse the returned HTML.
# NOTE: despite their names, `url` holds the HTTP response object and `request`
# holds the response body; the names are kept because they are module-level variables.
# The timeout makes a stalled connection raise instead of hanging the notebook forever.
url = get("https://www.goodreads.com/list/show/5", timeout=30)
request = url.text
soup_data = Soup(request, "html.parser")

**Creating Functions**

In [None]:
#Creating URL list as a function
def url_list(soup_data):
    """Return the absolute Goodreads URL of every ``bookTitle`` element on a page.

    Args:
        soup_data: Parsed page (BeautifulSoup object). Every element with class
            ``bookTitle`` is treated as a link to a book page.

    Returns:
        list[str]: ``"https://www.goodreads.com"`` followed by each element's
        ``href`` attribute, in document order. One entry is produced per
        ``bookTitle`` element so the result stays aligned with ``title_list``;
        an element without an ``href`` contributes just the base URL.
    """
    base_url = "https://www.goodreads.com"
    anchors = soup_data.findAll(class_="bookTitle")
    # Read the attribute directly instead of slicing the tag's HTML text: the
    # text-slicing approach depends on attribute order and leaves the closing
    # quote character attached to the URL.
    return [base_url + (anchor.get("href") or "") for anchor in anchors]

In [None]:
#Collecting the titles found on a page
def title_list(soup_data):
    """Return the whitespace-stripped text of every ``bookTitle`` element.

    Args:
        soup_data: Parsed page (BeautifulSoup object).

    Returns:
        list[str]: One stripped title per matching element, in document order.
    """
    stripped_titles = []
    for tag in soup_data.findAll(class_="bookTitle"):
        stripped_titles.append(tag.text.strip())
    return stripped_titles

In [None]:
#Collecting the author names found on a page
def author_list(soup_data):
    """Return the raw text of every ``authorName`` element.

    Args:
        soup_data: Parsed page (BeautifulSoup object).

    Returns:
        list[str]: One entry per matching element, in document order. The text
        is returned as-is (no stripping).
    """
    names = []
    for tag in soup_data.findAll(class_="authorName"):
        names.append(tag.text)
    return names

In [None]:
#Extracting the ratings count from a book page
def ratings_list(soup_data):
    """Return the ratings figure shown in a book page's ``bookMeta`` block.

    Despite the name, this returns a single value, not a list.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        str | float: The first line of the first ``a.gr-hyperlink`` inside the
        first ``div#bookMeta``, with commas removed (still a string, not an
        int). ``np.nan`` when the block or the link is missing.
    """
    book_meta = soup_data.findAll("div", {"id": "bookMeta"})
    # Explicit guards replace a bare ``except:`` that also swallowed
    # KeyboardInterrupt/SystemExit.
    if not book_meta:
        return np.nan
    ratings_link = book_meta[0].find("a", {"class": "gr-hyperlink"})
    if ratings_link is None:
        return np.nan
    return ratings_link.text.strip().split("\n")[0].replace(",", "")

In [None]:
#Extracting the reviews count from a book page
def reviews_list(soup_data):
    """Return the integer found on the second-to-last line of ``div#bookMeta``.

    Despite the name, this returns a single value, not a list.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        int | float: The second-to-last line of the block's stripped text,
        parsed as an int after removing commas. ``np.nan`` when the block is
        missing, has fewer than two lines, or that line is not a number.
    """
    book_meta = soup_data.findAll("div", {"id": "bookMeta"})
    try:
        second_to_last_line = book_meta[0].text.strip().split("\n")[-2]
        return int(second_to_last_line.strip().replace(",", ""))
    except (IndexError, ValueError):
        # IndexError: no block / too few lines; ValueError: line is not numeric.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan

In [None]:
#Creating Avg Rating list as a function
def avg_list(soup_data):
    """Return the average rating parsed from every ``minirating`` element.

    Args:
        soup_data: Parsed page (BeautifulSoup object).

    Returns:
        list[float]: For each matching element, the first four characters of
        its text parsed as a float; ``np.nan`` where those characters are not
        a number. One entry per element, in document order.
    """
    averages = []
    for tag in soup_data.findAll(class_="minirating"):
        try:
            averages.append(float(tag.text[0:4].strip()))
        except ValueError:
            # Keep a placeholder so the list stays aligned with the other columns.
            averages.append(np.nan)
    return averages

In [None]:
#Extracting the number of pages from a book page
def pages_list(soup_data):
    """Return the page count from the first ``span[itemprop=numberOfPages]``.

    Despite the name, this returns a single value, not a list.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        int | float: The text before the first space parsed as an int, or
        ``np.nan`` when the span is missing or does not start with a number.
    """
    page_spans = soup_data.findAll("span", {"itemprop": "numberOfPages"})
    try:
        return int(page_spans[0].text.split(" ")[0])
    except (IndexError, ValueError):
        # IndexError: span not found; ValueError: leading token is not numeric.
        return np.nan

In [None]:
#Extracting the published year from a book page
def published_list(soup_data):
    """Return the last four characters of the fifth line of ``div#details``.

    Despite the name, this returns a single value, not a list.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        str | float: The last four characters of line index 4 of the first
        ``div#details`` element's text (presumably the publication year; the
        value is not validated). ``np.nan`` when the block is missing or has
        fewer than five lines.
    """
    details = soup_data.findAll("div", {"id": "details"})
    try:
        return details[0].text.split("\n")[4][-4::]
    except IndexError:
        # No details block, or the block has fewer than five lines.
        return np.nan

In [None]:
#Flagging whether a book page shows series information
def series_list(soup_data):
    """Return a 0/1 flag for whether ``h2#bookSeries`` contains any text.

    Despite the name, this returns a single value, not a list.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        int | float: ``1`` when the first ``h2#bookSeries`` element has
        non-whitespace text, ``0`` when it is empty, and ``np.nan`` when the
        element is not present at all.
    """
    series_headers = soup_data.findAll("h2", {"id": "bookSeries"})
    # An explicit emptiness check replaces a bare ``except:`` around the index.
    if not series_headers:
        return np.nan
    return 0 if series_headers[0].text.strip() == "" else 1

In [None]:
#Creating Genres list as a function
def genres_list(soup_data):
    """Return three genre names parsed from the ``rightContainer`` text.

    The text after the first "Genres" marker is split on blank-line runs
    (``"\\n\\n\\n"``); the last line of chunks 1, 4 and 7 is taken as a genre.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        list[str] | float: Exactly three stripped strings, or ``np.nan`` when
        the container or the "Genres" marker is missing or fewer chunks than
        needed are present (no partial list is returned).
    """
    containers = soup_data.findAll(class_="rightContainer")
    try:
        # Parse once; this split does not change between the three lookups.
        chunks = containers[0].text.strip().split("Genres")[1].split("\n\n\n")
        return [chunks[i].split("\n")[-1].strip() for i in range(1, 10, 3)]
    except IndexError:
        return np.nan

In [None]:
#Creating Awards list as a function
def awards_list(soup_data):
    """Return the stripped text of every ``award`` element on a book page.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        list[str]: One entry per matching element, in document order; an empty
        list when the page has none.
    """
    # No try/except: an empty result is simply an empty list, and the previous
    # bare ``except:`` could only hide interrupts or programming errors.
    return [award.text.strip() for award in soup_data.findAll(class_="award")]

In [None]:
#Creating Places List as a Function
def places_list(soup_data):
    """Return the "Setting" entries parsed from the ``bookDataBox`` element.

    The text between ``"\\nSetting"`` and ``"Literary Awards"`` is split into
    chunks on ``"\\n\\n\\n"``; each non-empty chunk is split again on
    ``"\\n\\n"`` into its parts.

    Args:
        soup_data: Parsed book page (BeautifulSoup object).

    Returns:
        list[list[str]] | float: One inner list per non-empty chunk, or
        ``np.nan`` when the data box or the "Setting" marker is missing.
    """
    data_boxes = soup_data.findAll(id="bookDataBox")
    try:
        setting_text = (
            data_boxes[0].text.split("\nSetting")[1].strip()
            .split("Literary Awards")[0].strip()
        )
    except IndexError:
        # No data box on the page, or no "Setting" section inside it.
        return np.nan

    places_lists = []
    for chunk in setting_text.split("\n\n\n"):
        chunk = chunk.strip()
        # Drop empty chunks here. The earlier filter compared each inner list
        # with '' and therefore never removed the [''] placeholders.
        if chunk:
            places_lists.append(chunk.split("\n\n"))
    return places_lists

**Creating Main Function**

In [None]:
#Downloading a page and returning it as a soup object
def get_data(url, timeout=30):
    """Fetch ``url`` and return its body parsed with ``html.parser``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection,
            which would freeze a scrape spanning many pages.

    Returns:
        BeautifulSoup: The parsed response body. The HTTP status code is not
        checked, so an error page is parsed like any other page.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            timeout expires.
    """
    response = get(url, timeout=timeout)
    return Soup(response.text, "html.parser")

#Scraping one list page (and every book it links to) into a dict of columns
def book(url):
    """Scrape a list page plus each linked book page into column lists.

    Args:
        url: Address of the list page.

    Returns:
        dict[str, list]: Keys "URL", "Title", "Author", "Number of Ratings",
        "Number of Reviews", "Average Ratings", "Number of Pages",
        "Published Year", "Series", "Genres", "Awards" and "Places". The
        first three and "Average Ratings" come from the list page itself; the
        others hold one value per URL, read from that book's own page.
    """
    listing_soup = get_data(url)

    # Columns available directly on the list page.
    book_urls = url_list(listing_soup)
    titles = title_list(listing_soup)
    authors = author_list(listing_soup)
    averages = avg_list(listing_soup)

    # Per-book extractors, listed in the order they are applied to each page.
    detail_extractors = (
        ("Number of Pages", pages_list),
        ("Number of Ratings", ratings_list),
        ("Number of Reviews", reviews_list),
        ("Published Year", published_list),
        ("Series", series_list),
        ("Genres", genres_list),
        ("Awards", awards_list),
        ("Places", places_list),
    )
    details = {column: [] for column, _ in detail_extractors}

    # One extra request per book: visit its page and run every extractor on it.
    for book_url in book_urls:
        detail_soup = get_data(book_url)
        for column, extract in detail_extractors:
            details[column].append(extract(detail_soup))

    return {
        "URL": book_urls,
        "Title": titles,
        "Author": authors,
        "Number of Ratings": details["Number of Ratings"],
        "Number of Reviews": details["Number of Reviews"],
        "Average Ratings": averages,
        "Number of Pages": details["Number of Pages"],
        "Published Year": details["Published Year"],
        "Series": details["Series"],
        "Genres": details["Genres"],
        "Awards": details["Awards"],
        "Places": details["Places"],
    }


# Address of the list's first page; passed to book() in the next cell.
url="https://www.goodreads.com/list/show/5"
    
    

In [None]:
#Creating Column Names as a List
column_names = ["URL", "Title", "Author", "Number of Ratings", "Number of Reviews",
                "Average Ratings", "Number of Pages", "Published Year",
                "Series", "Genres", "Awards", "Places"]

#Building the addresses of list pages 2-10 (page 1 is scraped separately below).
#A list, not a set, so the pages are visited in ascending order and the rows of
#the final table come out in the same order on every run.
list_page_prefix = "https://www.goodreads.com/list/show/5.Best_Books_of_the_Decade_2000s?page="
list_pages = [list_page_prefix + str(page) for page in range(2, 11)]

#The first page seeds the accumulated columns
main_page = book(url)

#Append the columns of every remaining page to the accumulated ones
for page in list_pages:
    next_page = book(page)
    for column in column_names:
        main_page[column].extend(next_page[column])

**Creating Data Frame**

In [None]:
#Turn the accumulated columns into a table and display it
df = pd.DataFrame(main_page)
df

In [None]:
#Save the table to data/Data.csv.
#The path is built with pathlib rather than a hard-coded backslash: on
#Linux/macOS the literal 'data\Data.csv' is a single file name, not a file
#inside the data folder. The folder is created first if it does not exist.
output_path = Path("data") / "Data.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5bf96c14-2f6d-49a7-85d0-5816aca72bae' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>