In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Billboard: Hot 100 (Scraping to a CSV)

https://www.billboard.com/charts/hot-100

Scrape the fields below, and save as a CSV file.

* Rank
* Artist name
* Song name

This one is pretty simple.

In [2]:
response = requests.get('https://www.billboard.com/charts/hot-100')
doc = BeautifulSoup(response.text)

In [7]:
songs = doc.find_all(class_='chart-list-item')
rows = []
for song in songs:
    row = {}
    row['Rank'] = song['data-rank']
    row['Artist_name'] = song['data-artist']
    row['Song_name'] = song['data-title']
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('billboard_hot_100.csv')

## Goodreads: Science Fiction Books by Female Authors (Scraping to a CSV)

https://www.goodreads.com/list/show/6934.Science_Fiction_Books_by_Female_Authors

Scrape the fields below, and save as a CSV file.

* Field - Example
* Rank -	1
* Title -	The Handmaid's Tale
* Author -	Margaret Atwood
* Score -	score: 30,733
* Votes -	314 people voted
* Rating -	4.09 avg rating — 1,101,120 ratings

In [8]:
response = requests.get('https://www.goodreads.com/list/show/6934.Science_Fiction_Books_by_Female_Authors')
doc = BeautifulSoup(response.text)

In [33]:
books = doc.find_all('tr', itemtype = 'http://schema.org/Book')
rows = []
for book in books:
    row = {}
    row['Rank'] = book.find(class_='number').text.strip()
    row['Title'] = book.find(class_='bookTitle').text.strip()
    row['Author'] = book.find(class_='authorName').text.strip()
    row['Score'] = book.find(href='#').text.strip()
    row['Votes'] = book.find(href='#').find_next_siblings()[1].text.strip()
    row['Rating'] = book.find(class_='minirating').text.strip()
    rows.append(row)
df = pd.DataFrame(rows)
df.to_csv('goodreads_raw.csv')

This one is a little tougher, but the main difficulty is in cleaning the data! Clean and separate the scraped data, cleaning up columns and creating new ones like so:

* Before -	After
* A Wrinkle in Time (Time Quintet, #1) -	A Wrinkle in Time
* Series -	Time Quintet
* Number in series -	1
* score: 30,733 -	30733
* 4.09 avg rating — 1,101,120 ratings -	4.09
* Number of ratings - 1101120

In [138]:
import re
clean_gr = df.copy()

# transform columns
clean_gr['Series'] = clean_gr['Title'].str.extract(r".* [(](.*) #",expand=False).str.replace(",","")
clean_gr['Number_in_series'] = clean_gr['Title'].str.extract(r".* [(].* #(.*)[)]")
clean_gr['Clean_title'] = clean_gr['Title'].str.extract(r"([\w\s']*)[(]*",expand=False).str.strip()
clean_gr['Clean_score'] = clean_gr['Score'].str.replace("score: ","").str.replace(",","").astype(int)
clean_gr['Clean_rating'] = clean_gr['Rating'].str.extract(r"(\d[.]\d\d)").astype(float)
clean_gr['Number_of_ratings'] = clean_gr['Rating'].str.extract(r"rating — (.*) ratings",expand=False).str.replace(",","").astype(int)
clean_gr[['Title','Clean_title','Series','Number_in_series','Score','Clean_score','Clean_rating',
         'Number_of_ratings']]
    
# clean up df by removing unnecessary fields
clean_gr = clean_gr[['Clean_title','Series','Number_in_series','Clean_score','Clean_rating',
         'Number_of_ratings']]
clean_gr

# rename fields as desired
columns = ['Title','Series','Number_in_series','Score','Rating','Number_of_ratings']
clean_gr.columns = columns

# save to csv
clean_gr.to_csv('goodreads_clean.csv')

## Epicurious (Pagination, scraping 1x per row)

https://www.epicurious.com/search/cucumbers - or whatever other search term

Remember, we're scraping multiple pages of search results, so the URL will be different!

Scrape 10 pages of cucumber search results, and save as a CSV file. Include the following fields:

* Tag/category
* Title
* Summary
* Rating (We'll only want the 2, not the 2 / 4)
* Would make again
* Link/URL

Tip: You'll need to try/escape on some of these fields

In [166]:
rows = []
for page_num in range(1,11):
    url = f"https://www.epicurious.com/search/peanuts?page={page_num}"
    print("Scraping",url)
    response = requests.get(url)
    doc = BeautifulSoup(response.text)
    items = doc.find_all('article')
    for item in items:
        row = {}
        try:
            row['Category'] = item.find(class_='tag').text
        except:
            pass
        try:
            row['Title'] = item.find('h4').text
        except:
            pass
        try:
            row['Summary'] = item.find(class_='dek').text
        except:
            pass
        try:
            row['Rating'] = item.find(itemprop='ratingValue').text
        except:
            pass
        try:
            row['Make_again'] = item.find(class_='make-again-percentage').text
        except:
            pass
        try:
            row['URL'] = "https://www.epicurious.com/" + item.find(itemprop='url')['href']
        except:
            pass
        rows.append(row)
        
df = pd.DataFrame(rows)
# remove legal notices
df = df[df['Title']!='Legal Notice']

# save as csv
df.to_csv('epicurious.csv')

Scraping https://www.epicurious.com/search/peanuts?page=1
Scraping https://www.epicurious.com/search/peanuts?page=2
Scraping https://www.epicurious.com/search/peanuts?page=3
Scraping https://www.epicurious.com/search/peanuts?page=4
Scraping https://www.epicurious.com/search/peanuts?page=5
Scraping https://www.epicurious.com/search/peanuts?page=6
Scraping https://www.epicurious.com/search/peanuts?page=7
Scraping https://www.epicurious.com/search/peanuts?page=8
Scraping https://www.epicurious.com/search/peanuts?page=9
Scraping https://www.epicurious.com/search/peanuts?page=10


### Epicurious, Part 2: Once-per-row scraping

Then, open your search results csv, filter for ONLY recipes. Merge the following fields with your original recipes and save as a new CSV file:

* Ingredients
* Directions
* Tags

Tip: If you use .find for your directions/ingredients, it'll print them all on one line. But if you use .find_all to separate them, it makes your life a lot harder! ...unless you just steal this code:

In [None]:
# Get the text from each step individually
# Then join them together with \n between each step
'\n'.join([step.text for step in steps])

In [210]:
df = pd.read_csv('epicurious.csv')
df = df.drop('Unnamed: 0',axis=1)
df = df[df.Category=='recipe']
df.loc[148]['URL']

'https://www.epicurious.com//recipes/food/views/how-to-toast-nuts-51220040'

In [216]:
def scrape_page(row):
    url = row['URL']
    response = requests.get(url)
    doc = BeautifulSoup(response.text)
    page = {}
    # ingredients
    ingredients = ""
    try:
        ingredient_list = doc.find(class_='ingredients').find_all('li')
        for item in ingredient_list:
            ingredients += (item.text + ', ')
    except:
        pass
    page['Ingredients'] = ingredients.rstrip(' ,')
    # directions
    direction_list = doc.find(class_='preparation-steps')
    directions = ""
    for direction in direction_list:
        directions += (direction.text.strip() + " ")
    page['Directions'] = directions.strip()
    # tags
    tags = ""
    try:
        tag_list = doc.find(class_='tags').find_all('a')
        for tag in tag_list:
            tags += (tag.text.strip() + ', ')
    except:
        pass
    page['Tags'] = tags.rstrip(' ,')
    return pd.Series(page)

In [218]:
recipes = df.apply(scrape_page,axis=1)

In [220]:
df = df.merge(recipes,left_index=True,right_index=True)
df

Unnamed: 0,Category,Make_again,Rating,Summary,Title,URL,Ingredients,Directions,Tags
0,recipe,0%,0.0,This boldly flavored fruit salad is a welcome ...,Watermelon with Lime Dressing and Peanuts,https://www.epicurious.com//recipes/food/views...,"2 Tbsp. fresh lime juice, 1 Tbsp. fish sauce, ...","Stir lime juice, fish sauce, sugar, and 1 Tbsp...","Bon Appétit, Watermelon, Lime Juice, Basil, Pe..."
2,recipe,73%,3.5,Quickly caramelized pineapple and sautéed shri...,Pineapple Shrimp Noodle Bowls,https://www.epicurious.com//recipes/food/views...,"12 oz. pad Thai–style rice noodles, 1 lb. larg...",Cook rice noodles according to package directi...,"Shrimp, Noodle, Pineapple, Soy Sauce, Ginger, ..."
3,recipe,50%,3.5,"The shrimp cooks in minutes, so while it’s mar...",Sambal Shrimp Lettuce Wraps,https://www.epicurious.com//recipes/food/views...,1/2 cup hot chili paste (such as sambal oelek)...,"Whisk hot chili paste, honey, vinegar, and 2 t...","Bon Appétit, Dinner, Seafood, Shellfish, Shrim..."
4,recipe,100%,4.0,Sauté asparagus hot and fast in a skillet to s...,Blistered Asparagus with Peanut Dressing,https://www.epicurious.com//recipes/food/views...,"2 Tbsp. vegetable oil, 12 oz. asparagus, trimm...",Heat oil in a large skillet over medium-high. ...,"Bon Appétit, Side, Asparagus, Spring, Peanut, ..."
5,recipe,0%,0.0,"In this Indonesian satay, the dipping sauce is...",Chicken Satay,https://www.epicurious.com//recipes/food/views...,"5 candlenuts or cashews, or 10 blanched almond...",Soak 14 medium bamboo skewers for 30 minutes i...,"HarperCollins, HarperCollins, Indonesian, Sout..."
6,recipe,50%,2.5,"Tossed with crunchy cucumbers, peanuts, and fr...",Thai-Style Squid and Cucumber Salad,https://www.epicurious.com//recipes/food/views...,"1/4 cup fresh lime juice (from about 3 limes),...","Whisk lime juice, garlic, fish sauce, brown su...","Salad, Lime Juice, Garlic, Cucumber, Chile, Pe..."
7,recipe,100%,4.0,"Refreshingly cold noodles, Vietnamese-spiced p...",Rice Noodles with Lemongrass Pork Meatballs,https://www.epicurious.com//recipes/food/views...,"2 stalks lemongrass, 2 pounds ground pork, 1 t...",Cut the bottom 1/2 inch off each lemongrass st...,"Small Plates, Dinner, Meatball, Noodle, Pork, ..."
15,recipe,100%,4.0,These meringue-like cookies have all our favor...,Granola Cluster Cookies,https://www.epicurious.com//recipes/food/views...,"1 1/2 cups pecans, 1 cup sliced almonds, 1 cup...","Preheat oven to 325°F. Toss pecans, almonds, c...","Bon Appétit, Cookies, snack, Granola, Quick an..."
18,recipe,0%,0.0,These easy-to-make peanuts will make you feel ...,Chocolate-Covered Peanuts,https://www.epicurious.com//recipes/food/views...,4 ounces (115 g) semisweet or bittersweet choc...,Put the pieces of chocolate in an absolutely d...,"Chocolate, Nut, Dessert, Fourth of July, Veget..."
24,recipe,0%,0.0,If you have the homemade red curry paste on ha...,Satay Peanut Sauce/Naam Jim Satay,https://www.epicurious.com//recipes/food/views...,"1 quart (820 grams) canola oil, for frying, 1 ...",Heat the oil in a wok or heavy saucepan over m...,"HarperCollins, Thai, Peanut, Chile Pepper, Sau..."


In [221]:
df.to_csv('epicurious_merged.csv')

## Metrolyrics (Pagination, scraping 1x per row)

http://www.metrolyrics.com/rem-lyrics.html (or whatever other musician you'd like!)

Remember, we're scraping multiple pages of search results, so the URL will be different!

Scrape all pages of search results for your musician, and save as a CSV file. Include the following fields:

* Song title
* URL
* Popularity
* Year

Bonus: Make the popularity a normal number (e.g., 6)

In [261]:
rows = []
for page_num in (range(1,7)):
    url = f"http://www.metrolyrics.com/beatles-alpage-{page_num}.html"
    response = requests.get(url)
    doc = BeautifulSoup(response.text)
    songs = doc.find('tbody').find_all('tr')
    for song in songs:
        row = {}
        elements = song.find_all('td')
        row['Title'] = elements[1].text.replace(" Lyrics","").strip()
        row['URL'] = elements[1].find('a')['href']
        row['Popularity'] = elements[3].find('span')['class'][1].replace("popular","")
        row['Year'] = elements[2].text.strip()
        rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('metrolyrics.csv')

### Metrolyrics, Part 2: Scrape the lyrics pages

Then, open your search results csv, and scrape the following field:

* Lyrics

Merge with your original song information and save as a new CSV file

Tip: If you use .find for your lyrics, they'll have a bunch of ads inside! You can use the ingredients/directions trick from above, or you can clean them with regex.

In [265]:
df = pd.read_csv('metrolyrics.csv')
df = df.drop('Unnamed: 0',axis=1)

In [282]:
len(df)

445

In [273]:
def scrape_page(row):
    url = row['URL']
    page = {}
    response = requests.get(url)
    doc = BeautifulSoup(response.text)
    verses = doc.find_all(class_='verse')
    page['Lyrics'] = '\n'.join([verse.text for verse in verses])
    return pd.Series(page)

In [283]:
lyrics = df.apply(scrape_page,axis=1)

In [285]:
df = df.merge(lyrics,left_index=True,right_index=True)

In [287]:
df.to_csv('metrolyrics_merged.csv')

## BONUS: AZLyrics (Scraping 1x per row, getting banned from a website)

Want to know what it feels like to get banned from scraping? Try this one! If someone else tries it before you, you might be out of luck until you get home. If you scrape too much you'll get banned for a couple days, or you'll need to use a VPN. Feel free to come ask me about it, it's a fun one!

Unlike MetroLyrics and Epicurious, this is not multiple pages of search results.

Scrape the following fields, and save as a CSV file:

* Song title
* Song URL
* Tip: You might need to clean up the song URL before you save it https://www.azlyrics.com/r/rem.html

In [288]:
response = requests.get('https://www.azlyrics.com/b/beatles.html')
doc = BeautifulSoup(response.text)

In [297]:
songs = doc.find_all('a',target='_blank',href=True)
rows = []
for song in songs:
    row = {}
    row['Title'] = song.text.strip()
    row['URL'] = song['href'].replace("..","")
    rows.append(row)
df = pd.DataFrame(rows)
df.to_csv('azlyrics.csv')

### AZLyrics, Part 2: Fields to scrape

Then, open your search results csv, and scrape the following field:

* Lyrics

Merge with your original song information and save as a new CSV file

In [299]:
df = pd.read_csv('azlyrics.csv')
df = df.drop('Unnamed: 0',axis=1)

In [300]:
response = requests.get('https://www.azlyrics.com/lyrics/beatles/isawherstandingthere.html')
doc = BeautifulSoup(response.text)

In [313]:
doc.find(class_='ringtone').find_next_siblings()[3].text

'\n\r\n(1,2,3,4!)\n\nWell, she was just seventeen\nYou know what I mean\nAnd the way she looked was way beyond compare\nSo how could I dance with another (Ooh)\nWhen I saw her standing there\n\nWell she looked at me, and I, I could see\nThat before too long I\'d fall in love with her\nShe wouldn\'t dance with another (Whooh)\nWhen I saw her standing there\n\nWell, my heart went "boom"\nWhen I crossed that room\nAnd I held her hand in mine\n\nWhoah, we danced through the night\nAnd we held each other tight\nAnd before too long I fell in love with her\nNow I\'ll never dance with another (Whooh)\nWhen I saw her standing there\n\nWell, my heart went "boom"\nWhen I crossed that room\nAnd I held her hand in mine\n\nWhoah, we danced through the night\nAnd we held each other tight\nAnd before too long I fell in love with her\nNow I\'ll never dance with another (Whooh)\nSince I saw her standing there\nOh since I saw her standing there\nOh since I saw her standing there\n'

In [316]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}

def scrape_page(row):
    url = "https://www.azlyrics.com" + row['URL']
    page = {}
    response = requests.get(url,headers=headers)
    doc = BeautifulSoup(response.text)
    page['Lyrics'] = doc.find(class_='ringtone').find_next_siblings()[3].text
    return pd.Series(page)

In [317]:
df.apply(scrape_page,axis=1)

ConnectionError: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), 'occurred at index 0')

In [None]:
# ^ aaaack!