In [2]:
# Import libraries
import pandas as pd
import requests
import html
from bs4 import BeautifulSoup


### Step 1: Create a soup object from the home page

In [3]:
# Home page of the practice site that the rest of the notebook scrapes.
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'
# The timeout stops the cell from hanging indefinitely if the server stalls.
res = requests.get(url, timeout=30)
# Fail here, with a clear HTTP error, instead of parsing an error page further down.
res.raise_for_status()

In [4]:
# HTTP status code of the home-page response (200 means the request succeeded).
res.status_code

200

In [5]:
# Parse the raw response body with the lxml parser so the page can be searched by tag.
soup = BeautifulSoup(res.content, 'lxml')

In [6]:
# Preview the first 1,000 characters of the indented (prettified) HTML
# to get a feel for the page structure before searching it.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Nutrition Information
  </title>
  <link crossorigin="anonymous" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" rel="stylesheet"/>
 </head>
 <body>
  <header>
   <section class="container">
    <nav class="navbar navbar-expand-lg navbar-light bg-light" role="navigation">
     <a class="navbar-brand" href="/">
      Nutrition Information
     </a>
    </nav>
   </section>
  </header>
  <main class="container" role="main">
   <br/>
   <div class="alert alert-danger">
    NOTE: This data is super old and rife with errors. It's meant for scraping practice only.
   </div>
   <table class="table" id="restaurants">
    <thead>
     <tr>
      <th>
       Name
      </th>

In [7]:
# First <title> element found in the parsed page.
print(soup.title)

<title>Nutrition Information</title>


In [8]:
# First <a> element found in the parsed page.
print(soup.a)

<a class="navbar-brand" href="/">Nutrition Information</a>


### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [41]:
# Collect every <table> element on the page.
# NOTE(review): `div` is not referenced again in the visible cells, and the name
# does not describe its contents (a list of tables) — confirm whether this cell is still needed.
div = soup.find_all('table')

In [42]:
# Select the single table whose id is "restaurants".
restaurant_table = soup.find('table', {'id': 'restaurants'})

In [43]:
# Every <a> inside the restaurants table; each link supplies a restaurant's
# name (its text) and relative path (its href).
table1 = restaurant_table.find_all('a')

In [44]:
# Spot-check: text of the first link.
table1[0].text

'A&W Restaurants'

In [45]:
# One record per link in the restaurants table: the link text is the
# restaurant's name and the href is the relative path to its page.
restaurants = [
    {'name': link.text, 'href': link['href']}
    for link in table1
]

In [46]:
# Tabular view of the scraped name/href records, for a quick visual check.
table2 = pd.DataFrame(restaurants)

In [47]:
# Display the name/href frame.
table2

Unnamed: 0,name,href
0,A&W Restaurants,restaurants/1.html
1,Applebee's,restaurants/2.html
2,Arby's,restaurants/3.html
3,Atlanta Bread Company,restaurants/4.html
4,Bojangle's Famous Chicken 'n Biscuits,restaurants/5.html
5,Buffalo Wild Wings,restaurants/6.html
6,Burger King,restaurants/7.html
7,Captain D's,restaurants/8.html
8,Carl's Jr.,restaurants/9.html
9,Charley's Grilled Subs,restaurants/10.html


In [48]:
# First five scraped records, to compare against the expected list-of-dicts structure.
restaurants[0:5]

[{'name': 'A&W Restaurants', 'href': 'restaurants/1.html'},
 {'name': "Applebee's", 'href': 'restaurants/2.html'},
 {'name': "Arby's", 'href': 'restaurants/3.html'},
 {'name': 'Atlanta Bread Company', 'href': 'restaurants/4.html'},
 {'name': "Bojangle's Famous Chicken 'n Biscuits",
  'href': 'restaurants/5.html'}]

### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [61]:
# Relative page paths, in the same order as `restaurants`.
href_list = [entry['href'] for entry in restaurants]

In [62]:
# Root that the relative restaurant hrefs are appended to when requesting each page.
# NOTE(review): this is the same string as `url` in the home-page request cell.
base_url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'

In [71]:
# String-slicing approach adapted from stackoverflow.com

def _between(text, start, end):
    """Return the part of `text` after the first `start` and before the next `end`.

    Raises IndexError when `start` does not occur in `text`.
    """
    return text.split(start)[1].split(end)[0]


# Fetch every restaurant page and attach its list of food dictionaries to the
# matching entry in `restaurants` under the 'foods' key.
for restaurant in restaurants:
    # Page-specific names keep the home-page `res` / `soup` from being overwritten,
    # so the earlier cells still work if they are re-run after this one.
    # NOTE(review): the trailing '/' after the .html path is kept from the original request — confirm it is needed.
    page_res = requests.get(base_url + restaurant['href'] + '/', timeout=30)
    # Stop on an HTTP error instead of failing later with an opaque IndexError while slicing.
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')
    foods = []

    # The first <tr> is skipped (presumably the table's header row).
    for row in page_soup.find_all('tr')[1:]:
        row_html = str(row)  # serialise once; every field is sliced out of this markup
        foods.append({
            # Only the name is unescaped, so entities such as &amp; become plain characters.
            'name': html.unescape(_between(row_html, 'itemprop="name">', '</td>')),
            # The category is read from the slug in the category link's href.
            'category': _between(row_html, 'href="/categories/', '"'),
            'calories': _between(row_html, 'calories">', '</td>'),
            'fat': _between(row_html, 'fatContent">', '</td>'),
            'carbs': _between(row_html, 'carbohydrateContent">', '</td>'),
            'restaurant': restaurant['name'],
        })

    restaurant['foods'] = foods

### Step 4: Create a pandas DataFrame from your list of foods

**Note**: Your DataFrame should have 5,131 rows

In [72]:
# Flatten the per-restaurant 'foods' lists into one list of food dictionaries,
# keeping restaurant order and, within each restaurant, food order.
# A comprehension avoids rebinding the builtin `dict`, which would break any
# later `dict(...)` call in the same kernel.
list_of_food_dicts = [
    food
    for restaurant in restaurants
    for food in restaurant['foods']
]

In [73]:
# One row per food dictionary; the columns come from the dictionary keys.
df = pd.DataFrame(list_of_food_dicts)

In [76]:
#df.head()

### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame

In [75]:
# Write the foods table to a comma-separated file; index=False keeps the
# DataFrame index out of the output.
# NOTE(review): the file name spells "restuarants" — confirm nothing reads this path before renaming it.
df.to_csv('restuarants_dataframe.csv', index=False, sep=",")