In [None]:
# Import dependencies

import pandas as pd
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
from splinter import Browser
import time

In [None]:
# URL of the page to scrape: The Infatuation's guide to dog-friendly
# restaurants in San Francisco (site root: https://www.theinfatuation.com)

url = "https://www.theinfatuation.com/san-francisco/guides/dog-friendly-restaurants-SF"

# Create function to get html from website using Beautiful Soup

def getHTMLContent(link):
    """Download a page and return it parsed with Beautiful Soup.

    Parameters
    ----------
    link : str
        URL to open; passed unchanged to ``urlopen``.

    Returns
    -------
    BeautifulSoup
        The response parsed with the "html.parser" backend.
    """
    # Parse while the response is still open (BeautifulSoup reads from it),
    # and let the with-block close the connection even if parsing fails
    with urlopen(link) as response:
        soup = BeautifulSoup(response, "html.parser")
    return soup

In [None]:
# Download and parse the guide page with the helper defined earlier

content = getHTMLContent(url)

# Each restaurant entry sits inside a <div class="spot-block__title-copy">,
# so gather every such div from the parsed page

rest_data = content.find_all(name="div", attrs={"class": "spot-block__title-copy"})

# Sanity check: how many restaurant entries were found?

len(rest_data)

In [None]:
# Dry run over the collected blocks: read each restaurant's name (<h3>) and
# category (<span class="overview-bold">), then print the names so the
# tag/class choice for the name can be checked by eye

for restaurant in rest_data:
    name = restaurant.find("h3").get_text()
    category = restaurant.find("span", class_="overview-bold").get_text()
    print(name)

In [None]:
# Each block also carries a link to the restaurant's own page:
# take the first <a> that has an href and print that href for inspection

for entry in rest_data:
    link_tag = entry.find("a", href=True)
    partial_url = link_tag["href"]
    print(partial_url)

In [None]:
# The address data is nested: it is spread over <p class="small"> tags

city = content.find_all("p", class_="small")

# To get just the text, loop through the first two "p" tags with class "small"

for line in city[0:2]:
    street = line.text

    # Test to see if extraction worked

    print(street)

# The above produces the address in two separate lines
# To get the address in one line, use f-string and position of data

street = city[0].text
city_state = city[1].text

# Combine the data; use ", " as the separator so the prototype produces the
# same "street, city" format that scrape() writes into the final data set

address = f"{street}, {city_state}"

# Display address to see if it is correct

address

In [None]:
# Create function for initializing browser
# Uses chromedriver.exe unless another driver path is given

def init_browser(executable_path='chromedriver.exe', headless=False):
    """Start a Chrome browser through splinter.

    Parameters
    ----------
    executable_path : str, optional
        Path of the chromedriver binary handed to ``Browser``.
        Defaults to 'chromedriver.exe'. Mac users can pass
        "/usr/local/bin/chromedriver" instead of editing this function.
    headless : bool, optional
        Forwarded to ``Browser``; defaults to False, as before.

    Returns
    -------
    The object returned by ``Browser('chrome', ...)``.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)

In [None]:
# From tests above, create one function to scrape all the necessary data

def scrape(url, page_load_wait=3):
    """Scrape name, address and category for every restaurant on a guide page.

    The guide page is opened in a browser, every
    <div class="spot-block__title-copy"> block is read for the restaurant's
    name, category and link, and each linked page is then visited to read
    the address from its first two <p class="small"> tags.

    Parameters
    ----------
    url : str
        URL of the guide page listing the restaurants.
    page_load_wait : int or float, optional
        Seconds to sleep after each page visit before reading the HTML.
        Defaults to 3, the value that was previously hard-coded.

    Returns
    -------
    dict
        Keys "name", "address" and "category", each mapping to a list;
        entries at the same index describe the same restaurant.

    Raises
    ------
    IndexError
        If a restaurant page has fewer than two <p class="small"> tags.
    """
    # Function-scope import keeps this cell independent of the import cell
    from urllib.parse import urljoin

    # Loop-invariant, so defined once instead of on every iteration
    base_url = "https://www.theinfatuation.com"

    # Create an empty dictionary to store scraped dog restaurant data
    restaurants = {"name": [], "address": [], "category": []}

    # Initialize browser (from function created earlier)
    browser = init_browser()

    # try/finally guarantees the browser is closed even when a page fails
    # to load or parse part-way through the loop
    try:
        browser.visit(url)
        time.sleep(page_load_wait)

        dog_soup = BeautifulSoup(browser.html, "html.parser")
        rest_data = dog_soup.find_all("div", class_="spot-block__title-copy")

        # Create a loop to collect the restaurant data
        for restaurant in rest_data:
            name = restaurant.find("h3").text
            category = restaurant.find("span", class_="overview-bold").text
            partial_url = restaurant.find("a", href=True)["href"]

            # urljoin copes with hrefs that are absolute or lack a leading "/",
            # where plain string concatenation would build a broken URL
            browser.visit(urljoin(base_url, partial_url))
            time.sleep(page_load_wait)

            address_soup = BeautifulSoup(browser.html, "html.parser")
            address_box = address_soup.find_all("p", class_="small")
            street = address_box[0].text
            city = address_box[1].text

            restaurants["name"].append(name)
            restaurants["address"].append(f"{street}, {city}")
            restaurants["category"].append(category)
    finally:
        browser.quit()

    return restaurants

In [None]:
# Use the function above to scrape the site
# Note: despite the "_df" suffix, rest_df holds the dictionary returned by
# scrape(), not a DataFrame

rest_df = scrape(url)

In [None]:
# Build a Pandas DataFrame from the scraped dictionary (one column per key)

restaurant_data = pd.DataFrame(rest_df)

# Preview the first rows of the data frame

restaurant_data.head()

In [None]:
# Check to see if all restaurant names were gathered (count the "name" entries)

restaurant_data["name"].size

In [None]:
# Write the data frame to a CSV file (without the index column) for later use

restaurant_data.to_csv(path_or_buf="infat_dog_rest_initial.csv", index=False)