# Scrape URLs on YP.com

In [39]:
# Find all coffee shops in Los Angeles listed on http://www.yellowpages.com
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
# scan and store all page links
def makeURL(pages=38):
    """Build the Yellow Pages search-result URLs for coffee shops in Los Angeles.

    Args:
        pages: number of result pages to cover; pages are numbered from 1.

    Returns:
        A list with one URL per page, in page order. A ``pages`` value of
        zero or less yields an empty list.
    """
    base_url = ('http://www.yellowpages.com/search?search_terms=Coffee'
                '&geo_location_terms=Los%20Angeles%2C%20CA&page=')
    # ``range`` exists on both Python 2 and Python 3, whereas ``xrange`` is
    # Python 2 only; for a few dozen pages the extra list is irrelevant.
    return [base_url + str(page) for page in range(1, pages + 1)]
# Build the search-result page URLs using makeURL's default page count.
urlList = makeURL()

# store all shop links
def menu_link_spider(urlList):
    """Collect the detail-page links of every shop listed on the given pages.

    Each URL in ``urlList`` is downloaded and parsed; every ``<h3>`` whose
    ``class`` attribute is exactly ``n`` is expected to wrap the anchor that
    points at the shop's detail page.

    Args:
        urlList: iterable of search-result page URLs.

    Returns:
        A list of absolute detail-page URLs without duplicates, in the order
        in which they were first encountered.
    """
    site_root = "http://www.yellowpages.com"
    menuLinkList = []
    seen = set()
    for url in urlList:
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, 'html.parser')
        for heading in soup.find_all('h3', class_='n'):
            # class_='n' also matches headings that carry extra classes;
            # keep only those whose class list is exactly ['n'].
            if heading.get('class') != ['n']:
                continue
            link = heading.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # Heading without a usable anchor: nothing to follow.
                continue
            full_link = site_root + href
            # Deduplicate while preserving first-seen order so that slices of
            # the result (e.g. output[800:]) are stable between runs.
            if full_link not in seen:
                seen.add(full_link)
                menuLinkList.append(full_link)
    return menuLinkList
# Crawl every search-result page and keep the de-duplicated shop detail links.
output = menu_link_spider(urlList)

In [353]:
len(output) # number of unique coffee shop links collected (1092 in the recorded run)

1092

In [45]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
import geocoder

def menu_content_spider(linkList):
    """Fetch each page in ``linkList`` and gather its ``<p class="description">`` texts.

    Returns a list with one entry per URL, in input order; each entry is the
    list of ``.string`` values of the matching paragraphs on that page.
    """
    menus = []
    for page_url in linkList:
        response = requests.get(page_url)
        parsed = BeautifulSoup(response.text, 'html.parser')
        paragraphs = parsed.findAll('p', {'class': 'description'})
        menus.append([paragraph.string for paragraph in paragraphs])
    return menus

# find each shop's name, address, phone, website, longitude and latitude; return a data frame.
def res_location_spider(linkList):
    """Scrape name, address, phone, website and coordinates for each shop page.

    Args:
        linkList: iterable of shop detail-page URLs.

    Returns:
        A pandas DataFrame with the columns ``shopname``, ``address``,
        ``phone``, ``website``, ``longitude`` and ``latitude``; one row per
        page on which a name, a street and a city/state could be read.
        ``phone`` and ``website`` hold the string ``"not found"`` when the
        page does not provide them.

    Pages that lack the name heading, the street or the city/state text are
    skipped instead of aborting the whole crawl.
    """
    columns = ['shopname', 'address', 'phone', 'website', 'longitude', 'latitude']
    res_list = list()
    for url in linkList:
        # Optional throttling between requests:
        #time.sleep(random.uniform(1, 10))
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, 'html.parser')

        name_bs = soup.find('h1', {'itemprop': 'name'})
        street = soup.find("p", {"class": "street-address"})
        city_state = soup.find("p", {"class": "city-state"})
        # find() returns None when nothing matches, and .string is None when
        # the tag does not hold exactly one text node; both make the row unusable.
        required = (name_bs, street, city_state)
        if any(tag is None or tag.string is None for tag in required):
            continue

        # NOTE: the substitutions below run on the UTF-8 encoded byte string,
        # which relies on Python 2 string semantics. Quotes and "é" are turned
        # into HTML entities because the name is later pasted into JavaScript.
        name = name_bs.string.encode('utf-8')
        name = re.sub(r"'|’", "&#39", name)
        name = re.sub(r'\xc3\xa9', '&#xE9', name)

        website = "not found"
        website_bs = soup.find("a", {'class': 'custom-link'})
        if website_bs is not None:
            href = website_bs.get("href")
            # Links containing these keys are treated as "no website".
            if href and not any(key in href for key in ['grubhub', 'redirect']):
                website = href

        phone_bs = soup.find("p", {"class": "phone"})
        phone = phone_bs.string if phone_bs is not None else None
        if phone is None:
            phone = "not found"

        address = street.string + city_state.string
        address = re.sub(r"'|’", "&#39", address)

        g = geocoder.google(address)
        longitude = g.lng
        latitude = g.lat
        res_list.append((name, address, phone, website, longitude, latitude))
    df = pd.DataFrame(res_list, columns=columns)
    return df
# Crawl the shop pages from index 800 onward into df5. The frames df1..df4
# used in the next cell are presumably earlier runs of this cell over the
# preceding slices of `output` -- TODO confirm.
df5 = res_location_spider(output[800:])
#df.to_csv("coffee_shops.csv", sep='\t')

In [46]:
# Stack the five result frames into one, renumbering the rows from 0.
frames = [df1, df2, df3, df4, df5]
df = pd.concat(frames, ignore_index=True)
# Remove rows that exactly duplicate an earlier row.
df = pd.DataFrame.drop_duplicates(df)
# Save as tab-separated text; the row index is written too because `index` is left at its default.
df.to_csv("coffee_shops.csv", sep='\t')

# Map out coffee shops with Google Maps API

In [47]:
import pandas as pd
# Load the tab-separated crawl results from coffee_shops.csv.
df = pd.read_csv("coffee_shops.csv", sep='\t')

In [48]:
from IPython.core.display import HTML, Javascript
def gmap_init():
    """Return a ``Javascript`` display object that loads the Google Maps JS API.

    The script first installs a no-op ``window.gmap_initialize`` callback and
    then fetches the Maps API through ``$.getScript``, so ``$`` must be
    available on the page where the object is displayed.
    """
    script_lines = (
        "",
        "window.gmap_initialize = function() {};",
        "$.getScript('https://maps.googleapis.com/maps/api/js?v=3&sensor=false&callback=gmap_initialize');",
        "",
    )
    return Javascript(data="\n".join(script_lines))
# Evaluate the loader as the cell's last expression so the notebook displays (runs) it.
gmap_init()


<IPython.core.display.Javascript object>

In [49]:
%%html
<style type="text/css">
  .map-canvas { height: 400px; }
</style>

In [50]:
from IPython.core.display import HTML, Javascript

def _is_missing(value):
    """Return True for ``None`` and NaN (NaN is the only value unequal to itself)."""
    return value is None or value != value


def _as_text(value, missing=u"not found"):
    """Return *value* as text (``unicode`` on Python 2, ``str`` on Python 3).

    Missing cells (``None``/NaN) are replaced by *missing*.
    """
    text_type = type(u'')
    if _is_missing(value):
        return missing
    if isinstance(value, text_type):
        return value
    if isinstance(value, bytes):
        # Byte strings are assumed to be UTF-8, the encoding the crawler uses.
        return value.decode('utf-8')
    return text_type(value)


def _js_string(value):
    """Return *value* as a JavaScript string literal that is safe inside <script>."""
    import json  # local import keeps this block self-contained
    # json.dumps escapes quotes, backslashes and non-ASCII characters; "</" is
    # escaped as well so the data can never terminate the enclosing <script>.
    return json.dumps(_as_text(value)).replace('</', '<\\/')


def map_pos_coffeeshops(coffeeshops, display=True, lat=34.04461, lng=-118.256742, zoom=12):
    """Build an embedded Google Map with one clickable marker per coffee shop.

    Args:
        coffeeshops: DataFrame providing the columns ``shopname``, ``address``,
            ``phone``, ``website``, ``longitude`` and ``latitude``.
        display: when true, return an IPython ``HTML`` object; otherwise
            return the generated markup as a plain string.
        lat: latitude the map is centred on.
        lng: longitude the map is centred on.
        zoom: initial zoom level of the map.

    Returns:
        The ``<div>`` hosting the map followed by the ``<script>`` that
        creates the map, the markers and their info windows.

    Rows without coordinates are skipped: a single ``nan`` in the generated
    script would be a JavaScript error and prevent every marker from showing.
    """
    div_id = "shopname"  # id of the <div> in which the map is displayed
    # <div> is not a void element, so it needs an explicit closing tag.
    html = """<div id="%s" class="map-canvas"></div>""" % (div_id)

    # HTML shown in the info window that opens when the user clicks a marker.
    # NOTE: values are inserted as HTML on purpose -- the crawler stores
    # entities such as "&#39" in names and addresses.
    content_template = (
        u'<ul style="list-style: none;padding:0; margin:0;">'
        u'<li><a href="{website}" target="_blank">{shopname}</a></li>'
        u'<li>{address}</li><li>{phone}</li></ul>'
    )
    content_template_nolink = (
        u'<ul style="list-style: none;padding:0; margin:0;">'
        u'<li>{shopname}</li><li>{address}</li><li>{phone}</li></ul>'
    )

    # JavaScript emitted once per shop: the marker, its info window and the
    # click handler. {shopname} and {content} are complete JS string literals.
    # The brand comparison matches names as stored by the crawler ("&#39"
    # stands for the apostrophe).
    marker_template = """
        var myLatlng = new google.maps.LatLng({lat},{lng});
        var brand = {shopname};
        var image = (brand == "Starbucks Coffee")? starbucks:
                   ((brand == "The Coffee Bean & Tea Leaf")? cbtl:
                   ((brand == "Peet&#39s Coffee & Tea")? peets: others));
        var marker_{i} = new google.maps.Marker({{
            position: myLatlng,
            map: map,
            icon: image,
            title: {shopname}
        }});

        var contentString = {content};

        var infowindow_{i} = new google.maps.InfoWindow({{
            content: contentString
        }});

        google.maps.event.addListener(marker_{i}, 'click', function() {{
            infowindow_{i}.open(map,marker_{i});
            if (lastWindow) {{
                lastWindow.close();
            }}
            lastWindow = infowindow_{i}
        }});
    """

    # JS initialisation code: the map itself plus everything the markers
    # share. The icons are defined here once instead of once per marker.
    js_init = """
    <script type="text/Javascript">
      (function(){
        var mapOptions = {
            zoom: %s,
            center: new google.maps.LatLng(%s, %s)
          };

        var map = new google.maps.Map(document.getElementById('%s'),
              mapOptions);

        var lastWindow = false;

        var transitLayer = new google.maps.TransitLayer();
        transitLayer.setMap(map);

        var starbucks = new google.maps.MarkerImage("icon/starbucks_tiny.png",
            new google.maps.Size(40, 40),
            new google.maps.Point(0,0),
            new google.maps.Point(0, 32));
        var peets = new google.maps.MarkerImage("icon/peets_tiny.gif",
            new google.maps.Size(40, 40),
            new google.maps.Point(0,0),
            new google.maps.Point(0, 32));
        var cbtl = new google.maps.MarkerImage("icon/cbtl_tiny.png",
            new google.maps.Size(40, 40),
            new google.maps.Point(0,0),
            new google.maps.Point(0, 32));
        var others = new google.maps.MarkerImage("icon/others_tiny.png",
            new google.maps.Size(40, 40),
            new google.maps.Point(0,0),
            new google.maps.Point(0, 20));
    """ % (zoom, lat, lng, div_id)

    # closing script
    js_end = """
      })();
    </script>

    """

    # Now the actual part that generates the markers from the crawled data.
    markers = []
    for i, (_, d) in enumerate(coffeeshops.iterrows()):
        if _is_missing(d.latitude) or _is_missing(d.longitude):
            continue
        shopname = _as_text(d.shopname)
        website = _as_text(d.website)
        fields = dict(shopname=shopname,
                      address=_as_text(d.address),
                      phone=_as_text(d.phone))
        if website == u"not found":
            content = content_template_nolink.format(**fields)
        else:
            # A double quote inside the URL would end the href attribute early.
            content = content_template.format(
                website=website.replace(u'"', u'%22'), **fields)
        markers.append(marker_template.format(
            i=i, lat=d.latitude, lng=d.longitude,
            shopname=_js_string(shopname), content=_js_string(content)))

    html = html + js_init + "".join(markers) + js_end
    if display:
        return HTML(html)
    else:
        return html
# Render the map for all loaded shops with the default centre and zoom.
map_pos_coffeeshops(df)