# 시카고 샌드위치 맛집 분석

### 1. 시카고 샌드위치 맛집 사이트에 접근하기

In [5]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [6]:
# Build the absolute URL of the Chicago Magazine sandwich-ranking article.
url_base = 'http://www.chicagomag.com'
url_sub = '/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
url = url_base + url_sub

# Parse inside a `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it, instead of leaving the connection open.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [7]:
# Collect every element matching the CSS selector '.sammy' and display the
# first match to inspect its markup.
rest_list = soup.select('.sammy')
rest_list[0]

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br>
Old Oak Tap<br>
<em>Read more</em> </br></br></a></div>
</div>

In [8]:
# Number of '.sammy' entries found on the page.
len(rest_list)

50

### 2. 접근한 웹 페이지에서 원하는 데이터 추출하고 정리하기

In [9]:
# rank: text of the '.sammyRank' element inside the first entry (still a
# string at this point).
rank = rest_list[0].select_one('.sammyRank').get_text()
rank

'1'

In [10]:
from urllib.parse import urljoin

# Resolve the relative href of entry 10's link against the site root to get
# the absolute address of its detail page.
sample_link = rest_list[10].find('a')
url = urljoin(url_base, sample_link['href'])
url

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Lula-Cafe-Ham-and-Raclette-Panino/'

In [11]:
# Raw text of the first entry's <a> tag; the menu and cafe names are both
# embedded in it, separated by line breaks.
tmp = rest_list[0].find('a').get_text()
tmp

'BLT\r\nOld Oak Tap\nRead more '

In [12]:
# Splitting on '\n' separates the pieces; a '\r' can remain on the first one.
tmp.split('\n')

['BLT\r', 'Old Oak Tap', 'Read more ']

In [13]:
# The first line holds the menu name (minus its carriage return) and the
# second line holds the cafe name.
name_lines = tmp.split('\n')
menu, cafe = name_lines[0].replace('\r', ''), name_lines[1]
menu, cafe

('BLT', 'Old Oak Tap')

In [14]:
# Walk every entry and collect its rank, detail-page URL, menu name and cafe
# name into four parallel lists.
rank_list, url_list, menu_list, cafe_list = [], [], [], []
for rest in rest_list:
    rank = int(rest.select_one('.sammyRank').get_text())
    anchor = rest.find('a')
    url = urljoin(url_base, anchor['href'])
    name_lines = anchor.get_text().split('\n')
    # Line 0 is the menu name (with a stray '\r' removed), line 1 the cafe.
    menu = name_lines[0].replace('\r', '')
    cafe = name_lines[1]

    # Append only after every field parsed, so the lists stay the same length.
    for column, value in zip(
        (rank_list, url_list, menu_list, cafe_list),
        (rank, url, menu, cafe),
    ):
        column.append(value)

In [15]:
import pandas as pd

# Assemble the four scraped lists into a table (one row per entry) and
# preview the first rows.
df = pd.DataFrame(dict(Rank=rank_list, Cafe=cafe_list, Menu=menu_list, URL=url_list))
df.head()

Unnamed: 0,Rank,Cafe,Menu,URL
0,1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [16]:
# Persist the listing table as UTF-8 CSV; index=False keeps the row index out
# of the file.
df.to_csv('../input/chicago.csv', sep=',', encoding='utf8', index=False)

### 3. 다수의 웹 페이지에 자동으로 접근해서 원하는 정보 가져오기

In [17]:
# Fetch the detail page of the first row. The `with` block closes the HTTP
# response once BeautifulSoup has consumed it.
with urlopen(df['URL'][0]) as html:
    soup_tmp = BeautifulSoup(html, 'html.parser')

In [18]:
# First element with class 'addy' on the detail page.
soup_tmp.select_one('.addy')

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

In [19]:
# The same element reduced to its text content (markup stripped).
soup_tmp.select_one('.addy').get_text()

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [20]:
# Break the '.addy' text into comma-separated fields and count them.
addy_text = soup_tmp.select_one('.addy').get_text()
tmp = addy_text.split(',')
len(tmp)

3

In [21]:
# First comma-separated field: the price followed by the street address.
tmp[0]

'\n$10. 2109 W. Chicago Ave.'

In [22]:
# The first whitespace-separated token is the price; [:-1] drops its last
# character (the trailing '.' in '$10.').
price = tmp[0].split()[0][:-1]
price

'$10'

In [23]:
# Everything after the price token, re-joined with single spaces, is the
# street address.
address = ' '.join(tmp[0].split()[1:])
address

'2109 W. Chicago Ave.'

In [24]:
# Second comma-separated field with surrounding whitespace removed (the phone
# number in this sample).
tmp[1].strip()

'773-772-0406'

In [25]:
# Alternative parse: split the '.addy' text on whitespace instead of commas.
# The first token is the price and the last two are the phone number and the
# home page. (A leftover `price = tmp[0].split()` assignment was removed: its
# value was overwritten before ever being read.)
tmp = soup_tmp.select_one('.addy').get_text().split()
tmp[0], tmp[-2], tmp[-1]

('$10.', '773-772-0406,', 'theoldoaktap.com')

In [26]:
# Tokens between the price and the phone number form the street address;
# commas inside it are removed.
' '.join(tmp[1:-2]).replace(',', '')

'2109 W. Chicago Ave.'

In [27]:
# Price is the first token without its last character; the phone number is
# the second-to-last token with its comma removed.
price, tel = tmp[0][:-1], tmp[-2].replace(',', '')
price, tel

('$10', '773-772-0406')

In [28]:
from tqdm import tqdm_notebook

In [29]:
# Scrape every entry again, this time also visiting its detail page to collect
# price, address, telephone and home page. All eight lists receive exactly one
# value per entry so they can be combined into a DataFrame afterwards.
rank_list = []; url_list = []; menu_list = []; cafe_list = []
price_list = []; address_list = []; tel_list = []; hp_list = []
for rest in tqdm_notebook(rest_list):
    rank = int(rest.select_one('.sammyRank').get_text())
    url = urljoin(url_base, rest.find('a')['href'])
    tmp = rest.find('a').get_text().split('\n')
    menu = tmp[0].replace('\r', '')
    cafe = tmp[1]

    rank_list.append(rank); url_list.append(url)
    menu_list.append(menu); cafe_list.append(cafe)

    # Close each HTTP response as soon as it is parsed; this loop opens one
    # connection per entry.
    with urlopen(url) as html:
        soup_tmp = BeautifulSoup(html, 'html.parser')
    tmp = soup_tmp.select_one('.addy').get_text().split(',')
    # The price token normally ends with '.'; rstrip removes it without
    # eating a digit when the period is missing.
    price = tmp[0].split()[0].rstrip('.')
    address = ' '.join(tmp[0].split()[1:])

    price_list.append(price)

    # ' ' is the placeholder for a missing value in the branches below.
    if len(tmp) == 1:
        # Only "price + address" is present.
        address_list.append(address)
        tel_list.append(' ')
        hp_list.append(' ')
    elif len(tmp) == 2:
        if 'Multiple' in address:
            # The address text contains "Multiple": no single street address
            # or phone number, only a home page.
            address_list.append(' ')
            tel_list.append(' ')
            hp_list.append(tmp[1])
        else:
            address_list.append(address)
            tel_list.append(tmp[1].strip())
            hp_list.append(' ')
    else:
        # Three or more fields: the last two are phone and home page, and any
        # fields in between continue the address (e.g. a suburb name). This
        # also covers pages with more than four fields, which previously
        # appended nothing and left the lists with different lengths.
        for extra in tmp[1:-2]:
            address += ', ' + extra
        address_list.append(address)
        tel_list.append(tmp[-2].strip())
        hp_list.append(tmp[-1])

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [30]:
# Check that all lists going into the DataFrame have the same length.
tuple(
    len(column)
    for column in (
        rank_list, menu_list, cafe_list, price_list,
        address_list, tel_list, hp_list,
    )
)

(50, 50, 50, 50, 50, 50, 50)

In [31]:
# Combine the listing columns with the details scraped from each page into one
# table and preview the first rows.
column_names = ['Rank', 'Cafe', 'Menu', 'Price', 'Address', 'Telephone', 'Home Page']
column_values = [rank_list, cafe_list, menu_list, price_list, address_list, tel_list, hp_list]
df2 = pd.DataFrame(dict(zip(column_names, column_values)))
df2.head()

Unnamed: 0,Rank,Cafe,Menu,Price,Address,Telephone,Home Page
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,773-772-0406,theoldoaktap.com
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,312-929-4580,aucheval.tumblr.com
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,312-334-3688,rickbayless.com
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston",847-475-9400,alsdeli.net
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,312-445-8977,publicanqualitymeats.com


In [32]:
# Persist the detailed table as UTF-8 CSV.
# NOTE(review): unlike the chicago.csv export, index=False is not passed here,
# so the row index is written as an extra unnamed first column - confirm that
# this is intended.
df2.to_csv('../input/chicago2.csv', sep=',', encoding='utf8')

### 4. 맛집 위치를 지도에 표기하기

In [33]:
import numpy as np 
import pandas as pd 
import folium
import googlemaps

In [34]:
# Load the Google Maps API key from a local file. The `with` block closes the
# file even if reading fails, and strip() drops a trailing newline that would
# otherwise become part of the key.
with open('googlemapskey.txt', mode='r') as key_fd:
    gmaps_key = key_fd.read(100).strip()
# The key is deliberately not echoed: cell output is saved with the notebook,
# which would publish the credential.

In [35]:
# Create the Google Maps client with the key loaded above.
gmaps = googlemaps.Client(key=gmaps_key)

In [36]:
# Geocode every address with the Google Maps client. Rows without a usable
# address, or for which the service returns no match, get NaN coordinates so
# `lat`/`lng` always stay aligned with df2's rows.
lat = []; lng = []

for n in tqdm_notebook(df2.index):
    # Missing addresses are stored as a blank string by the scraping loop.
    street = df2['Address'][n].strip()
    # The city name narrows the search (previously misspelled as 'Cicago').
    # An empty list stands in for "nothing to look up", so both failure modes
    # share the NaN branch below.
    gmaps_output = gmaps.geocode(street + ', ' + 'Chicago') if street else []
    if gmaps_output:
        location_output = gmaps_output[0]['geometry']['location']
        lat.append(location_output['lat'])
        lng.append(location_output['lng'])
    else:
        lat.append(np.nan)
        lng.append(np.nan)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




ApiError: REQUEST_DENIED (This API project is not authorized to use this API.)

In [None]:
# Inspect the collected latitudes.
print(lat)

In [None]:
# Attach the geocoded coordinates as new columns and preview the result.
# NOTE(review): 'Lattitude' is misspelled, but the mapping cells below read the
# column under this exact name - rename it everywhere at once or not at all.
df2['Lattitude'] = lat
df2['Longitude'] = lng
df2.head()

In [None]:
# Centre the map on the mean coordinate of all rows and mark that point.
center = [df2['Lattitude'].mean(), df2['Longitude'].mean()]
mapping = folium.Map(location=center, zoom_start=11)
center_marker = folium.Marker(center, popup='center')
center_marker.add_to(mapping)
mapping

In [None]:
# Redraw the map with one marker per cafe that has coordinates.
mapping = folium.Map(location=[df2['Lattitude'].mean(), df2['Longitude'].mean()], zoom_start=11)
for n in df2.index:
    lat_n, lng_n = df2['Lattitude'][n], df2['Longitude'][n]
    # Rows that could not be geocoded hold NaN and cannot be drawn; testing the
    # coordinates directly also covers the blank-address rows.
    if pd.notna(lat_n) and pd.notna(lng_n):
        # The popup is this row's cafe name (previously the whole 'Cafe'
        # column was passed to every marker).
        folium.Marker([lat_n, lng_n], popup=df2['Cafe'][n]).add_to(mapping)
mapping

In [None]:
# Persist the table, now including the coordinate columns, as UTF-8 CSV.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed first column - confirm that this is intended.
df2.to_csv('../input/chicago3.csv', sep=',', encoding='utf8')