# Chicago 사이트에서 정보 불러오고 맵에 표시하기

#### 참고자료 (여기서는 2012년도 기사 페이지를 활용할 것)
- (2012년도) : https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/

- 최신년도 : https://www.chicagomag.com

In [1]:
from bs4 import BeautifulSoup
import requests

### 사이트 접속

In [2]:
url = 'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'

# Send a browser-like User-Agent header: without it the site answers with an
# "access denied" page instead of the article.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

In [3]:
# headers={"User-Agent":"Mozilla/5.0"}를 지정하지 않으면
# 사이트가 접근을 차단한 페이지를 돌려줌 (아래 print로 확인 가능)
# print(soup)

### 첫 번째 샌드위치 정보 뽑아보기

In [4]:
# Number of sandwich entries on the page (one <div class="sammy"> per entry).
len(soup.find_all('div', class_='sammy'))

50

In [5]:
# Show the markup of the first sandwich entry.
soup.find_all('div', class_='sammy')[0]

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>

In [6]:
# Keep the first sandwich entry in a variable so the next cells can explore it.
tmp = soup.find_all('div', class_='sammy')[0]

In [7]:
# `.text` is a property backed by `get_text()`, so the two are interchangeable
# when no arguments are needed.
# `select` returns a list of matches, hence the [0] to reach the single tag.
tmp.select('.sammyRank')[0].get_text()

'1'

In [8]:
# Raw text of the listing block: menu, cafe name and the "Read more" link text.
tmp.select('.sammyListing')[0].get_text()

'BLT\nOld Oak Tap\nRead more '

In [9]:
# Split the listing text on newlines (not on spaces) so that multi-word names
# such as "Old Oak Tap" stay in one piece.
tmp.select('.sammyListing')[0].get_text().split('\n')

['BLT', 'Old Oak Tap', 'Read more ']

In [10]:
# Attribute-style navigation: `tmp.a` is shorthand for `tmp.find('a')`.
tmp.a

<a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a>

In [11]:
# Goal: extract only the value of the href attribute.
tmp.select('div.sammyListing a')[0] # not there yet: this is the whole <a> tag, not its href

<a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a>

In [12]:
# Indexing a tag with an attribute name returns that attribute's value.
tmp.a['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

### 50종류의 샌드위치 정보를 항목별 리스트에 저장

In [13]:
# Four parallel lists, one value per sandwich entry.
rank, main_menu, cafe_name, link_url = [], [], [], []

list_soup = soup.select('div.sammy')

for entry in list_soup:
    rank_tag = entry.select('.sammyRank')[0]
    rank.append(rank_tag.get_text())

    # The listing text has the form "<menu>\n<cafe>\nRead more ";
    # splitting on newlines yields the menu first and the cafe second.
    listing_tag = entry.select('.sammyListing')[0]
    listing_lines = listing_tag.get_text().split('\n')
    main_menu.append(listing_lines[0])
    cafe_name.append(listing_lines[1])

    # Site-relative link to the sandwich's detail page.
    anchor = entry.find('a')
    link_url.append(anchor['href'])

In [14]:
# Sanity check: all four lists should have the same length.
print(*(len(column) for column in (rank, main_menu, cafe_name, link_url)))

50 50 50 50


In [15]:
# Show the collected ranks; they are the tag texts, i.e. strings rather than ints.
print(rank)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50']


In [16]:
# Page the links were scraped from; used as the base when turning the
# site-relative hrefs into absolute URLs.
base_url = 'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'

In [17]:
from urllib.parse import urljoin
from tqdm.notebook import tqdm

# Rebuild the four parallel lists, this time storing absolute URLs.
rank, main_menu, cafe_name, link_url = [], [], [], []

list_soup = soup.select('div.sammy')

# Wrapping the iterable in tqdm shows a progress bar while the loop runs.
for entry in tqdm(list_soup):
    rank.append(entry.select('.sammyRank')[0].get_text())

    # The listing text has the form "<menu>\n<cafe>\nRead more ";
    # split it on newlines.
    listing_lines = entry.select('.sammyListing')[0].get_text().split('\n')
    main_menu.append(listing_lines[0])
    cafe_name.append(listing_lines[1])

    # The hrefs on the page are site-relative; resolve them against base_url.
    relative_link = entry.find('a')['href']
    link_url.append(urljoin(base_url, relative_link))

  0%|          | 0/50 [00:00<?, ?it/s]

In [18]:
import pandas as pd

# One column per scraped list; columns appear in the order given here.
data = dict(Rank=rank, Cafe=cafe_name, Menu=main_menu, URL=link_url)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Rank,Cafe,Menu,URL
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...
3,4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...
4,5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...


### 이제 주소와 가격 등을 뽑기 위해
### 샌드위치마다 각각의 상세 페이지 URL에 접속해야 함

### 첫 번째 샌드위치의 URL에 접근

In [19]:
# Row labels of the frame; the loop over the detail pages iterates over these.
df.index

RangeIndex(start=0, stop=50, step=1)

In [20]:
# Detail-page URL of the first row (row label 0, column 'URL').
df.loc[0, 'URL']

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [21]:
# Fetch the detail page of the first of the 50 sandwiches.
# Same browser-like User-Agent as for the list page; the timeout prevents the
# cell from hanging and raise_for_status() surfaces HTTP errors immediately.
response = requests.get(df['URL'][0], headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
response.raise_for_status()
html = response.text
tmp = BeautifulSoup(html, 'lxml')

In [22]:
# Text of <p class="addy">: price, street address, phone number and website.
tmp.select('p.addy')[0].get_text()

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [23]:
# Keep only the part before the first comma: the price followed by the street address.
tmp.select('p.addy')[0].get_text().split(',')[0]

'\n$10. 2109 W. Chicago Ave.'

In [24]:
# Break that part into whitespace-separated tokens (this also drops the leading newline).
tmp.select('p.addy')[0].get_text().split(',')[0].split()

['$10.', '2109', 'W.', 'Chicago', 'Ave.']

In [25]:
# The first token is the price, still carrying its trailing period ("$10.").
tmp.select('p.addy')[0].get_text().split(',')[0].split()[0]

'$10.'

In [26]:
# Strip the trailing period from the price token.
# [:-1] keeps everything except the last character;
# [-1] would give only the last character.
tmp.select('p.addy')[0].get_text().split(',')[0].split()[0][:-1]

'$10'

In [27]:
# Everything after the price token belongs to the street address.
tmp.select('p.addy')[0].get_text().split(',')[0].split()[1:]

['2109', 'W.', 'Chicago', 'Ave.']

In [28]:
# Join the address tokens back together with single spaces: the finished address.
' '.join(tmp.select('p.addy')[0].get_text().split(',')[0].split()[1:])

'2109 W. Chicago Ave.'

### 50종류의 샌드위치에 대해 주소와 가격을 뽑아옴

In [29]:
import time

# Parallel lists filled in the same order as the rows of df.
price = []
address = []

for page_url in tqdm(df['URL']):
    # Browser-like User-Agent as before; the timeout keeps one stalled request
    # from blocking the whole loop, and raise_for_status() stops on HTTP errors
    # instead of parsing an error page.
    response = requests.get(page_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    response.raise_for_status()
    # Pause between requests so the site is not hit 50 times in a burst.
    time.sleep(1)
    page = BeautifulSoup(response.text, 'lxml')

    # Text before the first comma is "<price>. <street address>"; split it into tokens.
    val_list = page.select('p.addy')[0].get_text().split(',')[0].split()

    # Remove only trailing periods: a blind [:-1] would cut off a digit whenever
    # the price token happens to lack the final period.
    price.append(val_list[0].rstrip('.'))
    address.append(' '.join(val_list[1:]))

  0%|          | 0/50 [00:00<?, ?it/s]

In [30]:
# Sanity check: one price and one address per row.
print(*(len(values) for values in (price, address)))

50 50


In [31]:
# Attach the scraped prices and addresses as new columns, then preview the result.
for column, values in (('Price', price), ('Address', address)):
    df[column] = values
df.head()

Unnamed: 0,Rank,Cafe,Menu,URL,Price,Address
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...,$10,2109 W. Chicago Ave.
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...,$9,800 W. Randolph St.
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...,$9.50,445 N. Clark St.
3,4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...,$9.40,914 Noyes St.
4,5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...,$10,825 W. Fulton Mkt.


In [32]:
# The detail-page URLs are no longer needed once price and address are extracted.
df.drop(columns='URL', inplace=True)
df.head()

Unnamed: 0,Rank,Cafe,Menu,Price,Address
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.
3,4,Al’s Deli,Roast Beef,$9.40,914 Noyes St.
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.


In [33]:
# Use the rank as the row label. The ranks were scraped as text, so the
# resulting index holds strings ('1', '2', ...), not integers.
df = df.set_index('Rank')
df.head()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.
2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.
4,Al’s Deli,Roast Beef,$9.40,914 Noyes St.
5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.


### 주소를 Google Geocoding API로 위도·경도로 변환한 뒤 folium 지도에 표시
### Google API 키(Geocoding API)를 발급받아야 함
### Git에 올릴 때 키가 노출되지 않도록 주의

In [34]:
import folium
import googlemaps
import numpy as np

In [35]:
import os

# Read the API key from the GOOGLE_MAPS_API_KEY environment variable so the
# real key never has to be written into the notebook (and so never reaches Git).
# The fallback is the redacted placeholder that was committed here, which is
# not a real key.
gmaps_key = os.environ.get("GOOGLE_MAPS_API_KEY", "**************************")
gmaps = googlemaps.Client(key=gmaps_key)

In [36]:
# Parallel lists of coordinates, one entry per row of df (NaN where unknown).
lat = []
lng = []

for street_address in tqdm(df['Address']):
    # Entries without a single street address cannot be geocoded.
    if street_address == 'Multiple locations':
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    # Append the city so the street address is looked up in Chicago.
    target_name = street_address + ', ' + 'Chicago'
    gmaps_output = gmaps.geocode(target_name)
    # Short pause between API calls.
    time.sleep(0.5)

    # An empty result would make [0] raise IndexError and abort the whole
    # loop; record the row as missing instead so the lists stay aligned with df.
    if not gmaps_output:
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    location = gmaps_output[0]['geometry']['location']
    lat.append(location['lat'])
    lng.append(location['lng'])

  0%|          | 0/50 [00:00<?, ?it/s]

In [37]:
# Sanity check: one latitude and one longitude per row.
print(*(len(values) for values in (lat, lng)))

50 50


In [38]:
# Inspect the 'geometry' entry of a geocoding result; it is where the
# latitude and longitude come from.
# The index now holds the rank strings, so the first row is selected by
# position with .iloc; an integer key on a string-labelled Series relies on a
# deprecated positional fallback.
target_name = df['Address'].iloc[0] + ', ' + 'Chicago'
gmaps_output = gmaps.geocode(target_name)

print(gmaps_output[0]['geometry'])

{'bounds': {'northeast': {'lat': 41.8957463, 'lng': -87.6798563}, 'southwest': {'lat': 41.8954846, 'lng': -87.6800603}}, 'location': {'lat': 41.8955577, 'lng': -87.6799673}, 'location_type': 'ROOFTOP', 'viewport': {'northeast': {'lat': 41.89702008029149, 'lng': -87.67860931970849}, 'southwest': {'lat': 41.8943221197085, 'lng': -87.6813072802915}}}


In [39]:
# Attach the geocoded coordinates as new columns, then preview the result.
for column, values in (('lat', lat), ('lng', lng)):
    df[column] = values
df.head()

Unnamed: 0_level_0,Cafe,Menu,Price,Address,lat,lng
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,41.895558,-87.679967
2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,41.884639,-87.64759
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,41.890523,-87.630783
4,Al’s Deli,Roast Beef,$9.40,914 Noyes St.,41.878114,-87.629798
5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,41.886604,-87.648536


In [40]:
# Centre of all geocoded sandwiches; Series.mean() skips the NaN rows by default.
center = [df['lat'].mean(), df['lng'].mean()]

# Named center_map rather than `map` so the builtin map() is not shadowed.
center_map = folium.Map(location=center, zoom_start=11)
folium.Marker(center, popup='center').add_to(center_map)
center_map

In [41]:
# Named sandwich_map rather than `map` so the builtin map() is not shadowed.
sandwich_map = folium.Map(location=[df['lat'].mean(),
                                    df['lng'].mean()],
                          zoom_start=11)

# Rows without coordinates (e.g. 'Multiple locations') cannot be placed on the map.
located = df.dropna(subset=['lat', 'lng'])

# One marker per geocoded cafe, with the cafe name as popup text.
for cafe, cafe_lat, cafe_lng in zip(located['Cafe'], located['lat'], located['lng']):
    folium.Marker([cafe_lat, cafe_lng], popup=cafe).add_to(sandwich_map)

sandwich_map