# 03. Web Data

## 1. BeautifulSoup for web data

---

## BeautifulSoup Basic
- install
```
    - conda install -c anaconda beautifulsoup4
    - pip install beautifulsoup4

```

- data
    - 03.test_first.html

In [1]:
# import
from bs4 import BeautifulSoup

In [2]:
page = open('../data/03.zerobase.html', 'r').read()
soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Very Simple HTML Code by Jqdjhy
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy ZeroBase.
    <a href="http://www.pinkwink.kr" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science.
    <a href="https://www.python.org" id="py-link" target="_blink">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is funny.
   </b>
  </p>
  <p class="outer-text">
   <i>
    All I need is Love.
   </i>
  </p>
 </body>
</html>


In [3]:
# head 태그 확인
soup.head

<head>
<title>Very Simple HTML Code by Jqdjhy</title>
</head>

In [4]:
# body 태그 확인
soup.body

<body>
<div>
<p class="inner-text first-item" id="first">
                Happy ZeroBase.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link" target="_blink">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>Data Science is funny.</b>
</p>
<p class="outer-text">
<i>All I need is Love.</i>
</p>
</body>

In [5]:
# p 태그 확인
# 처음 발견한 p 태그만 출력
soup.find("p")

<p class="inner-text first-item" id="first">
                Happy ZeroBase.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>

In [6]:
# find_all()
soup.find_all("p")

[<p class="inner-text first-item" id="first">
                 Happy ZeroBase.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link" target="_blink">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>Data Science is funny.</b>
 </p>,
 <p class="outer-text">
 <i>All I need is Love.</i>
 </p>]

In [7]:
# 파이썬 예약어
# class, id, def, list, str, int, tuple ...

In [8]:
soup.find("p", class_="inner-text first-item")

<p class="inner-text first-item" id="first">
                Happy ZeroBase.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>

In [9]:
soup.find('p',{'class' : 'outer-text first-item'}).text.strip()

'Data Science is funny.'

In [10]:
# 다중 조건
soup.find('p', {'class':'inner-text first-item', 'id' :'first'})

<p class="inner-text first-item" id="first">
                Happy ZeroBase.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>

In [11]:
# find_all() : 여러개의 태그를 반환
# list 형태로 반환한다.

soup.find_all('p')

[<p class="inner-text first-item" id="first">
                 Happy ZeroBase.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link" target="_blink">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>Data Science is funny.</b>
 </p>,
 <p class="outer-text">
 <i>All I need is Love.</i>
 </p>]

In [12]:
# 특정 태그 확인, list형태이기에 오프셋인덱스를 사용해야한다.
soup.find_all(id='pw-link')[0].text

'PinkWink'

In [13]:
soup.find_all('p', class_='inner-text second-item')

[<p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link" target="_blink">Python</a>
 </p>]

In [14]:
print(soup.find_all('p')[0].text)
print(soup.find_all('p')[1].string)
print(soup.find_all('p')[1].get_text())


                Happy ZeroBase.
                PinkWink

None

                Happy Data Science.
                Python



In [15]:
# p 태그 리스트에서 텍스트 속성만 출력

for each_tag in soup.find_all("p"):
    print("="*50)
    print(each_tag.text)


                Happy ZeroBase.
                PinkWink


                Happy Data Science.
                Python


Data Science is funny.


All I need is Love.



In [16]:
# a 태그에서 href 속성값에 있는 값(링크) 추출

links = soup.find_all("a")
links[0].get('href'), links[1]['href']

('http://www.pinkwink.kr', 'https://www.python.org')

In [17]:
for each in links:
    href = each.get('href') # each['href']
    text = each.get_text()
    print(text, '->', href)

PinkWink -> http://www.pinkwink.kr
Python -> https://www.python.org


# BeautifulSoup 예제 1-1 - 네이버금융

In [18]:
# import
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [19]:
url = "https://finance.naver.com/marketindex"
page = urlopen(url)
# page 말고 res, response 등의 변수명도 자주 사용한다.
# page.status
# HTTP 상태 코드 확인 : 2XX 는 정상
soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())

<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/jindo.1.5.3.element-text-patch.js" type="text/javascript">
</script>
<div id="container" style="padding-bottom:0px;">
 <div class="market_include">
  <div class="market_data">
   <div class="market1">
    <div class="title">
     <h2 class="h_market1">
      <span>
       환전 고시 환율
      </span>
     </h2>
    </div>
    <!-- data -->
    <div class="data">
     <ul class="data_lst" id="exchangeList">
      <li class="on">
       <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
        <h3 class="h_lst">
         <span class="blind">
          미국 U

In [20]:
# 1
soup.find_all('span', 'value')

[<span class="value">1,187.00</span>,
 <span class="value">1,067.78</span>,
 <span class="value">1,375.26</span>,
 <span class="value">184.41</span>,
 <span class="value">111.0600</span>,
 <span class="value">1.1593</span>,
 <span class="value">1.3545</span>,
 <span class="value">94.0400</span>,
 <span class="value">75.88</span>,
 <span class="value">1650.1</span>,
 <span class="value">1757.0</span>,
 <span class="value">66873.16</span>]

In [21]:
# 2
soup.find_all('span', class_ ='value')

[<span class="value">1,185.20</span>,
 <span class="value">1,058.50</span>,
 <span class="value">1,375.19</span>,
 <span class="value">183.08</span>,
 <span class="value">111.9000</span>,
 <span class="value">1.1615</span>,
 <span class="value">1.3425</span>,
 <span class="value">94.3500</span>,
 <span class="value">74.83</span>,
 <span class="value">1645.39</span>,
 <span class="value">1721.5</span>,
 <span class="value">66004.96</span>]

In [22]:
# 3
soup.find_all('span', {'class' : 'value'})

[<span class="value">1,185.20</span>,
 <span class="value">1,058.50</span>,
 <span class="value">1,375.19</span>,
 <span class="value">183.08</span>,
 <span class="value">111.9000</span>,
 <span class="value">1.1615</span>,
 <span class="value">1.3425</span>,
 <span class="value">94.3500</span>,
 <span class="value">74.83</span>,
 <span class="value">1645.39</span>,
 <span class="value">1721.5</span>,
 <span class="value">66004.96</span>]

## BeautifulSoup 예제1-2 - 네이버금융
- !pip install requests
- find, find_all
- select, select_one
- find, select_one : 단일 선택
- select, find_all : 다중 선택

In [21]:
import requests
# from urllib.request.Request 둘 중에 편한거 사용!
from bs4 import BeautifulSoup

In [23]:
url = 'https://finance.naver.com/marketindex/'
response = requests.get(url)
# requests.get(), request.post()
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify())

<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20210916165954/js/jindo.1.5.3.element-text-patch.js" type="text/javascript">
</script>
<div id="container" style="padding-bottom:0px;">
 <div class="market_include">
  <div class="market_data">
   <div class="market1">
    <div class="title">
     <h2 class="h_market1">
      <span>
       환전 고시 환율
      </span>
     </h2>
    </div>
    <!-- data -->
    <div class="data">
     <ul class="data_lst" id="exchangeList">
      <li class="on">
       <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
        <h3 class="h_lst">
         <span class="blind">
          미국 U

In [30]:
# exchangeList =  soup.find_all('li', 'on')
# id => #
# class => .
exchangeList = soup.select('#exchangeList > li')
# exchangeList = soup.find_all( id = 'exchangeList',  )
len(exchangeList), exchangeList

(1,
 [<ul class="data_lst" id="exchangeList">
  <li class="on">
  <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
  <h3 class="h_lst"><span class="blind">미국 USD</span></h3>
  <div class="head_info point_up">
  <span class="value">1,187.00</span>
  <span class="txt_krw"><span class="blind">원</span></span>
  <span class="change">1.00</span>
  <span class="blind">상승</span>
  </div>
  </a>
  <a class="graph_img" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdc', '', '', event);">
  <img alt="" height="153" src="https://ssl.pstatic.net/imgfinance/chart/marketindex/FX_USDKRW.png" width="295"/>
  </a>
  <div class="graph_info">
  <span class="time">2021.10.01 20:03</span>
  <span class="source">하나은행 기준</span>
  <span class="count">고시회차<span class="num">289</span>회</span>
  </div>
  </li>
  <li class="">
  <a class="head jpy" href="/marketindex/exchangeDeta

In [35]:
title = exchangeList[0].select_one(".h_lst").text
exchange = exchangeList[0].select_one(".value").text
change = exchangeList[0].select_one(".change").text
updown = exchangeList[0].select_one("div > .blind").text
# > 가 의미하는 바는, 해당 태그의 '바로 하위'이기에, 적절히 선택할 수 있다.
# 띄어쓰기가 있으면, 클래스 속성값이 2개 있다고 판단해야한다.
# link = exchangeList[0].select_one().text
title, exchange, change, updown

('미국 USD', '1,187.00', '1.00', '상승')

In [39]:
findMethod = soup.find_all('ul', id = 'exchangeList')
# findMethod = findMethod.find_all('li')
findMethod

[<ul class="data_lst" id="exchangeList">
 <li class="on">
 <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
 <h3 class="h_lst"><span class="blind">미국 USD</span></h3>
 <div class="head_info point_up">
 <span class="value">1,187.00</span>
 <span class="txt_krw"><span class="blind">원</span></span>
 <span class="change">1.00</span>
 <span class="blind">상승</span>
 </div>
 </a>
 <a class="graph_img" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdc', '', '', event);">
 <img alt="" height="153" src="https://ssl.pstatic.net/imgfinance/chart/marketindex/FX_USDKRW.png" width="295"/>
 </a>
 <div class="graph_info">
 <span class="time">2021.10.01 20:03</span>
 <span class="source">하나은행 기준</span>
 <span class="count">고시회차<span class="num">289</span>회</span>
 </div>
 </li>
 <li class="">
 <a class="head jpy" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_

In [None]:
baseUrl = 'https://finance.naver.com'
baseUrl + exchangeList[0].select_one('a').get('href')

'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW'

In [None]:
# 4개 데이터 수집
exchange_datas = []
baseUrl = 'https://finance.naver.com'

for item in exchangeList:
    data = {
        'title' : item.select_one('.h_lst').text,
        'exchange' : item.select_one('.value').text,
        'change' : item.select_one('.change').text,
        'updown' : item.select_one('.head_info.point_dn > .blind').text,
        "link" : baseUrl + item.select_one('a').get('href')
    }
    exchange_datas.append(data)

df = pd.DataFrame(exchange_datas)
df.to_excel('./naverfinance.xlsx',encoding='utf-8')

# BeautifulSoup 예제2 - 위키백과 문서 정보 가져오기

In [40]:
from urllib.request import urlopen, Request
import urllib
from bs4 import BeautifulSoup

html = 'https://ko.wikipedia.org/wiki/{search_words}'
req = Request(html.format(search_words=urllib.parse.quote("여명의 눈동자"))) # 글자를 URL로 인코딩
response = urlopen(req)
print(response.status)
soup = BeautifulSoup(response, 'html.parser')
print(soup.prettify())

SyntaxError: invalid syntax (<ipython-input-40-8ef9d64877d4>, line 9)

In [28]:
n = 0
for each in soup.find_all('ul'):
    print("=>", n)
    print(each.get_text())
    n +=1

=> 0


계정 만들기


=> 1
토론기여
=> 2
대문최근 바뀜요즘 화제임의의 문서로기부
=> 3
사랑방사용자 모임관리 요청
=> 4
도움말정책과 지침질문방
=> 5
여기를 가리키는 문서가리키는 글의 최근 바뀜파일 올리기특수 문서 목록고유 링크문서 정보이 문서 인용하기위키데이터 항목
=> 6
책 만들기PDF로 다운로드인쇄용 판
=> 7
문서토론
=> 8

=> 9
읽기편집역사 보기
=> 10

=> 11
EnglishفارسیKreyòl ayisyenMagyarBahasa IndonesiaItaliano日本語PolskiPortuguês吴语中文
=> 12
소설《여명의 눈동자》
=> 13

1 개요
2 등장 인물

2.1 주요 인물
2.2 여옥의 주변 인물
2.3 하림의 주변 인물
2.4 그 외


3 제작진
4 시청률
5 본방송 편성 변경
6 재방송 결방 사유 및 편성 변경
7 수상 경력
8 OST
9 참고 사항
10 고증 오류
11 주해
12 각주
13 외부 링크

=> 14

2.1 주요 인물
2.2 여옥의 주변 인물
2.3 하림의 주변 인물
2.4 그 외

=> 15
채시라 : 윤여옥 역 (아역: 김민정)
박상원 : 장하림(하리모토 나츠오) 역 (아역: 김태진)
최재성 : 최대치(사카이) 역 (아역: 장덕수)
=> 16
최불암 : 윤홍철 역 - 윤여옥의 아버지
한차돌 : 최대운 역 - 최대치와 윤여옥의 아들
오연수 : 봉순 역
=> 17
김소원 : 장하림의 어머니 역
김동현 : 장경림 역 - 장하림의 형 (아역: 이민우)
안해숙 : 장경림의 아내 역
=> 18
박근형 : 최두일(스즈키) 역
이정길 : 김기문 역
장항선 : 오오에 오장 역
박인환 : 구보다 일병 역
임현식 : 황성철 역
홍승옥 : 성철 처 역
김흥기 : 미다 요시노리 대위 역
고현정 : 안명지 역
최현미 : 이경애 역
심양홍 : 박창석 변호사 역
남성훈 : 백인수 역
이창환 : 이승만 대통령 역
민지환 : 관동군 731부대장 이시이 시로 중장 역
김기주 : 일본군 15군 사령관 무다구치

In [29]:
soup.find_all("ul")[15].text.strip().replace('\xa0',' ').replace('\n', '')

'채시라 : 윤여옥 역 (아역: 김민정)박상원 : 장하림(하리모토 나츠오) 역 (아역: 김태진)최재성 : 최대치(사카이) 역 (아역: 장덕수)'

## Python List 데이터형

- list형은 대괄호로 생성


---

# 2. 시카고 맛집 데이터 분석 - 개요
- https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/

```
최종 목표
총 51개 페이지에서 각 가게의 정보를 가져온다
- 가게이름
- 대표메뉴
- 가격
- 가게주소
```

# 3. 시카고 맛집 데이터 분석 - 메인페이지

In [30]:
# !pip install fake-useragent
import ssl
from urllib.request import Request, urlopen 
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
context = ssl._create_unverified_context() 
url_base = "https://www.chicagomag.com/"
url_sub = "Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub  
ua = UserAgent()
req = Request(url, headers={"user-agent": ua.ie})
html = urlopen(req,context=context)
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())


<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible">
   <link href="https://gmpg.org/xfn/11" rel="profile"/>
   <title>
    The 50 Best Sandwiches in Chicago – Chicago Magazine
   </title>
   <style type="text/css">
    .heateorSssInstagramBackground{background:radial-gradient(circle at 30% 107%,#fdf497 0,#fdf497 5%,#fd5949 45%,#d6249f 60%,#285aeb 90%)}
						div.heateor_sss_horizontal_sharing i.heateorSssInstagramBackground{background:#000!important;}div.heateor_sss_standard_follow_icons_container i.heateorSssInstagramBackground{background:#000;}
										.heateor_sss_horizontal_sharing .heateorSssSharing,.heateor_sss_standard_follow_icons_container .heateorSssSharing{
							background-color: #000;
							color: #fff;
						border-width: 0px;
			border-style: solid;
			border-color: transparent;
		}
				.heateor_sss_horizontal_sharing .heateorSssTCBackground{
			color:#666;
		}
				.heateor_sss_horizontal_shari

In [31]:
len(soup.find_all("div", class_='sammy'))
# len(soup.select(".sammy"))

50

In [32]:
tmp_one = soup.find_all("div",class_='sammy')[0]
tmp_one, type(tmp_one)
# bs4.element.Tag -> find 사용가능

(<div class="sammy" style="position: relative;">
 <div class="sammyRank">1</div>
 <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
 Old Oak Tap<br/>
 <em>Read more</em> </a></div>
 </div>,
 bs4.element.Tag)

In [33]:
# Rank
tmp_one.find(class_='sammyRank').get_text()

'1'

In [34]:
# Menu, Cafe
tmp_one.find('div', class_= 'sammyListing').get_text()

'BLT\nOld Oak Tap\nRead more '

In [35]:
# Link
tmp_one.find('a')['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [36]:
import re # Regular Expression

tmp_string = tmp_one.find(class_='sammyListing').get_text()
re.split("\n|\r\n", tmp_string)

['BLT', 'Old Oak Tap', 'Read more ']

In [37]:
print(re.split("\n|\r\n", tmp_string)[0])
print(re.split("\n|\r\n", tmp_string)[1])

BLT
Old Oak Tap


In [42]:
from urllib.parse import urljoin

url_base = url_base

# 필요한 내용을 담을 빈 리스트
# 리스트로 하나씩 컬럼을 만들고, DataFrame으로 합칠 예정
rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all('div', 'sammy') 

for item in list_soup:
    # Rank
    rank.append(item.find(class_='sammyRank').get_text())
    # Main_menu, Cafe_name
    tmp_string = item.find(class_='sammyListing').get_text()
    main_menu.append(re.split("\n|\r\n", tmp_string)[0])
    cafe_name.append(re.split("\n|\r\n", tmp_string)[1])
    # Link
    url_add.append(urljoin(url_base, item.find("a")['href'])) # 절대경로면 안붙이고, 상대경로면 붙여주는
    




In [43]:
len(rank), len(main_menu), len(cafe_name), len(url_add) 

(50, 50, 50, 50)

In [44]:
import pandas as pd
data = {
    "Rank" : rank,
    "Menu" : main_menu,
    "Cafe" : cafe_name,
    "URL" : url_add,
}

df = pd.DataFrame(data)
df.tail(2)

Unnamed: 0,Rank,Menu,Cafe,URL
48,49,Le Végétarien,Toni Patisserie,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,The Gatsby,Phoebe’s Bakery,https://www.chicagomag.com/Chicago-Magazine/No...


In [45]:
# 컬럼 순서 변경

df = pd.DataFrame(data, columns=['Rank','Cafe','Menu','URL'])
df.tail()

Unnamed: 0,Rank,Cafe,Menu,URL
45,46,Chickpea,Kufta,https://www.chicagomag.com/Chicago-Magazine/No...
46,47,The Goddess and Grocer,Debbie’s Egg Salad,https://www.chicagomag.com/Chicago-Magazine/No...
47,48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [46]:
# 데이터 저장
df.to_csv("../data/03. best_sandwiches_list_chicago.csv",sep=',', encoding='utf-8')

# 4. 시카고 맛집 데이터 분석 - 하위페이지

In [47]:
# !pip install fake-useragent
import ssl
from urllib.request import Request, urlopen 
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
context = ssl._create_unverified_context() 
url_base = "https://www.chicagomag.com/"
url_sub = "Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub  
ua = UserAgent()
req = Request(url, headers={"user-agent": ua.ie})
html = urlopen(req,context=context)
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible">
   <link href="https://gmpg.org/xfn/11" rel="profile"/>
   <title>
    The 50 Best Sandwiches in Chicago – Chicago Magazine
   </title>
   <style type="text/css">
    .heateorSssInstagramBackground{background:radial-gradient(circle at 30% 107%,#fdf497 0,#fdf497 5%,#fd5949 45%,#d6249f 60%,#285aeb 90%)}
						div.heateor_sss_horizontal_sharing i.heateorSssInstagramBackground{background:#000!important;}div.heateor_sss_standard_follow_icons_container i.heateorSssInstagramBackground{background:#000;}
										.heateor_sss_horizontal_sharing .heateorSssSharing,.heateor_sss_standard_follow_icons_container .heateorSssSharing{
							background-color: #000;
							color: #fff;
						border-width: 0px;
			border-style: solid;
			border-color: transparent;
		}
				.heateor_sss_horizontal_sharing .heateorSssTCBackground{
			color:#666;
		}
				.heateor_sss_horizontal_shari

In [48]:
# requirements
import ssl
import pandas as pd #import를 가장 첫줄에
from urllib.request import urlopen, Request #from 절은 그 밑에
from fake_useragent import UserAgent
from bs4 import BeautifulSoup


In [49]:
df = pd.read_csv('../data/03. best_sandwiches_list_chicago.csv', index_col = 0)
df.tail()

Unnamed: 0,Rank,Cafe,Menu,URL
45,46,Chickpea,Kufta,https://www.chicagomag.com/Chicago-Magazine/No...
46,47,The Goddess and Grocer,Debbie’s Egg Salad,https://www.chicagomag.com/Chicago-Magazine/No...
47,48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [50]:
df['URL'][0]

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [51]:
ua = UserAgent()
context = ssl._create_unverified_context() 

req = Request(df["URL"][0], headers = {"user-agent" : ua.ie})
html = urlopen(req, context=context).read()
soup_tmp = BeautifulSoup(html, 'html.parser')
soup_tmp.find("p", class_='addy')

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

In [52]:
# regular expression
price_tmp = soup_tmp.find("p", class_='addy').text
price_tmp

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [53]:
import re
re.split(".,", price_tmp)[0]

'\n$10. 2109 W. Chicago Ave'

In [54]:
price_tmp = re.split(".,", price_tmp)[0]
price_tmp

'\n$10. 2109 W. Chicago Ave'

In [55]:
tmp = re.search("\$\d+\.(\d+)?", price_tmp).group()
price_tmp[len(tmp)+2:]

'2109 W. Chicago Ave'

## for문으로 추출하기!

In [56]:
from tqdm import tqdm

price = []
address = []

for idx, rows in tqdm(df.iterrows()):
    req = Request(rows["URL"], headers = {"user-agent" : ua.ie})
    html = urlopen(req, context=context).read()
    soup_tmp = BeautifulSoup(html, 'html.parser')
    gettings = soup_tmp.find("p", class_='addy').get_text()
    price_tmp = re.split('.,', gettings)[0]
    tmp = re.search("\$\d+\.(\d+)?", price_tmp).group()
    price.append(tmp)
    address.append(price_tmp[len(tmp)+2:])
    # print(idx)



50it [00:53,  1.07s/it]


In [57]:
len(price), len(address)

(50, 50)

In [58]:
price[:5], address[:5]

(['$10.', '$9.', '$9.50', '$9.40', '$10.'],
 ['2109 W. Chicago Ave',
  '800 W. Randolph St',
  ' 445 N. Clark St',
  ' 914 Noyes St',
  '825 W. Fulton Mkt'])

In [59]:
df.tail(2)

Unnamed: 0,Rank,Cafe,Menu,URL
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [60]:
df['Price'] = price
df['Address'] = address
del df['URL']
df = df.set_index('Rank')
df.tail()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
46,Chickpea,Kufta,$8.,2018 W. Chicago Ave
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,25 E. Delaware Pl
48,Zenwich,Beef Curry,$7.50,416 N. York St
49,Toni Patisserie,Le Végétarien,$8.75,65 E. Washington St
50,Phoebe’s Bakery,The Gatsby,$6.85,3351 N. Broadwa


In [61]:
df.to_csv(
    '../data/03. best_sandwiches_list_chicago2.csv', sep=',', encoding='utf-8'
)

In [62]:
pd.read_csv('../data/03. best_sandwiches_list_chicago2.csv', index_col=0).head()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10.,2109 W. Chicago Ave
2,Au Cheval,Fried Bologna,$9.,800 W. Randolph St
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St
4,Al’s Deli,Roast Beef,$9.40,914 Noyes St
5,Publican Quality Meats,PB&L,$10.,825 W. Fulton Mkt


# 5. 시카고 맛집 데이터 지도 시각화

In [68]:
# requirements

import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm

In [69]:
df = pd.read_csv("../data/03. best_sandwiches_list_chicago2.csv", index_col=0)
df.tail()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
46,Chickpea,Kufta,$8.,2018 W. Chicago Ave
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,25 E. Delaware Pl
48,Zenwich,Beef Curry,$7.50,416 N. York St
49,Toni Patisserie,Le Végétarien,$8.75,65 E. Washington St
50,Phoebe’s Bakery,The Gatsby,$6.85,3351 N. Broadwa


In [73]:
gmaps_key = 'AIzaSyDkVUBTQ0pMz3ikmlto4WS3vX05L1dJBhU'
gmaps = googlemaps.Client(key=gmaps_key)

In [77]:
lat = []
lng = []

for idx, row in tqdm(df.iterrows()):
    if not row['Address'] == "Multiple location":
        target_name = row['Address'] + ", " + "Chicago"
        gmaps_output = gmaps.geocode(target_name)
        location_output = gmaps_output[0].get("geometry")
        lat.append(location_output["location"]['lat'])
        lng.append(location_output["location"]['lng'])
    else :
        lat.append(np.nan)
        lng.append(np.nan)

50it [00:04, 12.17it/s]


In [78]:
len(lat), len(lng)

(50, 50)

In [81]:
df['lat']= lat
df['lng']= lng
df.tail()

Unnamed: 0_level_0,Cafe,Menu,Price,Address,lat,lng
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46,Chickpea,Kufta,$8.,2018 W. Chicago Ave,41.896113,-87.677857
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,25 E. Delaware Pl,41.898979,-87.627393
48,Zenwich,Beef Curry,$7.50,416 N. York St,41.910583,-87.940488
49,Toni Patisserie,Le Végétarien,$8.75,65 E. Washington St,41.883106,-87.625438
50,Phoebe’s Bakery,The Gatsby,$6.85,3351 N. Broadwa,41.943163,-87.644507


In [87]:
mapping = folium.Map(location = [41.8781136, -87.6297982], zoom_start=11)

for idx, row in df.iterrows():
    if not row["Address"] == 'Multiple location':
        folium.Marker(
            location = [row['lat'], row['lng']],
            popup = row['Cafe'],
            tooltip=row['Menu'],
            icon= folium.Icon(
                icon = 'coffee',
                prefix = 'fa'
            )
        ).add_to(mapping)
mapping