In [54]:
import os
from os.path import exists

import requests  
from bs4 import BeautifulSoup 

from urllib.request import urlopen

import pandas as pd
import re

# <font color='#da5e3c'>웹 크롤링</font>

## <font color='#e87a54'>웹 페이지에서 필요한 정보 파싱</font>
- The 50 Best Sandwiches in Chicago
- 메인페이지의 TOP50 리스트 정보 가져오기
- 각각에 연결된 상세 정보 가져오기

> - 과제1 : 메인페이지 정보 크롤링 - 랭킹, 카페명, 메뉴명, 상세페이지링커
> - 과제2 : 상세페이지 정보 크롤링 - 가격, 주소, 전화번호, 홈페이지정보
> - 과제3 : 과제1, 과제2 정보를 모두 포함하여 파일로 저장하기

### <font color='#fc9d6f'>과제1: 메인페이지 정보 크롤링</font>

- 랭킹
- 카페명
- 메뉴명
- 상세페이지 링크

### 하나의 데이터(1위, BLT)로 구축

In [4]:
url = 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'

html = urlopen(url)
soup = BeautifulSoup(html, "lxml")

    # 데이터가 잘 불러졌는지 title로 확인
soup.title

<title>
  The 50 Best Sandwiches in Chicago |
  Chicago magazine
      |  November 2012
    </title>

In [11]:
# 모든 샌드위치 목록을 포함하는 div(class = content post)
# 한 개의 샌드위치 정보를 가진  div(class = sammy)

content_tag = soup.find('div', 'content post')
sammy_tags  = content_tag.find_all('div', 'sammy')

len(sammy_tags)  # The "50" Best Sandwiches in Chicago  -> length: 50

50

In [14]:
# div "sammy" 안에서 개별적으로 가져올 수 있는 데이터 확인

sammy = sammy_tags[0]

sammy

    # sammyRank      -> Ranking
    # sammyListing   ->  Menu & Cafe, Link

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>

In [42]:
# div "sammyRank"를 통해 랭킹 불러오기(int형으로 변환)

tmp = sammy.find('div', 'sammyRank')
rank = int(tmp.get_text().strip())

rank

1

In [58]:
# <div "sammyListing"> / <a - 메뉴, 카페, 더보기> / <b - 메뉴>

listing = sammy.find('div', 'sammyListing')
tmp = listing.find('a')

# ------------------------------------------------------------ #
    # menu: <b> 태그 text
menu = (tmp.find('b')).get_text().strip()

# ------------------------------------------------------------ #
    # link = url + tmp(<a> 태그) 안의 href 값
link = "http://www.chicagomag.com" + tmp['href']

# ------------------------------------------------------------ #
    # cafe: <a> 태그 안의 값을 가져온 다음, 
    # 불필요한 데이터(문자열, menu) 제거
cafe = tmp.get_text().strip()

cafe = cafe.replace("\r", "")
cafe = cafe.replace("\n", "")
cafe = cafe.replace("Read more", "")
cafe = cafe.replace(menu, "")

cafe = cafe.strip()
# ------------------------------------------------------------ #

print(menu)
print(cafe)
print(link)

BLT
Old Oak Tap
http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/


### 모듈화 하여 50개 샌드위치 데이터 저장

In [218]:
def crawl_ChiSdw():
    Rank = []; Menu = []
    Cafe = []; Link = []
    
    url = 'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    
    content_tag = soup.find('div', 'content post')
    sammy_tags  = content_tag.find_all('div', 'sammy')
    
    
    for sammy in sammy_tags:

        tmp = sammy.find('div', 'sammyRank')
        rank = int(tmp.get_text().strip())

        listing = sammy.find('div', 'sammyListing')
        tmp = listing.find('a')
        
        # 메뉴 ======================================== #
        menu = (tmp.find('b')).get_text().strip()
        
        # 링크 ======================================== #
        link = tmp['href']
        link = link.replace('http://www.chicagomag.com', '')
        link = "http://www.chicagomag.com" + link
        
        # 카페 ======================================== #
        cafe = tmp.get_text().strip()
        
        cafe = cafe.replace("\r", "")
        cafe = cafe.replace("\n", "")
        cafe = cafe.replace("Read more", "")
        cafe = cafe.replace(menu, "")
        
        cafe = cafe.strip()
        
        
        Rank.append(rank); Cafe.append(cafe)
        Menu.append(menu); Link.append(link)
    
    data = {'순위'  : Rank, '카페명': Cafe,
            '메뉴명': Menu, '링크'  : Link }
    
    returnDF = pd.DataFrame(data)
    
    return returnDF

In [220]:
df = crawl_ChiSdw()
df.set_index('순위', inplace = True)

df.head(10)

Unnamed: 0_level_0,카페명,메뉴명,링크
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
7,Acadia,Lobster Roll,http://www.chicagomag.com/Chicago-Magazine/Nov...
8,Birchwood Kitchen,Smoked Salmon Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
9,Cemitas Puebla,Atomica Cemitas,http://www.chicagomag.com/Chicago-Magazine/Nov...
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [61]:
# 1차 저장
df.to_csv('data/chicago_sandwiches_2012.csv', sep = ',', encoding = 'UTF-8')

### <font color='#fc9d6f'>과제2: 상세페이지 정보 크롤링</font>

- 가격
- 주소
- 전화번호
- 홈페이지 정보

In [228]:
url_1 = df['링크'][6]

url_1

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Hendrickx-Belgian-Bread-Crafter-Belgian-Chicken-Curry-Salad/'

In [229]:
html = urlopen(url_1)
soup = BeautifulSoup(html, 'lxml')

soup.title

<title>
  6. Hendrickx Belgian Bread Crafter Belgian Chicken Curry Salad |
  Chicago magazine
      |  November 2012
    </title>

In [230]:
# <p "addy" - 가격, 주소, 홈페이지 링크> / <a - 홈페이지 링크>

p_tag = soup.find('p', 'addy')

addy = p_tag.get_text().strip()

    # 홈페이지 링크
    
if p_tag.find('a') is not None:
    tmp = p_tag.find('a')
    homepage = tmp['href']
else:
    homepage = "-"

homepage

'-'

# 주소 정규식 모르겠어요ㅠㅠ

In [231]:
# price : '$00.00'

re_price = re.search('\$\d{1,2}\.\d{0,2}[ .]?', addy)
if re_price is not None:
     price = re_price.group().strip()
else: 
     pass

In [232]:
# phone : 000-000-0000

re_phone = re.search('\d{3}[ -.]?\d{3}[ -.]?\d{4}', addy)
if re_phone is not None:
     phone = re_phone.group()
else:
     pass

In [233]:
# addr: replace(price, ''); replace(phone, ''); replace(removal, '')

    # <p "addy"> 에서 홈페이지 제거 용도
removal = homepage.replace('http://', '')
removal = removal.replace('www.', '')
removal = removal.replace('/', '').strip()
      
    # re_addr: addy에서 주소, 금액, 홈페이지 삭제
re_addr = addy.replace(phone, '')
re_addr = re_addr.replace(price, '')
re_addr = re_addr.replace(removal, '')

if re_addr is not None:
    addr = re_addr.replace(',', '').strip()
else:
    pass

In [234]:
phone

'312-649-6717'

In [235]:
addr

'100 E. Walton St.'

In [236]:
price

'$7.25.'

In [237]:
homepage

'-'

### 모듈화 하여 상세페이지들 합치기

In [238]:
from tqdm import tqdm_notebook

In [239]:
df

Unnamed: 0_level_0,카페명,메뉴명,링크
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
7,Acadia,Lobster Roll,http://www.chicagomag.com/Chicago-Magazine/Nov...
8,Birchwood Kitchen,Smoked Salmon Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
9,Cemitas Puebla,Atomica Cemitas,http://www.chicagomag.com/Chicago-Magazine/Nov...
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [240]:
def ChiSdw_detail(df):
    Price = []; Address = []
    Phone = []; Hompage = []
    
    for link in tqdm_notebook(df['링크']):
        html = urlopen(link)
        soup = BeautifulSoup(html, 'lxml')
        
        p_tag = soup.find('p', 'addy')

        addy = p_tag.get_text().strip()

        # 홈페이지 링크 =============================== #
        if p_tag.find('a') is not None:
            tmp = p_tag.find('a')
            homepage = tmp['href']
        else:
            homepage = "-"
        
        # price : '$00.00' ============================ #
        re_price = re.search('\$\d{1,2}\.\d{0,2}[ .]?', addy)
        if re_price is not None:
             price = re_price.group().strip()
        else: 
             pass
        
        # phone : '000-000-0000' ====================== #
        re_phone = re.search('\d{3}[ -.]?\d{3}[ -.]?\d{4}', addy)
        if re_phone is not None:
             phone = re_phone.group()
        else:
             pass
        
        # addr: 정규식으로 하고 싶은데 안 돼요 ======== #
        removal = homepage.replace('http://', '')
        removal = removal.replace('www.', '')
        removal = removal.replace('/', '').strip()

        re_addr = addy.replace(phone, '')
        re_addr = re_addr.replace(price, '')
        re_addr = re_addr.replace(removal, '')

        if re_addr is not None:
            addr = re_addr.replace(',', '').strip()
        else:
            pass
    
        Price.append(price)
        Phone.append(phone)
        Address.append(addr)
        Hompage.append(homepage)
        
    df['가격'] = Price
    df['주소'] = Address
    df['전화번호'] = Phone
    df['홈페이지'] = Hompage
    
    print('저녁도 못 먹었지만.. 결국.. 뚜쉬,, 내가해냄ㅠ')
    
    return df

In [241]:
result_df = ChiSdw_detail(df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


저녁도 못 먹었지만.. 결국.. 뚜쉬,, 내가해냄ㅠ


In [244]:
result_df

Unnamed: 0_level_0,카페명,메뉴명,링크,가격,주소,전화번호,홈페이지
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...,$10.,2109 W. Chicago Ave.,773-772-0406,http://www.theoldoaktap.com/
2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...,$9.,800 W. Randolph St.,312-929-4580,http://aucheval.tumblr.com/
3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...,$9.50.,445 N. Clark St.,312-334-3688,http://www.rickbayless.com/
4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...,$9.40.,914 Noyes St. Evanston,847-475-9400,http://alsdeli.net/
5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...,$10.,825 W. Fulton Mkt.,312-445-8977,http://publicanqualitymeats.com/
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...,$7.25.,100 E. Walton St.,312-649-6717,-
7,Acadia,Lobster Roll,http://www.chicagomag.com/Chicago-Magazine/Nov...,$16.,1639 S. Wabash Ave.,312-360-9500,http://acadiachicago.com/
8,Birchwood Kitchen,Smoked Salmon Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...,$10.,2211 W. North Ave.,773-276-2100,http://www.birchwoodkitchen.com/
9,Cemitas Puebla,Atomica Cemitas,http://www.chicagomag.com/Chicago-Magazine/Nov...,$9.,3619 W. North Ave.,773-772-8435,http://cemitaspuebla.com/
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,http://www.chicagomag.com/Chicago-Magazine/Nov...,$17.,3267 S. Halsted St.,312-929-2486,http://nanaorganic.com/


In [245]:
# 2차 저장
df.to_csv('data/chicago_sandwiches_2012_v2.csv', sep = ',', encoding = 'UTF-8')

In [253]:
class ChicagoSandwichesCrawling:
    df = pd.DataFrame()
    
    def init(self, name='시카고샌드위치먹어야됨'):
        self.name = name
        
    def get_df(self):
        return self.df
    
    def set_df(self, df):
        self.df = df
        
    def read_file(self, file_name='data/chicago_sandwiches_2012.csv'):
        self.df = pd.read_csv(file_name, encoding='UTF-8', index_col='순위')
        
    def save_file(self, file_name='data/chicago_sandwiches_2012_v2.csv'):
        self.df.to_csv(file_name, sep=',', encoding='UTF-8')
        
    def ChiSdw_detail(self):
        
        Price = []; Address = []
        Phone = []; Hompage = []

        for link in tqdm_notebook(df['링크']):
            html = urlopen(link)
            soup = BeautifulSoup(html, 'lxml')

            p_tag = soup.find('p', 'addy')

            addy = p_tag.get_text().strip()

            # 홈페이지 링크 =============================== #
            if p_tag.find('a') is not None:
                tmp = p_tag.find('a')
                homepage = tmp['href']
            else:
                homepage = "-"

            # price : '$00.00' ============================ #
            re_price = re.search('\$\d{1,2}\.\d{0,2}[ .]?', addy)
            if re_price is not None:
                 price = re_price.group().strip()
            else: 
                 pass

            # phone : '000-000-0000' ====================== #
            re_phone = re.search('\d{3}[ -.]?\d{3}[ -.]?\d{4}', addy)
            if re_phone is not None:
                 phone = re_phone.group()
            else:
                 pass

            # addr: 정규식으로 하고 싶은데 안 돼요 ======== #
            removal = homepage.replace('http://', '')
            removal = removal.replace('www.', '')
            removal = removal.replace('/', '').strip()

            re_addr = addy.replace(phone, '')
            re_addr = re_addr.replace(price, '')
            re_addr = re_addr.replace(removal, '')

            if re_addr is not None:
                addr = re_addr.replace(',', '').strip()
            else:
                pass

            Price.append(price)
            Phone.append(phone)
            Address.append(addr)
            Hompage.append(homepage)

        self.df['가격'] = Price
        self.df['주소'] = Address
        self.df['전화번호'] = Phone
        self.df['홈페이지'] = Hompage

        print('샌드위치 먹으러 시카고 가야겠네...')

    def do_start(self):
        self.read_file()
        self.ChiSdw_detail()
        self.save_file()

In [254]:
crawler = ChicagoSandwichesCrawling()
crawler.do_start()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


샌드위치 먹으러 시카고 가야겠네...


In [255]:
crawler.df.loc[1]

카페명                                           Old Oak Tap
메뉴명                                                   BLT
링크      http://www.chicagomag.com/Chicago-Magazine/Nov...
가격                                                   $10.
주소                                   2109 W. Chicago Ave.
전화번호                                         773-772-0406
홈페이지                         http://www.theoldoaktap.com/
Name: 1, dtype: object