# Crawler and Scraper

## Architecture
```
         HTTP Request
   > > > > > > > > > > > > 
  |                      |
Client                 Server
  |                      |
   < < < < < < < < < < < <
         HTTP Response
```
- The response **MUST** be sent even if Error occurs
- Proper **REQUEST HEADER** should be set manually

## Crawler vs Scraper

| | Crawler | Scraper|
| - | - | - |
| Goal | Index sites | Collect data |
| Target | All sites | Wanted sited |
| Usage | Search-site | Big-Data |

Crawler Architecture
```
          Internet
             |
     ---> Get Req  ---> Link Extractor ---> URL normalization ---> URL filter
    |        |                                                          |
    |         -----> Is not duplicate?    ----->(else) Drop             |
Work-thread                               <----------------------       |
  manager                /      |     \                          |   URL seen?
    |                   /       |      \                         |    |    |
    |               Data     Web page   URLs & relative details  |    |    |
URLs work-pool   Extraction   Content                |           |    |   Drop
  handler            |          |                    |           |    |
    |                |          |                    |           |   else
    |                |          |                    |           |    |
    |                 -->  File Repository <--->   Index <-------   URL frontier
    |                                                                 |
      <----------------------------------------------------------------
```
- The data extraction is scraper's role

## How to check whether legal or illegal
- Everything in DB are private
- Check ```robots.txt```
- Syntax(just convention)
```
# Welcom to my robots.txt file! <--- comment
User-agent: * # * is wild card
Allow: *
Disallow: /private # Except /private, can access all

User-agent: Googlebot
Allow: /?_escaped_fragment_
```
- Permission is overlapped

## Status Code

### 2xx
- Normal Code

### 3xx
- Normal and Redirection when respose

### 4xx
- Client Failure
- ```404 := Server Not Found```

### 5xx
- Server Failure
- Traffic high, etc

In [53]:
# Parsing robot.txt
from urllib.robotparser import RobotFileParser

parser = RobotFileParser(r'https://www.google.com/robots.txt')
parser.read()
for entry in parser.entries:
    print(entry.useragents)
    for rule in entry.rulelines:
        print(rule.allowance, rule.path)
    print("================")

defaultEntry = parser.default_entry
print("*")
for rule in defaultEntry.rulelines:
    print(rule.allowance, rule.path)
    print("================")

['AdsBot-Google']
False /maps/api/js/
True /maps/api/js
False /maps/api/place/js/
False /maps/api/staticmap
False /maps/api/streetview
['Twitterbot']
True /imgres
['facebookexternalhit']
True /imgres
*
False /search
True /search/about
True /search/static
True /search/howsearchworks
False /sdch
False /groups
False /index.html
False /
True /%3Fhl%3D
False /%3Fhl%3D%2A%26
True /%3Fhl%3D%2A%26gws_rd%3Dssl%24
False /%3Fhl%3D%2A%26%2A%26gws_rd%3Dssl
True /%3Fgws_rd%3Dssl%24
True /%3Fpt1%3Dtrue%24
False /imgres
False /u/
False /preferences
False /setprefs
False /default
False /m
False /m/
True /m/finance
False /wml
False /wml/
False /wml/search
False /xhtml
False /xhtml/
False /xhtml/search
False /xml
False /imode
False /imode/
False /imode/search
False /jsky
False /jsky/
False /jsky/search
False /pda
False /pda/
False /pda/search
False /sprint_xhtml
False /sprint_wml
False /pqa
False /palm
False /gwt/
False /purchases
False /local
False /local_url
False /shihui
False /shihui/
False /products

## Exercise

In [48]:
from urllib.robotparser import RobotFileParser
from requests import request, get, post
from requests.compat import urljoin, urlparse, quote, unquote
from requests.exceptions import HTTPError
from urllib.error import URLError
from time import sleep
import random

def canFetch(ua:"user-agent", path:"uri") -> bool:
    urlParser = urlparse(path)
    url = urlParser.scheme + r"://" + urlParser.netloc + r"/robots.txt"
    # OR
    url = urljoin(path, "/robots.txt") # / is absolute path, ./ is relative path
    
    try:
        robotParser =  RobotFileParser(url)
        robotParser.read()
        return robotParser.can_fetch(ua, path)
    except HTTPError as e:
        if e.status_code // 100 == 4:
            return True # robots.txt doesn't exist. So, the access is permitted
        else:
            return False
    except URLError:
        raise # Invalid URL Error

def download(url, method='GET', /, *, params={}, data={}, headers={}, maximum:"maximum trial" = 3) -> "Response or None":
    if 'user-agent' not in headers.keys():
        headers['user-agent'] = '*'
        
    if canFetch(headers['user-agent'], url) == False:
        print("Can't access it by robots.txt")
    
    res = request(method, url, params=params, data=data, headers=headers)
    
    try:
        res.raise_for_status() #Raise HTTPError if occurs
    except HTTPError as e:
        maximum -= 1
        if e.response.status_code // 100 == 5 and maximum > 0: # 5xx
            sleep(random.randint(1, 10))
            res = download(url, method, params, data, headers, maximum - 1)
        else:
            res = None
            print(e.response.status_code)
            print(e.response.reason)
            print(e.request.headers)
    return res

In [50]:
# Access robot.txt
canFetch("Yeti", "http://nasdfasdfews.naer.com/main/imagemontage")

True

In [81]:
res = download('https://www.httpbin.org/status/400', maximum=1)
res = download('https://www.google.com/search', params={"q":"나무"})
with open('나무.html', 'w') as f:
    f.writelines(res.text)

400
BAD REQUEST
{'user-agent': '*', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
Can't access it by robots.txt


In [86]:
# Mimic browser
## 01. Set user-agent
headers = {}
params = {}
data = {}

# Copy&Paste 
headers['user-agent'] = r'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
res = res = download('https://www.google.com/search', params={"q":"나무"}, headers=headers)
print(res.request.headers)

Can't access it by robots.txt
{'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


In [131]:
# Parsing the titles
import re

pattern = re.compile(r'<h3 class="LC20lb [^"]+?">([^<]+?)</h3>') #<h3 class="LC20lb MBeuO DKV0Md">나무증권</h3>
print(pattern.findall(res.text))

# Travel the hyper-links
pattern = re.compile(r'<a href="([^"]+)".*?>')
for link in pattern.finditer(res.text):
    print(link.group(1))

['나무위키', '나무위키:대문 - 나무위키', '나무 - 위키백과, 우리 모두의 백과사전', '나무증권', '나무 - 위키낱말사전', '나무 - 한국민족문화대백과사전', '재미있는 나무이야기 | 강원도청 &gt; 분야별정보 &gt; 산림', '나무증권-일상의 Shift - Google Play 앱']
/search?q=%EB%82%98%EB%AC%B4&amp;gbv=1&amp;sei=o-7EYpX3OvDF4-EPiaG-kAw
https://www.google.com/webhp?hl=ko&amp;sa=X&amp;ved=0ahUKEwjV1PWSl-P4AhXw4jgGHYmQD8IQPAgI
/search?q=%EB%82%98%EB%AC%B4&amp;source=lnms&amp;tbm=isch&amp;sa=X&amp;ved=2ahUKEwjV1PWSl-P4AhXw4jgGHYmQD8IQ_AUoAXoECAIQAw
/search?q=%EB%82%98%EB%AC%B4&amp;source=lnms&amp;tbm=vid&amp;sa=X&amp;ved=2ahUKEwjV1PWSl-P4AhXw4jgGHYmQD8IQ_AUoAnoECAIQBA
https://maps.google.com/maps?q=%EB%82%98%EB%AC%B4&amp;um=1&amp;ie=UTF-8&amp;sa=X&amp;ved=2ahUKEwjV1PWSl-P4AhXw4jgGHYmQD8IQ_AUoA3oECAIQBQ
/search?q=%EB%82%98%EB%AC%B4&amp;source=lnms&amp;tbm=shop&amp;sa=X&amp;ved=2ahUKEwjV1PWSl-P4AhXw4jgGHYmQD8IQ_AUoBHoECAIQBg
https://namu.wiki/w/%EB%82%98%EB%AC%B4
https://namu.wiki/w/%EB%B6%84%EB%A5%98:%EB%82%98%EB%AC%B4
https://namu.wiki/w/%EC%84%B8%ED%94%BC%EB%A1%9C%ED%8A%B8%EC%9D%98%2

In [137]:
# Scraping at Naver
url = r'https://search.naver.com/search.naver'
params = {'query': '맛집'}
res = download(url, params=params)

## Parsing titles
# <span class="place_bluelink OXiLu">삼겹살 전문점 대통령</span>
# <span class="lnk_tit">가평 대형빵공장 더스틸카페</span>
# 
pattern = re.compile(r'<span class=(?:"place_bluelink OXiLu"|"lnk_tit")>(.+?)</span>')
for title in pattern.finditer(res.text):
    print(title.grobup(1))

Can't access it by robots.txt
200
SNS 화제!직화삼겹 직구삼
인량 전통훠궈전문점
가평 대형빵공장 더스틸카페
루프탑과 스카이워크까지!
정원레스토랑 어반가든
유럽 풍 정통 이탈리안요리!
인스타 핫플레이스 <strong class='hl'>맛집</strong>
쿠팡 맛집
로켓와우멤버 무제한 무료배송
성공창업 장미막창
고민은 성공을 늦출뿐입니다.
익선동 맛집 살롱순라
수익률 41% 고깃집 임대장
JMT수복얼큰감자탕
맛찬들왕소금구이 양덕&amp;맛집
소문난 의정부부대찌개 고대점
삼겹살 전문점 대통령
수아당
문화식당
계모임
니르코브
라라면가
Wnch
화끈한 불맛 이선생짜글이
특허받은한우곱창전문점 곱선생
명품곶감직판 상주행운곶감농원
에노크 샌드위치 맛집
맛집 위메프


# DOM

- In memory domain object
- By **DHTML**, DOM may differ the original source

## HTML vs XML
- HTML is **NOT-WELL-FORMED**
```
<html>
    <body>
        <p>  //</p> can be missed
    </body>
</html>
```

- XML is **WELL-FORMED**
```
<?xml version=1.0>
<CAT>
    <NAME> meme </NAME>
    <AGE> 1 //</AGE> cannot be missed
</CAT>
```
- The parsing XML is **faster** than html

## DOM Tree vs CSSOM vs Render Tree

### DOM Tree
- Data Structure by HTML
- Tag, attributes, etc are used for searching it

### CSSOM
- Data Structure by CSS
- Selector is used for searching it

### Render Tree
- Data Structure by DOM Tree and CSSOM
- The final output

## BeautifulSoup
- By pulling data from XML or HTML, manage DOM

### Parser
```lxml``` is recommanded

### Searching with Selector
https://code.tutsplus.com/ko/tutorials/the-30-css-selectors-you-must-memorize--net-16048
- Can't travel **DOM tree structure** such as parent, sibling, etc
- But, easily search the exact target
- Syntax
```
.classname
#id
div a > p // ' ' represent the "a" tag corresponding the descendant of "div"
          // > represent the "p" tag corresponding the children of "a"
a[href] // tag[attr] represent the tag having the attribute
```
- Use ```bs.select[_one]```

## DHTML Scraping
- Direct JS(impossible)
- Selenium(Slow)
- **AJAX(OK!)**
- Check Fetch/XHR in Chrome Network


In [3]:
from bs4 import BeautifulSoup

html = '''
<!doctype html>
<html>
    <head></head>
    <body>
        <a href="address01">
        <a class="" href="address02">
            <div width=800>
                <p> test </p>
            </div>
        </a>
    </body>
</html>
'''

dom = BeautifulSoup(html, 'html.parser') # Default python html parser
print(dom)

print("================================")

dom = BeautifulSoup(html, 'html5lib') # Make dom same as browser
print(dom)

print("================================")
dom = BeautifulSoup(html, 'lxml') # Very fast
print(dom)


<!DOCTYPE html>

<html>
<head></head>
<body>
<a href="address01">
<a class="" href="address02">
<div width="800">
<p> test </p>
</div>
</a>
</a></body>
</html>

<!DOCTYPE html>
<html><head></head>
    <body>
        <a href="address01">
        </a><a class="" href="address02">
            <div width="800">
                <p> test </p>
            </div>
        </a>
    

</body></html>
<!DOCTYPE html>
<html>
<head></head>
<body>
<a href="address01">
</a><a class="" href="address02">
<div width="800">
<p> test </p>
</div>
</a>
</body>
</html>



In [26]:
# Parsing Naver Advertisements link with

res = download(r"https://search.naver.com/search.naver", params={"query":"나무"})
assert res != None
dom = BeautifulSoup(res.text, 'lxml')

for _ in dom.find_all("span", {"class":"lnk_tit"}):
    print(_.get_text())
    print(_.parent['href'])

Can't access it by robots.txt
한국원예종묘 농업회사법인
https://adcr.naver.com/adcr?x=KoGguLakS4AMdaCYk/0q9f///w==kBrHCRrCm5iw311iXbH2z7sDC2WAezXd3r2VtP5+w4HRq9ZeC2mjTugJUC8MR++nkTZLuQOX46OX69pnfWTcuPf40gTdioEBlRlKRCoEINJwycggoMseaP01H4c5xk7IHaP6+mOnHjCqA1e9Sg1zh9whf/FgGiBaNm2k6iyIBikEbbKFA3WZTozTeg8C56CYO1EJybvhQw5fMUV1YKw6obDazegGAfla3wp+z623TuWmwCW/nm+h0f37J12fuU8xAm/yNhIFDuCaj7mk8XYp2QZIiT/9kextp1qfDt2MOQydarGfOSuBkTwUV3v0nWmGVm9L5KSoBqz7ZgA+leuV/iLGWkgWdEk2L6UXnGdzLPcK0OVhotmbiKN4CiNqf9hMAnTo8fNUOMzOFBx7IEw7QHK4GESqHiU8lDtPTh2+v5ipiPmUPOQnLdM5Ag6lwSZdX6s6kcLRTlDHLBUIErsczhYq6hHF8aYRzhwYGDXvoNRDN9gZLCVyt0/gl736DaJVm0D79vG/AVEt6Wnmxf8paj/J245kh7D0Q0FCKOBiYhb1qMxu3Vz2rR0V8atCFj/psoczMEOHlJDWGAfhoCz1Fuy94FUnhhDdv9OcQICTpGkyVwCnJ/DSsnhPEWSuZtCfNesaZeL4JigR4hz6kG9p5z7MPTvrG8kpSB19BKdaC4kndLzHWvJ3ZOpS6ic3dT2cfwga5b0gkim8r6gyVxESwo5WBd+08QXiEQwCAaiK++Yk8VjOrUA8Fa3zfrrdYkhkdYVFIBlqyQYvgSxKZUzr2RrYaKpnJ99asrtE41SElMHDYS1JKlFdvQHV/AQP9PKmZIVUwPap70Hqzk2Da6/pu9vsO/tMbnmu3aps39LBfOad1fiRX5DdIsZAQPGeucCAfD

# Exercies

## Scrap 4 searching sites
- Titles, associated links, summary texts


### Naver

### Google

### Daum

### Stackoverflow

# Crawling

## Definition
- Web crawlers are known by a variety of names, spiders, bots or web crawlers
- Systematically browses the WWW

## Reason
- Typically for the purpose of Web indexing
  - Which site have what data
  - Search site like Google
- Specially for the purpose of finding contexts and associated data

## Apps
- Searching Site
  - Crawl all sites and collect data
  - Google
- Mirror Site
  - Crwal one site and duplicate data
  - Wikipedia
  
## Architecture
```
Access specific site ----> Collect data ----> Collect links
          |                                          |
          |                                          |
           <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<          
```
- Scrapy is popular library of python to **build a scraper**
- BeautifulSoup is popular library of python to **parsing a site**

## Strategies
- WEB is widest. So, the searching methods are carefully selected
```
State := Current site, collected data, links, times, etc
Init := (Inital site, 0, 0, 0, ...)
Goal := (*, specific amount of data, *, timeout, ...)
Action := Start | Collect | Travel | Stop | Exit
    Start := Execute the scraper in S if it is valid
    Collect := Collect useful data and links. Test whether it is new or not
    Travel := Move S2 via links
    Stop := Stop during a little time
    Exit := Exit the scraper
Transition :=
    (Start, S1) -> (Collect, S1) | (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Collect, S1) -> (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Travel, S1, S2) -> (Start, S2)
    (Stop, S1) -> (Collect, S1)
    (Exit, S1) -> Done
Action Cost := Manually defined
```
- Use various techniques like BFS, DFS, Simulated Annealing, Genetic, A* Search, IDS, etc

In [422]:
# Simple Scraper
import time
import random
from functools import wraps
from operator import lt, eq
from copy import copy, deepcopy

from dataclasses import dataclass
from abc import ABCMeta, abstractmethod

import requests
from urllib.error import URLError
from requests.exceptions import HTTPError
from urllib.robotparser import RobotFileParser


@dataclass
class State:
    site: str
    data: set
    links: set
    times: float #seconds
    
    def __init__(self, site: str, data: set = set(), links: set = set(), times:float = 0.):
        self.site = site
        self.data = data
        self.links = links
        self.times = times
    
    def __copy__(self):
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

class Scraper(metaclass=ABCMeta):
    def __init__(self, init:State, goal:State):
        self.__init = init
        self.__current = deepcopy(self.__init)
        self.__goal = goal
        self.lastStep = None
    
    @property
    def initState(self):
        return self.__init
    @property
    def currentState(self):
        return self.__current
    @property
    def goalState(self):
        return self.__goal
    
    def __canFetch():
        robots = dict()
        
        def canFetch(self, path:"uri", ua:"user-agent" = '*') -> bool:
            url = urljoin(path, "/robots.txt")
            
            if url in robots.keys():
                return robots[url]
            
            try:
                robotParser =  RobotFileParser(url)
                robotParser.read()
                robots[url] = robotParser.can_fetch(ua, path)
                return robots[url]
            except HTTPError as e:
                if e.status_code // 100 == 4:
                    robots[url] = True # robots.txt doesn't exist. So, the access is permitted
                else:
                    robots[url] = False
                return robots[url]
            except URLError:
                raise # Invalid URL Error

        return canFetch
    canFetch = __canFetch()
    
    # Run
    def run(self):
        print("Running Scraper")
        return self.start()
    
    # Action
    def start(self):
        # Vaild URL Test
        url = self.currentState.site
        try:
            if self.canFetch(url):
                #Stop randomly
                return self.collect() if random.randint(1, 100) > 10 else self.stop()
            
            link = self.select_link()
            if link:
                return self.travel(link)
            else:
                self.lastStep = 'Start'
                return self.exit()
        except URLError:
            print("ERROR: URLError")
            return self.exit()
    
    @abstractmethod
    def select_link(self):
        '''Return selected link or None'''
        pass
    
    def collect(self):
        try:
            self.collect_by_custom()
        except HTTPError as e:
            if e.response.status_code // 100 == 5: #5xx
                return self.stop()
            print("Code", e.response.status_code)
            print("Reason", e.response.reason)
            print("Req Header", e.request.headers)
            return self.exit()
        else:
            if not self.is_exit():
                link = self.select_link()
                if link:
                    return self.travel(link)
            self.lastStep = 'Collect'
            return self.exit()
    
    @abstractmethod
    def collect_by_custom(self):
        '''make collect algorithm'''
        pass
    
    def travel(self, link):
        self.currentState.site = link
        return self.start()
    
    def stop(self):
        time.sleep(random.randint(1, 10))
        return self.collect()
        
    def exit(self):
        print("Last step is", self.lastStep)
        print("Last site is", self.currentState.site)
        print("Collected data size is", len(self.currentState.data))
        print("Remaining links is", len(set(self.currentState.links)))
        print("Collect time is", self.currentState.times, 'secs')
        return self.currentState
    
    @abstractmethod
    def is_exit(self):
        '''make exit condition. True if exit'''
        pass

In [446]:
# Wikipedia Scraper
## Collect Headlines

from operator import lt, eq
import re
import requests
from requests.compat import urljoin
from bs4 import BeautifulSoup
from itertools import chain
from functools import reduce

class WikiScraper(Scraper):
    def __init__(self, init:State, goal:State, maximum: "Maximum data", baseUrl = 'https://ko.wikipedia.org/'):
        super().__init__(init, goal)
        self.maximum = maximum
        self.baseUrl = baseUrl
        
    def __timeit(fn):
        @wraps(fn)
        def wrapper(self, *args, **kwargs):
            start = time.time()
            retVal = fn(self, *args, **kwargs)
            end = time.time()
            self.currentState.times += end - start
            return retVal
        return wrapper
    
    def is_exit(self):
        if not lt(self.currentState.times, self.goalState.times):
            return True
        if not lt(len(self.currentState.data), self.maximum):
            return True
        return False
    
    @__timeit
    def collect_by_custom(self):
        try:
            res = requests.get(self.currentState.site)
            res.raise_for_status()
            
            self.__collect_data(res.text)
            self.__collect_links(res.text)
        except HTTPError as e:
             raise
    
    def __collect_data(self, html):
        titleDom = BeautifulSoup(html, 'lxml')
        title = titleDom.find('h1', {'id': 'firstHeading'}).get_text()
        
        try:
            #.*</div> is the last </div>
            parsedHeadline = re.search(r'<div id="bodyContent"[^>]*?>(.+?)(?:<mw:tocplace>|<h2>)', html, re.DOTALL)
            headlineDom = BeautifulSoup(parsedHeadline.group(1), 'lxml')
            headline = ''.join(p.get_text() for p in headlineDom.find_all('p'))
            headline = ''
            for _ in headlineDom.find_all('p'):
                headline += _.get_text()
        except AttributeError:
            # Simple document like https://ko.wikipedia.org/wiki/휘게
            try:
                headlineDom = BeautifulSoup(html, 'lxml')
                headline = '\n'.join(p.get_text() for p in headlineDom.find('div', {'id': 'bodyContent'}).find_all('p'))
            except AttributeError:
                print("Check", self.currentState.site)
        finally:
            datum = (title, headline)
            self.currentState.data.add(datum)
        return
    
    def __collect_links(self, html):
        dom = BeautifulSoup(html, 'lxml')
        accept = dom.find('div', {'id': 'mw-content-text'}).find_all('a', {'title': re.compile(r'.+?')})
        accept = set(accept)
        
        # Reject metadata
        reject = dom.find('table', {'role': 'presentation'})
        if reject:
            reject = set(reject.find_all('a', {'title': re.compile(r'.+?')}))
        else:
            reject = set()
        
        links = []
        for link in accept:
            if link['href'] in reject or 'action=edit' in link['href']:
                continue
            if re.match(r'^/.+',link['href']):
                url = urljoin(self.baseUrl, link['href'])
            else:
                url = link['href']
            links.append(url)
        self.currentState.links = chain(self.currentState.links, links)
        return
    
    def select_link(self):
        try:
            return next(self.currentState.links)
        except StopIteration:
            return None

In [447]:
import sys
sys.setrecursionlimit(10000)
wikiScraper = WikiScraper(State('https://ko.wikipedia.org/wiki/행복'), State('', times=600), maximum=10)
ret = wikiScraper.run()

Running Scraper
Last step is Collect
Last site is https://ko.wikipedia.org/wiki/%ED%9C%98%EA%B2%8C
Collected data size is 10
Remaining links is 398
Collect time is 4.472466707229614 secs


In [448]:
with open('행복.txt', 'w') as f:
    for title, content in ret.data:
        f.write(title)
        f.write('\n')
        f.write(content)

## Public-Data API
https://www.data.go.kr

### Check the speficification

In [25]:
import requests
from requests.compat import urljoin
from requests.exceptions import HTTPError
import json
from bs4 import BeautifulSoup

key = r'HcXpfDqp+BaRkFkDAOUYhSMN0zmJ1bDvke8sdDkkVN7LWEyjGiQ5m0Qcu5ZP4ZPglDYAg/ElrI0ZQ1R2ERL1Vw=='
endpoint = r'https://api.odcloud.kr/api/15069309/v1/'
info = r'uddi:973ad0df-617a-4565-8e82-8ea7869e75d4'

url = endpoint + info
try:
    res = requests.get(url, params={'page':1, 'perPage':10, 'serviceKey':key})
    res.raise_for_status()
except HTTPError as e:
    print(e.request)
else:
    if 'application/json' in res.headers['Content-Type']:
        parsed = json.loads(res.text)
    elif 'application/html' in res.headers['Content-Type']:
        parsed = BeautifulSoup(res.text, 'lxml')