In [1]:
import os
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta

# 전체 data 수집하기

기본값

In [2]:
# max : 3000 (original author's note; not referenced by any cell visible in this notebook)
flower_limit = 3000
# Endpoint prefix; get_url() appends the query parameters right after the trailing '&'.
baseurl = 'https://flower.at.or.kr/api/returnData.api?kind=f001&'
# API credential. Prefer the FLOWER_API_KEY environment variable so the key does not
# have to be stored in the notebook; the embedded value remains only as a fallback so
# existing runs behave exactly as before.
servicekey = os.environ.get('FLOWER_API_KEY', 'EBA10CC9512644A8AD805B510D3FC532')
# Browser-like User-Agent sent with every request made by get_data().
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

# Accumulator for parsed rows: run() appends one dict per <item> element.
parsed_data = []

[get_dealymd] 시작일부터 종료일까지의 날짜

In [3]:
def get_dealymd(start_ymd, end_ymd) :
    """Return every calendar date from ``start_ymd`` to ``end_ymd`` inclusive.

    Parameters
    ----------
    start_ymd, end_ymd : str
        Dates formatted as ``'%Y-%m-%d'``.

    Returns
    -------
    list of str
        One ``'%Y-%m-%d'`` string per day, in ascending order. The list is
        empty when ``end_ymd`` is earlier than ``start_ymd``.

    Raises
    ------
    ValueError
        If either argument does not match ``'%Y-%m-%d'``.
    """
    start = datetime.strptime(start_ymd, '%Y-%m-%d')
    end = datetime.strptime(end_ymd, '%Y-%m-%d')
    # Walk proleptic-Gregorian ordinals so that month lengths and leap years are
    # handled by the calendar itself. (The previous hand-computed day count
    # multiplied the month difference by 12 and the year difference by 365,
    # which truncated or emptied most multi-month ranges.)
    return [datetime.fromordinal(day).strftime('%Y-%m-%d')
            for day in range(start.toordinal(), end.toordinal() + 1)]

[get_url] url 가져오기

In [4]:
def get_url(goodname, deal_ymd) :
    """Build the request URL for one variety (``goodname``) on one date (``deal_ymd``).

    The query string is appended to the module-level ``baseurl`` as
    ``key=value`` pairs joined with ``&``; values are inserted as-is, without
    URL escaping. The ``serviceKey`` value comes from the module-level
    ``servicekey``.
    """
    query_fields = (
        ('serviceKey', servicekey),
        ('baseDate', deal_ymd),
        ('countPerPage', '999'),
        ('flowerGubn', '1'),
        ('pumName', '장미'),
        ('goodName', goodname),
        ('dataType', 'xml'),
    )
    query_string = '&'.join(f'{key}={value}' for key, value in query_fields)
    return baseurl + query_string

[get_data] requests로 xml에서 필요한 데이터 가져오기

In [5]:
def get_data(url, timeout=30) :
    """Fetch ``url`` and return the ``<item>`` elements of the XML response.

    Parameters
    ----------
    url : str
        Full request URL (as produced by ``get_url``).
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would
        block the crawl indefinitely. Defaults to 30 seconds.

    Returns
    -------
    bs4.element.ResultSet
        All ``<item>`` tags found in the response; empty (falsy) when the
        response contains none.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures, including ``Timeout`` when the server does
        not answer within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml-xml')
    # find_all is the current spelling of the legacy findAll alias.
    return soup.find_all('item')

[parse_text] if문으로 null값 호출시 error해결

In [6]:
def parse_text(soup_object) :
    """Return ``soup_object.text``, or ``''`` when ``soup_object`` is falsy (e.g. ``None``).

    Lets callers read the text of a lookup result without first checking
    whether the lookup found anything.
    """
    if not soup_object :
        return ''
    return soup_object.text

[parse_item] 데이터 파싱

In [7]:
def _parse_int(text) :
    """Convert a numeric field's text to ``int``; return ``None`` when it is blank."""
    text = text.strip()
    return int(text) if text else None


def parse_item(item, deal_ymd) :
    """Turn one ``<item>`` element into a flat record dict.

    Parameters
    ----------
    item
        Object exposing ``find(tag_name)`` (a parsed ``<item>`` element).
    deal_ymd : str
        Accepted for call compatibility; not used when building the record.

    Returns
    -------
    dict
        Keys, in order: '경매일' (from ``saleDate``, parsed to ``datetime``),
        '화훼부류명' (``flowerGubn``), '품목명' (``pumName``), '품종명' (``goodName``),
        '등급명' (``lvNm``), '최고가' (``maxAmt``), '최저가' (``minAmt``),
        '평균가' (``avgAmt``), '총금액' (``totAmt``), '총물량' (``totQty``).
        The five amount/quantity fields are ``int``. A missing or blank
        date/number field yields ``None`` instead of raising, so one
        incomplete item does not abort the whole crawl.

    Raises
    ------
    ValueError
        If a non-blank date or number field cannot be parsed.
    """
    def field(tag_name) :
        # parse_text() maps a missing tag to '' so the converters below see a string.
        return parse_text(item.find(tag_name))

    saledate = field('saleDate').strip()
    saledate_ymd = datetime.strptime(saledate, '%Y-%m-%d') if saledate else None

    return {
        '경매일' : saledate_ymd,
        '화훼부류명' : field('flowerGubn'),
        '품목명' : field('pumName'),
        '품종명' : field('goodName'),
        '등급명' : field('lvNm'),
        '최고가' : _parse_int(field('maxAmt')),
        '최저가' : _parse_int(field('minAmt')),
        '평균가' : _parse_int(field('avgAmt')),
        '총금액' : _parse_int(field('totAmt')),
        '총물량' : _parse_int(field('totQty')),
    }

[run] 실행함수

In [8]:
def run(goodname, deal_ymd) :
    """Fetch and parse one (variety, date) request, appending rows to ``parsed_data``.

    Prints the request URL, downloads its items and, when there are any,
    extends the module-level ``parsed_data`` list with one parsed record per
    item.

    Returns ``True`` when at least one item was found, ``False`` otherwise.
    """
    request_url = get_url(goodname, deal_ymd)
    print(request_url)
    fetched_items = get_data(request_url)
    if not fetched_items :
        return False

    # Parse everything first so a parsing error leaves parsed_data untouched.
    records = []
    for entry in fetched_items :
        records.append(parse_item(entry, deal_ymd))
    parsed_data.extend(records)
    return True

[save_data] 데이터 저장

In [17]:
def save_data(parsed_data, path="data/rose_3type_2019_(1).csv") :
    """Write the collected records to a CSV file.

    Parameters
    ----------
    parsed_data : list of dict
        Records to save; converted with ``pd.DataFrame(parsed_data)``.
    path : str, optional
        Destination file. Pass a different value per crawl instead of editing
        this function; the default is the previously hard-coded location.

    The file is written with the DataFrame index and ``utf-8-sig`` encoding.
    The parent directory is created when it does not exist yet, so a fresh
    checkout without a ``data/`` folder no longer fails at the very end of a
    long crawl.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir :
        os.makedirs(parent_dir, exist_ok=True)
    df = pd.DataFrame(parsed_data)
    df.to_csv(path, index=True, encoding='utf-8-sig')

[main] 분석

In [18]:
%%time

if __name__ == '__main__' :  # standard entry-point guard (always true when run as a notebook cell)
    # Crawl window, inclusive on both ends.
    start_ymd = '2019-01-01'
    end_ymd = '2020-12-31'

    ls_dealymd = get_dealymd(start_ymd, end_ymd)  # list of dates to query
    rose_type_ls = ['푸에고', '빅토리아', '헤라']

    # Start from an empty buffer so that re-running this cell does not append a
    # second copy of every row to what an earlier run already collected.
    parsed_data.clear()

    for rose_ls in rose_type_ls :
        for ymd in ls_dealymd :
            run(rose_ls, ymd)
            time.sleep(0.3)  # pause between consecutive requests

    save_data(parsed_data)  # save the collected data

    # Append a record of this crawl to the log: header line, then one data line.
    now = datetime.now().strftime('%Y-%m-%d %H:%M')
    # Explicit encoding keeps the Korean header readable regardless of the
    # platform default; the with-block closes the file, so no manual close().
    with open('data/openapi_log.txt', 'a', encoding='utf-8') as f :
        f.write("크롤링날짜\t경매시작일\t경매종료일\n")
        # End the record with a newline (it used to end with a tab, which glued
        # the next run's header onto the same line).
        f.write("{}\t{}\t{}\n".format(now, start_ymd, end_ymd))

https://flower.at.or.kr/api/returnData.api?kind=f001&serviceKey=EBA10CC9512644A8AD805B510D3FC532&baseDate=2019-01-01&countPerPage=999&flowerGubn=1&pumName=장미&goodName=푸에고&dataType=xml
https://flower.at.or.kr/api/returnData.api?kind=f001&serviceKey=EBA10CC9512644A8AD805B510D3FC532&baseDate=2019-01-02&countPerPage=999&flowerGubn=1&pumName=장미&goodName=푸에고&dataType=xml
https://flower.at.or.kr/api/returnData.api?kind=f001&serviceKey=EBA10CC9512644A8AD805B510D3FC532&baseDate=2019-01-03&countPerPage=999&flowerGubn=1&pumName=장미&goodName=푸에고&dataType=xml
https://flower.at.or.kr/api/returnData.api?kind=f001&serviceKey=EBA10CC9512644A8AD805B510D3FC532&baseDate=2019-01-04&countPerPage=999&flowerGubn=1&pumName=장미&goodName=푸에고&dataType=xml
https://flower.at.or.kr/api/returnData.api?kind=f001&serviceKey=EBA10CC9512644A8AD805B510D3FC532&baseDate=2019-01-05&countPerPage=999&flowerGubn=1&pumName=장미&goodName=푸에고&dataType=xml
https://flower.at.or.kr/api/returnData.api?kind=f001&serviceKey=EBA10CC9512644A8