In [152]:
import scrapy
import requests
from scrapy.http import TextResponse
import pandas as pd

In [153]:
# 1. Start project

In [154]:
!scrapy startproject daangn

New Scrapy project 'daangn', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/Code/09_project/02_crawling/daangn

You can start your first spider with:
    cd daangn
    scrapy genspider example example.com


In [155]:
# 2. items.py

In [156]:
# %load daangn/daangn/items.py

In [157]:
%%writefile daangn/daangn/items.py
# '''
# region = trading area; shown down to the district (gu) and neighborhood (dong) level
# temperature = trade temperature; the more good trades, the higher the temperature
# chat_counts = number of chats exchanged between the seller and interested buyers
# watch_counts = number of times the listing was favorited
# '''
import scrapy


class DaangnItem(scrapy.Item):
    """One scraped listing; every field is a plain ``scrapy.Field``.

    The spider in this project stores all values as text taken from the
    listing detail page; no type conversion is declared here.
    """
    title = scrapy.Field()  # listing title
    user_id = scrapy.Field()  # seller nickname
    temperature = scrapy.Field()  # seller's trade temperature
    region = scrapy.Field()  # trading area (district / neighborhood)
    price = scrapy.Field()  # price text as shown on the page
    link = scrapy.Field()  # URL of the listing
    desc = scrapy.Field()  # listing description text
    chat_counts = scrapy.Field()  # chats between seller and interested buyers
    watch_counts = scrapy.Field()  # times the listing was favorited
    view_counts = scrapy.Field()  # presumably the number of views -- TODO confirm
    

Overwriting daangn/daangn/items.py


In [158]:
# 3. spider.py

In [179]:
%%writefile daangn/daangn/spiders/spider.py
import string
import scrapy
from scrapy import Request
from daangn.items import DaangnItem

class DaangnSpider(scrapy.Spider):
    """Crawl Daangn flea-market search results for "macbook".

    ``parse`` handles listing pages: it schedules one request per listing
    link and then the next listing page. ``parse_content`` handles a single
    listing detail page and yields one ``DaangnItem``.
    """
    name = 'Daangn'
    # Listing-page URL template; ``{}`` is the page number.
    search_url = "https://www.daangn.com/search/macbook/more/flea_market?page={}"
    start_urls = ["https://www.daangn.com/search/macbook/more/flea_market?page=1"]
    # Next listing page to request (page 1 is covered by ``start_urls``).
    page_number = 2
    # Last listing page that will ever be requested.
    max_page = 800

    def parse(self, response):
        """Yield a request per listing on this page, then the next page.

        Pagination is driven from here (once per listing page) rather than
        from every detail page, and stops at the first page without links
        or once ``max_page`` has been requested.
        """
        urls = response.xpath("/html/body/article/a/@href").getall()
        for url in urls:
            # hrefs are relative; resolve them against the page URL
            yield Request(response.urljoin(url), callback=self.parse_content)

        if urls and self.page_number <= self.max_page:
            next_page = self.search_url.format(self.page_number)
            self.page_number += 1
            yield Request(next_page, callback=self.parse)

    def parse_content(self, response):
        """Extract one ``DaangnItem`` from a listing detail page.

        Missing nodes produce an empty string (or ``None`` for the counts)
        instead of raising ``IndexError`` and losing the whole item.
        """
        def first_text(xpath):
            # first matching text node, stripped; '' when nothing matches
            return response.xpath(xpath).get(default='').strip()

        item = DaangnItem()
        item['title'] = first_text('//*[@id="article-title"]/text()')
        item['price'] = first_text('//*[@id="article-price"]/text()')
        desc = response.xpath('//*[@id="article-detail"]/p/text()').getall()
        item['desc'] = "".join(desc).replace('\n', '')
        item['user_id'] = first_text('//*[@id="nickname"]/text()')
        item['temperature'] = first_text('//*[@id="temperature-wrap"]/dd/text()')
        # fall back to the URL actually fetched when the <link> tag is absent
        item['link'] = response.xpath('/html/head/link[1]/@href').get(default=response.url)
        item['region'] = first_text('//*[@id="region-name"]/text()')
        counts = first_text('//*[@id="article-counts"]/text()')
        chat, watch, view = self._split_counts(counts)
        item['chat_counts'] = chat
        item['watch_counts'] = watch
        item['view_counts'] = view

        yield item

    @staticmethod
    def _split_counts(counts):
        """Split the '∙'-separated counts text into three values.

        Each segment is expected to be "<label> <value>"; the value is the
        second whitespace-separated token. Returns ``[chat, watch, view]``
        with ``None`` for any segment that is missing or malformed.
        """
        values = []
        for segment in counts.split('∙'):
            tokens = segment.split()
            values.append(tokens[1] if len(tokens) > 1 else None)
        # pad so callers can always unpack three values
        values += [None] * (3 - len(values))
        return values[:3]

Overwriting daangn/daangn/spiders/spider.py


In [180]:
%%writefile daangn/daangn/mongodb.py
import os
from datetime import datetime

import pymongo

# Timestamp taken once, when this module is first imported; it determines
# the collection name for the whole crawl.
today = datetime.now()

# The connection string can be supplied through the MONGODB_URI environment
# variable so real credentials need not live in the source. The original
# literal is kept as the default so existing runs behave the same.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://id:pwd@1.23.45.6:27017")

client = pymongo.MongoClient(MONGODB_URI)
db = client.joongo
# One collection per hour of crawl start, named "D" + yymmddHH.
collection = db["D{}".format(today.strftime('%y%m%d%H'))]

Overwriting daangn/daangn/mongodb.py


In [187]:
%%writefile daangn/daangn/pipelines.py


from itemadapter import ItemAdapter
from .mongodb import collection


class DaangnPipeline(object):
    """Item pipeline that stores each scraped item in the MongoDB collection
    imported from ``.mongodb``."""

    # Fields copied from the item into the stored document.
    columns = ["title", "price", "desc", "user_id", "temperature", "link",
               "region", "chat_counts", "watch_counts", "view_counts"]

    def process_item(self, item, spider):
        """Insert one document built from ``item`` and return the item.

        Fields that are not set on the item are stored as ``None``.
        """
        adapter = ItemAdapter(item)
        data = {column: adapter.get(column) for column in self.columns}
        # Collection.insert() is deprecated in PyMongo 3 and removed in 4;
        # insert_one() is the supported call for a single document.
        collection.insert_one(data)

        return item

Overwriting daangn/daangn/pipelines.py


In [188]:
%%writefile run.sh
# Stop here if the project directory is missing instead of running
# scrapy from the wrong place.
cd daangn || exit 1
# -O overwrites items.csv on every run; -o would append to an existing
# file and leave rows from earlier runs in it.
scrapy crawl Daangn -O items.csv

Overwriting run.sh


In [190]:
%%time
!/bin/bash run.sh

2020-11-24 19:21:32 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: daangn)
2020-11-24 19:21:32 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct 16 2020, 02:01:21) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.2.1, Platform Linux-5.4.0-1029-aws-x86_64-with-debian-buster-sid
2020-11-24 19:21:32 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-11-24 19:21:32 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'daangn',
 'NEWSPIDER_MODULE': 'daangn.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['daangn.spiders']}
2020-11-24 19:21:32 [scrapy.extensions.telnet] INFO: Telnet Password: b872d3c9bc24ef19
2020-11-24 19:21:32 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.e

In [162]:
pd.read_csv("daangn/items.csv")

Unnamed: 0,chat_counts,desc,link,price,region,temperature,title,user_id,view_counts,watch_counts
0,2,맥북프로 15인치 터치바 2017 기본형입니다2019년에 구입했고 실사용은 몇 달 ...,https://www.daangn.com/articles/150455627,"1,500,000원",당진시 채운동,37.0,Macbook Pro 15inch 맥북 프로 15인치 2017년 (스페이스그레이),windsurfer,139,0
1,1,014년도 구입 최고사양 CTO 맥북 프로 A1278CPU : I7 최고사양 램 :...,https://www.daangn.com/articles/154330347,"400,000원",공주시 금학동,36.5,맥북프로 I7 CPU A1278프리미엄 맥북 - 최고사양,김소연,53,0
2,0,맥북 프로 15인치 2013년도 고급형 판매합니다.i7 2.3 쿼드코어입니다.풀박이...,https://www.daangn.com/articles/154263336,"850,000원",수원시 영통구 망포동,36.7,"Macbook Pro (Retina, 15-inch, Late 2013) 맥북 프로...",최의현,81,1
3,0,맥북에어[Macbook air] 판매합니다13인치이고 케이스 착용하고 조심히 사용해...,https://www.daangn.com/articles/151032265,"530,000원",성남시 분당구 금곡동,35.7,MaCbook air 맥북/멕북 노트북 판매합니다,야도르,235,4
4,5,맥북프로 레티나 13인치 입니다액정은 기본 필름입니다.하단 배면부 생활기스 있습니다...,https://www.daangn.com/articles/146403990,"500,000원",금천구 가산동,36.5,애플 맥북 프로 레티나 MacBook Pro A1502 13인치 판매,네오,733,7
5,0,MacBook Pro 맥북프로 13인치 Mid 2014 레티나 A1502현재 최신 ...,https://www.daangn.com/articles/154245784,"550,000원",용인시 기흥구 동백동,40.3,MacBook Pro 맥북프로 13인치 Mid 2014 레티나,cwru,49,2
6,5,박스랑 충전기까지 드립니다.기스랑 찍힘 있으니 사진 참고해주세요,https://www.daangn.com/articles/146539023,"600,000원",관악구 신림동,37.0,13인치 Macbook air 맥북에어 2017년형 256기가 팝니다,도야지,865,15
7,0,서울,https://www.daangn.com/articles/154383568,"720,000원",남구 신정3동,38.4,맥북 12인치 뉴맥북 2017,하늘,11,1
8,1,* 256GB* 최근 수리* 포맷완료* 풀박스* 직거래해요* 깨끗하게 쓰시라고 알코...,https://www.daangn.com/articles/151356238,"800,000원",성동구 행당동,36.5,"맥북프로 Macbook Pro, 13인치 Early 2015 & 애플 매직마우스2",채니,469,15
9,1,제품명 : Macbook Air early 2015 13inch프로세서 : 1.6 ...,https://www.daangn.com/articles/153549825,"380,000원",강남구 역삼1동,36.8,[Macbook Air] 맥북에어 2015 13인치,Danggeun,205,6
