# Data crawling with scrapy

In [1]:
!pip install scrapy 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scrapy
  Downloading Scrapy-2.7.1-py2.py3-none-any.whl (271 kB)
[K     |████████████████████████████████| 271 kB 16.5 MB/s 
Collecting zope.interface>=5.1.0
  Downloading zope.interface-5.5.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (254 kB)
[K     |████████████████████████████████| 254 kB 40.5 MB/s 
[?25hCollecting PyDispatcher>=2.0.5
  Downloading PyDispatcher-2.0.6.tar.gz (38 kB)
Collecting service-identity>=18.1.0
  Downloading service_identity-21.1.0-py2.py3-none-any.whl (12 kB)
Collecting itemadapter>=0.1.0
  Downloading itemadapter-0.7.0-py3-none-any.whl (10 kB)
Collecting pyOpenSSL>=21.0.0
  Downloading pyOpenSSL-22.1.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 1.8 MB/s 
Collecting itemloaders>=1.0.1
  Downloading itemloaders-1.0.6-py3-none-any.whl (11 kB)
Collecting parsel>=1.5.0

In [2]:
!scrapy startproject chilean_data_explore

New Scrapy project 'chilean_data_explore', using template directory '/usr/local/lib/python3.7/dist-packages/scrapy/templates/project', created in:
    /content/chilean_data_explore

You can start your first spider with:
    cd chilean_data_explore
    scrapy genspider example example.com


In [3]:
import os


!pwd
!ls


In [4]:
os.chdir('/content/chilean_data_explore/chilean_data_explore/')
!pwd
!ls

/content/chilean_data_explore/chilean_data_explore
__init__.py  items.py  middlewares.py  pipelines.py  settings.py  spiders


In [5]:
from chilean_data_explore.chilean_data_explore.items import ChileanDataExploreItem

In [6]:
%%writefile items.py

import scrapy

class ChileanDataExploreItem(scrapy.Item):
    url = scrapy.Field() # str
    article_from = scrapy.Field() # str
    article_type = scrapy.Field() # str
    title = scrapy.Field() # str
    publish_date = scrapy.Field() # str
    authors = scrapy.Field() # list json
    tags = scrapy.Field() # list json
    text = scrapy.Field() # list json
    text_html = scrapy.Field() # str
    images = scrapy.Field() # list json
    video = scrapy.Field() # list json
    links = scrapy.Field() # list json

from scrapy.item import Item, Field
class PropertiesItem(Item):
     # Primary fields
     title = Field()
     price = Field()
     description = Field()
     address = Field()
     image_urls = Field()

     # Calculated fields
     images = Field()
     location = Field()
     
     # Housekeeping fields
     url = Field()
     project = Field()
     spider = Field()
     server = Field()
     date = Field()

Overwriting items.py


In [7]:
!pwd
os.chdir('/content/chilean_data_explore/chilean_data_explore/spiders')
!mkdir logo

/content/chilean_data_explore/chilean_data_explore


In [8]:
#%%writefile -a /chilean_data_explore/chilean_data_explore/spiders/quotes_spider.py
%%writefile quotes_spider.py
import scrapy
from chilean_data_explore.items import ChileanDataExploreItem

import time
import re

class ChinatimesSpider(scrapy.Spider):
    name = 'chinatimes'
    allowed_domains = ['chinatimes.com']
    base_url = 'https://www.chinatimes.com'

    url_array = [
      'http://observatorio.ministeriodesarrollosocial.gob.cl/encuesta-casen',
      'https://www.sii.cl/sobre_el_sii/estadisticas_de_empresas.html',
      'https://www.ide.cl/index.php/informacion-territorial/descargar-informacion-territorial',
    ]

    date_str = str(time.strftime("%F", time.localtime()))

    #custom_settings = {
    #    'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
    #}

    def start_requests(self):
        list_url = '%s/realtimenews' % (self.base_url)
        yield scrapy.Request(url=list_url, callback=self.parse_list)

    def parse_list(self, response):
        for page_url in response.css('section.article-list>ul>li h3.title>a::attr(href)').getall():
            yield scrapy.Request(url=self.base_url + page_url, callback=self.parse_news)

    def parse_news(self, response):
        item = ChileanDataExploreItem()

        item['url'] = response.url
        #item['article_from'] = self.name
        #item['article_type'] = 'news'

        #item['title'] = self._parse_title(response)
        #item['publish_date'] = self._parse_publish_date(response)
        #item['authors'] = self._parse_authors(response)
        #item['tags'] = self._parse_tags(response)
        #item['text'] = self._parse_text(response)
        #item['text_html'] = self._parse_text_html(response)
        #item['images'] = self._parse_images(response)
        #item['video'] = self._parse_video(response)
        #item['links'] = self._parse_links(response)

        return item

    def _parse_title(self, response):
        return response.css('article.article-box h1.article-title::text').get()

    def _parse_publish_date(self, response):
        return response.css('article.article-box time::attr(datetime)').get()

    def _parse_authors(self, response):
        authors = response.css('article.article-box div.author>a::text').getall()
        if len(authors) == 0:
            authors = [response.css('article.article-box div.author::text').get(default='').strip()]
        return authors

    def _parse_tags(self, response):
        return response.css('article.article-box div.article-hash-tag a::text').getall()

    def _parse_text(self, response):
        return response.css('article.article-box div.article-body p::text').getall()

    def _parse_text_html(self, response):
        return response.css('article.article-box div.article-body').get()

    def _parse_images(self, response):
        images_list = []
        images_list.extend(response.css('article.article-box div.main-figure').css('img::attr(src)').getall())
        images_list.extend(response.css('article.article-box div.article-body').css('img::attr(src)').getall())
        return images_list

    def _parse_video(self, response):
        return response.css('article.article-box div.article-body iframe::attr(src)').getall()

    def _parse_links(self, response):
        return response.css('article.article-box div.article-body').css('a::attr(href)').getall()

Writing quotes_spider.py


In [9]:
!scrapy runspider quotes_spider.py -o quotes.json

2022-11-05 03:11:26 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: chilean_data_explore)
2022-11-05 03:11:26 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-11-05 03:11:26 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_LOADER_WARN_ONLY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-11-05 03:11:26 [asyncio] DEBUG: Using selector: EpollSelector
2022-11-05 03:11:26 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor


In [10]:
os.chdir('/content/chilean_data_explore/')
!pwd | ls -l


total 8
drwxr-xr-x 4 root root 4096 Nov  5 03:11 chilean_data_explore
-rw-r--r-- 1 root root  283 Nov  5 03:11 scrapy.cfg


## Spider for bcentral.cl

In [11]:
%%writefile /content/chilean_data_explore/chilean_data_explore/spiders/base.py
import scrapy
from chilean_data_explore.items import PropertiesItem 
from scrapy.loader import ItemLoader
import datetime
import socket

class BasicSpider(scrapy.Spider):
    name = 'base'
    allowed_domains = ['si3.bcentral.cl']
    start_urls = ['https://si3.bcentral.cl/siete']

    def parse(self, response):
      loader_item = ItemLoader(item=PropertiesItem(), response=response)
      loader_item.add_xpath('title','//title/text()')
      loader_item.add_xpath('price','//*[@itemprop="price"][1]/text()', re='[.0-9]+')
      loader_item.add_xpath('description','//*[contains(@href, "html")]')
      loader_item.add_value('server', socket.gethostname())
      loader_item.add_value('date', datetime.datetime.now())
      return loader_item.load_item()

Writing /content/chilean_data_explore/chilean_data_explore/spiders/base.py


In [12]:
!scrapy crawl base

2022-11-05 03:11:30 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: chilean_data_explore)
2022-11-05 03:11:30 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-11-05 03:11:30 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-11-05 03:11:30 [asyncio] DEBUG: Using selector: EpollSelector
2022-11-05 03:11:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2022-11-05 03:11:30 [scrapy.utils.

In [13]:
!scrapy crawl base -o items.csv

2022-11-05 03:11:35 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: chilean_data_explore)
2022-11-05 03:11:35 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-11-05 03:11:35 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-11-05 03:11:35 [asyncio] DEBUG: Using selector: EpollSelector
2022-11-05 03:11:35 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2022-11-05 03:11:35 [scrapy.utils.

In [14]:
!scrapy parse --spider=base https://si3.bcentral.cl/siete

2022-11-05 03:11:40 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: chilean_data_explore)
2022-11-05 03:11:40 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-11-05 03:11:40 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-11-05 03:11:40 [asyncio] DEBUG: Using selector: EpollSelector
2022-11-05 03:11:40 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2022-11-05 03:11:40 [scrapy.utils.

## Spider for two-direction-movement

In [15]:
%%writefile /content/chilean_data_explore/chilean_data_explore/spiders/manual.py
import scrapy
from chilean_data_explore.items import PropertiesItem 
from scrapy.loader import ItemLoader
import datetime
import socket
from scrapy.http import Request
from urllib.parse import urljoin

class BasicSpider(scrapy.Spider):
    name = 'manual'
    allowed_domains = ['si3.bcentral.cl']
    start_urls = ['https://si3.bcentral.cl/Siete']

    def parse(self, response):
      next_selector=response.xpath('//a[starts-with(@onclick, "SetCapitu")]//@onclick').extract()
      collec=[]
      for string in next_selector:
        string = string.replace("SetCapitulo(","") 
        string = string.replace(")","") 
        string = string.replace("'","") 
        string = string.replace(" ","") 
        string = "/".join(string.split(',')) 
        string = "Siete/ES/Siete/Cuadro/" + string.strip()
        collec.append(string)
      next_selector = collec

      for url in next_selector:
        x = urljoin(response.url, url)
        yield Request(x, callback=self.parse)

Writing /content/chilean_data_explore/chilean_data_explore/spiders/manual.py


In [16]:
!scrapy crawl manual 
!scrapy parse --spider=manual https://si3.bcentral.cl/Siete

2022-11-05 03:11:45 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: chilean_data_explore)
2022-11-05 03:11:45 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-11-05 03:11:45 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-11-05 03:11:45 [asyncio] DEBUG: Using selector: EpollSelector
2022-11-05 03:11:45 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2022-11-05 03:11:45 [scrapy.utils.

## Crawlspider for two-direction-movement

In [17]:
!scrapy genspider -t crawl automatic web

Created spider 'automatic' using template 'crawl' in module:
  chilean_data_explore.spiders.automatic


In [18]:
%%writefile /content/chilean_data_explore/chilean_data_explore/spiders/automatic.py

import scrapy

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from chilean_data_explore.items import ChileanDataExploreItem


class AutomaticSpider(CrawlSpider):
    name = 'automatic'
    allowed_domains = ['si3.bcentral.cl']
    start_urls = ['https://si3.bcentral.cl/Siete']

    rules = (
        #vertical
        Rule(LinkExtractor(allow='Siete'), callback='parse_item', follow=True),
    )
    
    def parse_item(self, response):
        exist = response.xpath('//a[starts-with(@onclick, "SetCapitu")]').extract_first()

        if exist:
          title = response.xpath('//a[starts-with(@onclick, "SetCapitu")]//@onclick')
          title = [ self.start_urls[0]+"/ES/Siete/Cuadro/" +self.url_title(t.get())  for t in title]
          #title = [ t.get()  for t in title]
          print( title)

          item = ChileanDataExploreItem()

          item['url'] = "response.url"
          item['article_from'] = "self.name"
          item['title'] = title
          item['publish_date'] = "self._parse_publish_date(response)"

          #yield {          'title': title          }
          yield item
          
          print("Testing > ", response.url )

          for url_next_page in title:
            yield response.follow(url_next_page, callback=self.parse_item)

        else:
          print(response.url)

    def url_title(self,t):
        replacements = [
          ("'",""),
          (")",""),
          ("(",""),
          (", ","/"),
          ("SetCapitulo",""),
        ]
        for x,y in replacements:
          t=t.replace(x,y)

        return t

Overwriting /content/chilean_data_explore/chilean_data_explore/spiders/automatic.py


In [19]:
!scrapy crawl automatic -O data_bcentral.csv

2022-11-05 03:12:05 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: chilean_data_explore)
2022-11-05 03:12:05 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.7.0, w3lib 2.0.1, Twisted 22.10.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.3, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-11-05 03:12:05 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-11-05 03:12:05 [asyncio] DEBUG: Using selector: EpollSelector
2022-11-05 03:12:05 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2022-11-05 03:12:05 [scrapy.utils.

## Setting up selenium enviroment

In [28]:
!apt update
!apt install chromium-chromedriver
!pip install selenium

[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [[0m                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [[0m[33m0% [1 InRelease gpgv 242 kB] [Waiting for headers] [Connecting to security.ubun[0m                                                                               Get:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
[33m0% [1 InRelease gpgv 242 kB] [4 InRelease 11.3 kB/83.3 kB 14%] [Connecting to s[0m[33

In [42]:
%%writefile /content/class_selenium_xpath.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
class selenium_scrapy:
  url_list=[]
  def driversetup(self):
      options = webdriver.ChromeOptions()
      #run Selenium in headless mode
      options.add_argument('--headless')
      options.add_argument('--no-sandbox')
      #overcome limited resource problems
      options.add_argument('--disable-dev-shm-usage')
      options.add_argument("lang=en")
      #open Browser in maximized mode
      options.add_argument("start-maximized")
      #disable infobars
      options.add_argument("disable-infobars")
      #disable extension
      options.add_argument("--disable-extensions")
      options.add_argument("--incognito")
      options.add_argument("--disable-blink-features=AutomationControlled")
      
      driver = webdriver.Chrome(options=options)

      driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")

      return driver
    
  def pagesource(self,url):
      driver = self.driversetup()
      driver.get(url)
      #soup = BeautifulSoup(driver.page_source)
      soup=driver.find_elements(By.XPATH, '//a[starts-with(@onclick, "SetCapitu")]' )
      self.url_list = []
      for a in soup:
        self.url_list.append(self.url_title(a.get_attribute('onclick')))

      driver.close()

  def url_title(self,t):
      replacements = [
        ("'",""),
        (")",""),
        ("(",""),
        (", ","/"),
        ("SetCapitulo",""),
      ]
      for x,y in replacements:
        t=t.replace(x,y)
      return t

Writing /content/class_selenium_xpath.py


In [43]:
from class_selenium_xpath import selenium_scrapy
data_crawler = selenium_scrapy()
url='https://si3.bcentral.cl/Siete'
data_crawler.pagesource(url)

print(data_crawler.url_list)


['CAP_ESTADIST_MACRO/MN_EST_MACRO_IV/PEM_TC', 'CAP_PRECIOS/MN_CAP_PRECIOS/UF_IVP_DIARIO', 'CAP_TASA_INTERES/MN_TASA_INTERES_09/TPM_C1', 'CAP_TIPO_CAMBIO/MN_TIPO_CAMBIO4/DOLAR_OBS_ADO', 'CAP_DYB/MN_ESTAD_MON55/EM_BMAM2', 'CAP_DERYSPOT/MN_DERYSPOT/DER_MON_01', 'CAP_CCNN/MN_CCNN76/CCNN2018_IMACEC_01', 'CAP_BDP/MN_BDP42/BP6M_RES01', 'CAP_EMP_REM_DEM/MN_EMP_REM_DEM13/ED_TDNRM2', 'CAP_EXP_ECO/MN_EXP_EC11/EXE_BCCH_01', 'CAP_IND_SEC/MN_IND_SEC20/IS_GENERAL_PROPIEDAD_2008', 'CAP_EI/MN_EI11/EI_CREC_TRI', 'CAP_FIN_PUB/MN_FIN_PUB_1/GOB_TOT_1', 'CAP_ESTADIST_GENERO/MN_GENERO1/EST_GEN_POB_01', 'CAP_ESTADIST_REGIONAL/MN_REGIONAL1/CCNN2018_PIB_REGIONAL_T', 'CAP_ESTADIST_EXPERIM/MN_EXPERIM01/EST_EXP_001', 'CAP_ESTADIST_MACRO/MN_EST_MACRO_IV/PEM_TC', 'CAP_PRECIOS/MN_CAP_PRECIOS/UF_IVP_DIARIO', 'CAP_TASA_INTERES/MN_TASA_INTERES_09/TPM_C1', 'CAP_TIPO_CAMBIO/MN_TIPO_CAMBIO4/DOLAR_OBS_ADO', 'CAP_DYB/MN_ESTAD_MON55/EM_BMAM2', 'CAP_DERYSPOT/MN_DERYSPOT/DER_MON_01', 'CAP_CCNN/MN_CCNN76/CCNN2018_IMACEC_01', 'CA

#### Scrawl python

In [34]:
!scrapy genspider -t crawl tencent web

Traceback (most recent call last):
  File "/usr/local/bin/scrapy", line 8, in <module>
    sys.exit(execute())
  File "/usr/local/lib/python3.7/dist-packages/scrapy/cmdline.py", line 153, in execute
    cmd.crawler_process = CrawlerProcess(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/crawler.py", line 304, in __init__
    super().__init__(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/crawler.py", line 181, in __init__
    self.spider_loader = self._get_spider_loader(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/crawler.py", line 175, in _get_spider_loader
    return loader_cls.from_settings(settings.frozencopy())
  File "/usr/local/lib/python3.7/dist-packages/scrapy/spiderloader.py", line 67, in from_settings
    return cls(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/spiderloader.py", line 24, in __init__
    self._load_all_spiders()
  File "/usr/local/lib/python3.7/dist-packages/scrapy/spiderloader.py", line 51

In [55]:
%%writefile /content/chilean_data_explore/chilean_data_explore/spiders/tencent.py
import scrapy

#load class CrawlSpider and Rule
from scrapy.spiders import CrawlSpider,Rule

#load the link rules to match satisfiled links
from scrapy.linkextractors import LinkExtractor 

#from tenSpider.items import TenspiderItem 

#from .content.chilean_data_explore.chilean_data_explore.spiders.class_selenium_xpath import selenium_scrapy
from . import class_selenium_xpath


class TencentSpider(CrawlSpider):
    name = 'tencent'
    allowed_domains = ['si3.bcentral.cl']
    start_urls = ['https://si3.bcentral.cl/Siete']

    # the extract function in response,return a list match the rule
    list_pagelink=selenium_scrapy()
    list_pagelink.pagesource(start_urls[0])
    
    #follow item will be used to Recursive crawling
    rules=[
        # get the link in pagelink, send the request one by one,
        # call the specified function to process 
        Rule(list_pagelink.url_list,callback="parseTencent",follow=True)
    ]

    def list_url(self,response):
        list_url_ = [ "/ES/Siete/Cuadro/" +self.url_title(t.get())  for t in response.xpath('//a[starts-with(@onclick, "SetCapitu")]//@onclick') ]
        print(list_url_)
        return list_url_

    def url_title(self,t):
        replacements = [
          ("'",""),
          (")",""),
          ("(",""),
          (", ","/"),
          ("SetCapitulo",""),
        ]
        for x,y in replacements:
          t=t.replace(x,y)

        return t

    # specified process function
    def parseTencent(self,response):
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<",response.url)
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item={}#TenspiderItem()
            #store the data into dict item
            #extract will transform the data into unicode string
            item['positionname']=each.xpath('./td[1]/a/text()').extract()[0]
            item['positionlink']=each.xpath('./td[1]/a/@href').extract()[0]
            item['positionType']=each.xpath('./td[2]/text()').extract()[0]
            item['peopleNum']=each.xpath('./td[3]/text()').extract()[0]
            item['workLocation']=each.xpath('./td[4]/text()').extract()[0]
            item['publishTime']=each.xpath('./td[5]/text()').extract()[0]

            print(item)
            yield item

Overwriting /content/chilean_data_explore/chilean_data_explore/spiders/tencent.py


In [56]:
!scrapy crawl tencent

Traceback (most recent call last):
  File "/usr/local/bin/scrapy", line 8, in <module>
    sys.exit(execute())
  File "/usr/local/lib/python3.7/dist-packages/scrapy/cmdline.py", line 153, in execute
    cmd.crawler_process = CrawlerProcess(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/crawler.py", line 304, in __init__
    super().__init__(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/crawler.py", line 181, in __init__
    self.spider_loader = self._get_spider_loader(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/crawler.py", line 175, in _get_spider_loader
    return loader_cls.from_settings(settings.frozencopy())
  File "/usr/local/lib/python3.7/dist-packages/scrapy/spiderloader.py", line 67, in from_settings
    return cls(settings)
  File "/usr/local/lib/python3.7/dist-packages/scrapy/spiderloader.py", line 24, in __init__
    self._load_all_spiders()
  File "/usr/local/lib/python3.7/dist-packages/scrapy/spiderloader.py", line 51

# Storing crawled data 

In [None]:
!python -m pip install boto3


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import boto3
import os
import requests
import tqdm

In [None]:
dynamo_client  =  boto3.resource(service_name = 'data_crawled_dydb',region_name = 'us-east-1',
              aws_access_key_id = 'AKIA3BS5NFXXXXXXX',
              aws_secret_access_key = 'qfGTJL28HrqcbhKCM0t//xxx7gTGG4iNrv3/d94Lsp')

In [None]:
product_table = dynamo_client.Table('institution')
product_table.table_status

product_table_link = dynamo_client.Table('link_collection')
product_table_link.table_status

product_table_file = dynamo_client.Table('file_collection')
product_table_file.table_status

In [None]:
dynamo_client.get_available_subresources()

In [None]:
def query_police_department_record_by_guid(guid):  
    db = dynamodb_resource()
    extra_msg = {"region_name": REGION, "aws_service": "dynamodb", 
        "police_department_table":POLICE_DEPARTMENTS_TABLE,
        "guid":guid}
    log.info(f"Get PD record by GUID", extra=extra_msg)
    pd_table = db.Table(POLICE_DEPARTMENTS_TABLE)
    response = pd_table.get_item(
        Key={
            'guid': guid
            }
    )
    return response['Item']

In [None]:
print(query_police_department_record_by_guid("jlkdajfldskj1312312"))

#References

> DynamoDB and its purposes

*   [A one size fits all database doesn't fit anyone
](https://www.allthingsdistributed.com/2018/06/purpose-built-databases-in-aws.html)

*   [Amazon DynamoDB](https://aws.amazon.com/dynamodb/)

*   [Scaling globally with the new AWS](https://www.allthingsdistributed.com/2022/08/aws-launches-middle-east-uae-region.html)


> Interfaces for reliable connections

*   [DynamoDB and the AWS SDKs](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GettingStarted.html
)

*   [Boto3 and Amazon DynamoDB](https://www.section.io/engineering-education/python-boto3-and-amazon-dynamodb-programming-tutorial/)

*   [DynamoDb in Python using BOTO3](https://www.analyticsvidhya.com/blog/2022/05/working-with-dynamodb-in-python-using-boto3/)

> Scrapy 

* [Google Colab tips: using both %%writefile magic and %%javascript magic in the same cell
](https://stephencowchau.medium.com/google-colab-tips-using-both-writefile-magic-and-javascript-magic-in-the-same-cell-7820e508e455)

* [Scrapy - User Agents and Proxies](https://scrapeops.io/python-scrapy-playbook/scrapy-beginners-guide-user-agents-proxies/)

* [Scrapy - LinkExtractors](https://www.tutorialspoint.com/scrapy/scrapy_link_extractors.htm)

> Xpath

* [Parsing HTML with Xpath](https://scrapfly.io/blog/parsing-html-with-xpath/)

* [Scrapy - User Agents and Proxies](https://scrapeops.io/python-scrapy-playbook/scrapy-beginners-guide-user-agents-proxies/)

* [XPath tester](https://extendsclass.com/xpath-tester.html)

*   [XPath tester codebeautify](https://codebeautify.org/Xpath-Tester)

*   [Xpath for python: is xpath underappreciated?](https://towardsdatascience.com/xpath-for-python-89f4423415e0)

> Selenium

*   [Selenium webdriver in colab](https://blog.devgenius.io/use-selenium-webdriver-in-google-colab-d5f2dba1d9f5)

*   [Selenium - Navigation](https://selenium-python.readthedocs.io/navigating.html?highlight=driver.find_element#drag-and-drop)


