# Data crawling with scrapy

In [9]:
!pip install scrapy 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
!scrapy startproject chilean_data_explore

Error: Module 'chilean_data_explore' already exists


In [None]:
import os


!pwd
!ls


In [11]:
os.chdir('/content/chilean_data_explore/chilean_data_explore/')
!pwd
!ls

/content/chilean_data_explore/chilean_data_explore
__init__.py  middlewares.py  __pycache__  spiders
items.py     pipelines.py    settings.py


In [12]:
from chilean_data_explore.chilean_data_explore.items import ChileanDataExploreItem

In [13]:
%%writefile items.py

import scrapy

class ChileanDataExploreItem(scrapy.Item):
    url = scrapy.Field() # str
    article_from = scrapy.Field() # str
    article_type = scrapy.Field() # str
    title = scrapy.Field() # str
    publish_date = scrapy.Field() # str
    authors = scrapy.Field() # list json
    tags = scrapy.Field() # list json
    text = scrapy.Field() # list json
    text_html = scrapy.Field() # str
    images = scrapy.Field() # list json
    video = scrapy.Field() # list json
    links = scrapy.Field() # list json

from scrapy.item import Item, Field
class PropertiesItem(Item):
     # Primary fields
     title = Field()
     price = Field()
     description = Field()
     address = Field()
     image_urls = Field()

     # Calculated fields
     images = Field()
     location = Field()
     
     # Housekeeping fields
     url = Field()
     project = Field()
     spider = Field()
     server = Field()
     date = Field()

Overwriting items.py


In [14]:
!pwd
os.chdir('/content/chilean_data_explore/chilean_data_explore/spiders')
!mkdir logo

/content/chilean_data_explore/chilean_data_explore
mkdir: cannot create directory ‘logo’: File exists


In [24]:
#%%writefile -a /chilean_data_explore/chilean_data_explore/spiders/quotes_spider.py
%%writefile quotes_spider.py
import scrapy
from chilean_data_explore.items import ChileanDataExploreItem

import time
import re

class ChinatimesSpider(scrapy.Spider):
    name = 'chinatimes'
    allowed_domains = ['chinatimes.com']
    base_url = 'https://www.chinatimes.com'

    url_array = [
      'http://observatorio.ministeriodesarrollosocial.gob.cl/encuesta-casen',
      'https://www.sii.cl/sobre_el_sii/estadisticas_de_empresas.html',
      'https://www.ide.cl/index.php/informacion-territorial/descargar-informacion-territorial',
    ]

    date_str = str(time.strftime("%F", time.localtime()))

    #custom_settings = {
    #    'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
    #}

    def start_requests(self):
        list_url = '%s/realtimenews' % (self.base_url)
        yield scrapy.Request(url=list_url, callback=self.parse_list)

    def parse_list(self, response):
        for page_url in response.css('section.article-list>ul>li h3.title>a::attr(href)').getall():
            yield scrapy.Request(url=self.base_url + page_url, callback=self.parse_news)

    def parse_news(self, response):
        item = ChileanDataExploreItem()

        item['url'] = response.url
        #item['article_from'] = self.name
        #item['article_type'] = 'news'

        #item['title'] = self._parse_title(response)
        #item['publish_date'] = self._parse_publish_date(response)
        #item['authors'] = self._parse_authors(response)
        #item['tags'] = self._parse_tags(response)
        #item['text'] = self._parse_text(response)
        #item['text_html'] = self._parse_text_html(response)
        #item['images'] = self._parse_images(response)
        #item['video'] = self._parse_video(response)
        #item['links'] = self._parse_links(response)

        return item

    def _parse_title(self, response):
        return response.css('article.article-box h1.article-title::text').get()

    def _parse_publish_date(self, response):
        return response.css('article.article-box time::attr(datetime)').get()

    def _parse_authors(self, response):
        authors = response.css('article.article-box div.author>a::text').getall()
        if len(authors) == 0:
            authors = [response.css('article.article-box div.author::text').get(default='').strip()]
        return authors

    def _parse_tags(self, response):
        return response.css('article.article-box div.article-hash-tag a::text').getall()

    def _parse_text(self, response):
        return response.css('article.article-box div.article-body p::text').getall()

    def _parse_text_html(self, response):
        return response.css('article.article-box div.article-body').get()

    def _parse_images(self, response):
        images_list = []
        images_list.extend(response.css('article.article-box div.main-figure').css('img::attr(src)').getall())
        images_list.extend(response.css('article.article-box div.article-body').css('img::attr(src)').getall())
        return images_list

    def _parse_video(self, response):
        return response.css('article.article-box div.article-body iframe::attr(src)').getall()

    def _parse_links(self, response):
        return response.css('article.article-box div.article-body').css('a::attr(href)').getall()

Overwriting quotes_spider.py


In [25]:
!scrapy runspider quotes_spider.py -o quotes.json

2022-10-25 03:05:11 [scrapy.utils.log] INFO: Scrapy 2.7.0 started (bot: chilean_data_explore)
2022-10-25 03:05:11 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.5 5 Jul 2022), cryptography 38.0.1, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-10-25 03:05:11 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_LOADER_WARN_ONLY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-10-25 03:05:11 [asyncio] DEBUG: Using selector: EpollSelector
2022-10-25 03:05:11 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2

In [26]:
os.chdir('/content/chilean_data_explore/')
!pwd | ls -l


total 8
drwxr-xr-x 4 root root 4096 Oct 25 02:38 chilean_data_explore
-rw-r--r-- 1 root root  283 Oct 25 02:31 scrapy.cfg


In [27]:
!scrapy genspider basic web

Created spider 'basic' using template 'basic' in module:
  chilean_data_explore.spiders.basic


In [35]:
%%writefile /content/chilean_data_explore/chilean_data_explore/spiders/basic.py
import scrapy
from chilean_data_explore.items import PropertiesItem 

class BasicSpider(scrapy.Spider):
    name = 'basic'
    allowed_domains = ['bcentral.cl']
    start_urls = ['https://si3.bcentral.cl/siete']

    def parse(self, response):
      item = PropertiesItem()
      item['title'] = response.xpath('//title/text()'[0]).extract()
      item['price'] = response.xpath('//*[@itemprop="price"][1]/text()').re('[.0-9]+')
      item['description'] = response.xpath('//*[@itemprop="description"][1]/text()').extract()

      return item

Overwriting /content/chilean_data_explore/chilean_data_explore/spiders/basic.py


In [36]:
!scrapy crawl basic

2022-10-25 03:36:02 [scrapy.utils.log] INFO: Scrapy 2.7.0 started (bot: chilean_data_explore)
2022-10-25 03:36:02 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.7.15 (default, Oct 12 2022, 19:14:55) - [GCC 7.5.0], pyOpenSSL 22.1.0 (OpenSSL 3.0.5 5 Jul 2022), cryptography 38.0.1, Platform Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
2022-10-25 03:36:02 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'chilean_data_explore',
 'NEWSPIDER_MODULE': 'chilean_data_explore.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['chilean_data_explore.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2022-10-25 03:36:02 [asyncio] DEBUG: Using selector: EpollSelector
2022-10-25 03:36:02 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2022-10-25 03:36:02 [scrapy.utils.l

# Storing crawled data 

In [None]:
!python -m pip install boto3


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import boto3
import os
import requests
import tqdm

In [None]:
dynamo_client  =  boto3.resource(service_name = 'data_crawled_dydb',region_name = 'us-east-1',
              aws_access_key_id = 'AKIA3BS5NFXXXXXXX',
              aws_secret_access_key = 'qfGTJL28HrqcbhKCM0t//xxx7gTGG4iNrv3/d94Lsp')

In [None]:
product_table = dynamo_client.Table('institution')
product_table.table_status

product_table_link = dynamo_client.Table('link_collection')
product_table_link.table_status

product_table_file = dynamo_client.Table('file_collection')
product_table_file.table_status

In [None]:
dynamo_client.get_available_subresources()

In [None]:
def query_police_department_record_by_guid(guid):  
    db = dynamodb_resource()
    extra_msg = {"region_name": REGION, "aws_service": "dynamodb", 
        "police_department_table":POLICE_DEPARTMENTS_TABLE,
        "guid":guid}
    log.info(f"Get PD record by GUID", extra=extra_msg)
    pd_table = db.Table(POLICE_DEPARTMENTS_TABLE)
    response = pd_table.get_item(
        Key={
            'guid': guid
            }
    )
    return response['Item']

In [None]:
print(query_police_department_record_by_guid("jlkdajfldskj1312312"))

#References

> DynamoDB and its purposes

*   [A one size fits all database doesn't fit anyone
](https://www.allthingsdistributed.com/2018/06/purpose-built-databases-in-aws.html)

*   [Amazon DynamoDB](https://aws.amazon.com/dynamodb/)

*   [Scaling globally with the new AWS](https://www.allthingsdistributed.com/2022/08/aws-launches-middle-east-uae-region.html)


> Interfaces for reliable connections

*   [DynamoDB and the AWS SDKs](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GettingStarted.html
)

*   [Boto3 and Amazon DynamoDB](https://www.section.io/engineering-education/python-boto3-and-amazon-dynamodb-programming-tutorial/)

*   [DynamoDb in Python using BOTO3](https://www.analyticsvidhya.com/blog/2022/05/working-with-dynamodb-in-python-using-boto3/)

> Scrapy 

* [Google Colab tips: using both %%writefile magic and %%javascript magic in the same cell
](https://stephencowchau.medium.com/google-colab-tips-using-both-writefile-magic-and-javascript-magic-in-the-same-cell-7820e508e455)
