# Infomoney Links Scraping


Making the necessary imports


In [None]:
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Any
from xml.etree.ElementTree import Element

from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Spider

Instantiating a Scrapy spider called `BigSpooder` and pointing it at the Infomoney sitemap index (an `xml` file).

After that, we go through every link in the index that points to another sitemap, keeping only the news (post) sitemaps that were last modified from September 2022 onwards.

During this process we write a CSV file containing all the links.

> In a future iteration we could send these links directly to the news scraper or store them in a database. For educational purposes, we keep the link-scraping step and the news-scraping step separate.


In [None]:
class BigSpooder(Spider):
    """Collect Infomoney article links by walking the site's XML sitemaps.

    The crawl starts at the sitemap index. ``parse`` follows every
    ``post-sitemap`` entry that was last modified in or after
    ``_FIRST_PERIOD``; ``getLinks`` then yields one ``{"link": url}`` item
    for each URL in those sitemaps whose ``lastmod`` year is at least
    ``_FIRST_LINK_YEAR``.
    """

    name: str = "BigSpooder"  # Spider's name
    # Site to be crawled
    start_urls: list[str] = ["https://www.infomoney.com.br/sitemap_index.xml"]
    namespaces: dict[str, str] = {
        "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9"
    }

    # Earliest (year, month) of a sitemap's ``lastmod`` that is still followed.
    # Compared as a tuple so later years need no further code changes.
    _FIRST_PERIOD: tuple[int, int] = (2022, 9)
    # Earliest ``lastmod`` year of an individual link that is still exported.
    _FIRST_LINK_YEAR: int = 2022
    # Raw string so ``\d`` is a regex class rather than an invalid string
    # escape; the dot before ``xml`` is escaped so it only matches a literal
    # ".".
    _POST_SITEMAP_RE: re.Pattern[str] = re.compile(
        r".*(/post-sitemap\d*\.xml)$"
    )
    _TIMESTAMP_FORMAT: str = "%Y-%m-%dT%H:%M:%S%z"

    def _read_entry(self, raw_xml: str) -> tuple[str, datetime] | None:
        """Extract ``(loc, lastmod)`` from one serialized sitemap entry.

        Args:
            raw_xml: A single ``<sitemap>`` or ``<url>`` element as a string.

        Returns:
            The entry's location and its parsed modification time, or
            ``None`` when either child element is missing or empty, or when
            the timestamp does not match ``_TIMESTAMP_FORMAT``.
        """
        ns: str = self.namespaces["xmlns"]
        node: Element = ET.fromstring(raw_xml)
        loc: str | None = node.findtext(f"{{{ns}}}loc")
        lastmod: str | None = node.findtext(f"{{{ns}}}lastmod")
        # ``findtext`` gives None for a missing child and "" for an empty
        # one; neither can be filtered by date, so the entry is skipped
        # instead of raising and aborting the whole callback.
        if not loc or not lastmod:
            return None
        try:
            stamp: datetime = datetime.strptime(
                lastmod.strip(), self._TIMESTAMP_FORMAT
            )
        except ValueError:
            self.logger.warning(
                "Skipping %s: unparseable lastmod %r", loc, lastmod
            )
            return None
        return loc.strip(), stamp

    def parse(self, response):
        """Schedule a request for every recent post sitemap in the index.

        Args:
            response: The downloaded sitemap index.

        Yields:
            A ``Request`` (handled by ``getLinks``) for each entry whose URL
            matches ``_POST_SITEMAP_RE`` and whose ``lastmod`` falls in or
            after ``_FIRST_PERIOD``.
        """
        entries: list[str] = response.xpath(
            "//xmlns:sitemap", namespaces=self.namespaces
        ).getall()
        for raw_xml in entries:
            entry = self._read_entry(raw_xml)
            if entry is None:
                continue
            loc, stamp = entry
            is_recent: bool = (stamp.year, stamp.month) >= self._FIRST_PERIOD
            if is_recent and self._POST_SITEMAP_RE.match(loc):
                yield Request(loc, callback=self.getLinks)

    def getLinks(self, response):
        """Emit one item per article URL found in a post sitemap.

        Args:
            response: A downloaded post sitemap.

        Yields:
            ``{"link": url}`` for each ``<url>`` entry whose ``lastmod`` year
            is at least ``_FIRST_LINK_YEAR``.
        """
        self.logger.info("we hot")
        entries: list[str] = response.xpath(
            "//xmlns:url", namespaces=self.namespaces
        ).getall()
        for raw_xml in entries:
            entry = self._read_entry(raw_xml)
            if entry is None:
                continue
            loc, stamp = entry
            if stamp.year >= self._FIRST_LINK_YEAR:
                yield {"link": loc}


# Crawler settings: export every scraped item to a CSV feed and pin the
# request fingerprinter implementation.
crawler_settings: dict[str, Any] = {
    "FEEDS": {"infomoney-links.csv": {"format": "csv"}},
    "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
}

# Register the spider, then run the crawl; ``start()`` is called last because
# it launches the crawl that was registered above.
process: Any = CrawlerProcess(settings=crawler_settings)
process.crawl(BigSpooder)
process.start()

# Scrapy runs on the Twisted reactor, which cannot be restarted once it has stopped, so to run the spider again we must restart the kernel first.