From c966788ec3ef7328a67fc475f113d0470aa71449 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Tue, 10 Oct 2023 03:10:39 -0300 Subject: [PATCH 01/16] Feat: Iniciando projeto com Scrapy --- scraping/scraping/__init__.py | 0 scraping/scraping/items.py | 12 +++ scraping/scraping/middlewares.py | 103 ++++++++++++++++++++++++++ scraping/scraping/pipelines.py | 13 ++++ scraping/scraping/settings.py | 93 +++++++++++++++++++++++ scraping/scraping/spiders/__init__.py | 4 + scraping/scrapy.cfg | 11 +++ 7 files changed, 236 insertions(+) create mode 100644 scraping/scraping/__init__.py create mode 100644 scraping/scraping/items.py create mode 100644 scraping/scraping/middlewares.py create mode 100644 scraping/scraping/pipelines.py create mode 100644 scraping/scraping/settings.py create mode 100644 scraping/scraping/spiders/__init__.py create mode 100644 scraping/scrapy.cfg diff --git a/scraping/scraping/__init__.py b/scraping/scraping/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scraping/scraping/items.py b/scraping/scraping/items.py new file mode 100644 index 0000000..0fe9a20 --- /dev/null +++ b/scraping/scraping/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ScrapingItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py new file mode 100644 index 0000000..8445d6c --- /dev/null +++ b/scraping/scraping/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class ScrapingSpiderMiddleware: + # Not all methods need to be 
defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class ScrapingDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. 
+ + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py new file mode 100644 index 0000000..db116b6 --- /dev/null +++ b/scraping/scraping/pipelines.py @@ -0,0 +1,13 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class ScrapingPipeline: + def process_item(self, item, spider): + return item diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py new file mode 100644 index 0000000..8a28084 --- /dev/null +++ b/scraping/scraping/settings.py @@ -0,0 +1,93 @@ +# Scrapy settings for scraping project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "scraping" + +SPIDER_MODULES = ["scraping.spiders"] +NEWSPIDER_MODULE = "scraping.spiders" + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "scraping (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "scraping.middlewares.ScrapingSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# "scraping.middlewares.ScrapingDownloaderMiddleware": 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See 
https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# "scraping.pipelines.ScrapingPipeline": 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" diff --git a/scraping/scraping/spiders/__init__.py b/scraping/scraping/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/scraping/scraping/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/scraping/scrapy.cfg b/scraping/scrapy.cfg new file mode 100644 index 0000000..0e0aaa1 --- /dev/null +++ b/scraping/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = scraping.settings + +[deploy] +#url = http://localhost:6800/ +project = scraping From a0631c0421fc36bf4f67b6e4b8d9345ee690985c Mon Sep 17 00:00:00 2001 From: mauro tony Date: Tue, 10 Oct 2023 03:35:55 -0300 Subject: [PATCH 02/16] Feat: Adicionando scraping do site https://www.scrapethissite.com/pages/simple/ --- scraping/scraping/spiders/countries_spider.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 scraping/scraping/spiders/countries_spider.py diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py new file mode 100644 index 0000000..4001b74 --- /dev/null +++ b/scraping/scraping/spiders/countries_spider.py @@ -0,0 +1,24 @@ +import scrapy + +class CountriesSpider(scrapy.Spider): + name = "countries" + start_urls = [ + 'https://www.scrapethissite.com/pages/simple/' + ] + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', + 'Accept-Language': 'pt-BR' + } + + def start_requests(self): + for url in self.start_urls: + yield scrapy.Request(url, headers=self.headers, callback=self.parse) + + def parse(self, response): + for country in response.css('.country'): + yield { + 'name': country.css('h3.country-name').xpath('normalize-space(.)').get(), + 'capital': country.css('.country-info .country-capital::text').get(), + 'population': country.css('.country-info .country-population::text').get(), + 'area': country.css('.country-info .country-area::text').get(), + } From d60c7f1e8a4c86a32cf989d7c5a15819a14112c4 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Tue, 
10 Oct 2023 03:44:34 -0300 Subject: [PATCH 03/16] =?UTF-8?q?Feat:=20Identifica=C3=A7=C3=A3o=20e=20man?= =?UTF-8?q?ipula=C3=A7=C3=A3o=20de=20headers=20(User-Agent)=20para=20simul?= =?UTF-8?q?ar=20diferentes=20browsers=20ou=20dispositivos.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scraping/scraping/middlewares.py | 93 ++----------------- scraping/scraping/settings.py | 17 +++- scraping/scraping/spiders/countries_spider.py | 8 +- 3 files changed, 23 insertions(+), 95 deletions(-) diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py index 8445d6c..77d38ae 100644 --- a/scraping/scraping/middlewares.py +++ b/scraping/scraping/middlewares.py @@ -3,101 +3,20 @@ # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html +import random from scrapy import signals # useful for handling different item types with a single interface from itemadapter import is_item, ItemAdapter -class ScrapingSpiderMiddleware: - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. +class RandomUserAgentMiddleware: + def __init__(self, user_agents): + self.user_agents = user_agents @classmethod def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, or item objects. 
- for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request or item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info("Spider opened: %s" % spider.name) - - -class ScrapingDownloaderMiddleware: - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s + return cls(user_agents=crawler.settings.getlist('USER_AGENTS')) def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info("Spider opened: %s" % spider.name) + request.headers.setdefault('User-Agent', random.choice(self.user_agents)) diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py index 8a28084..5fcbe07 100644 --- a/scraping/scraping/settings.py +++ b/scraping/scraping/settings.py @@ -50,9 +50,9 @@ # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# "scraping.middlewares.ScrapingDownloaderMiddleware": 543, -#} +DOWNLOADER_MIDDLEWARES = { + "scraping.middlewares.RandomUserAgentMiddleware": 100, +} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html @@ -91,3 +91,14 @@ REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" FEED_EXPORT_ENCODING = "utf-8" + +USER_AGENTS = [ + # Chrome 91 Windows 10 + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + # Firefox 89 Windows 10 + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0", + # Safari 14 macOS + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", + # iPhone X Safari + "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1" +] \ No newline at end of file diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py index 4001b74..28a58e1 100644 --- a/scraping/scraping/spiders/countries_spider.py +++ b/scraping/scraping/spiders/countries_spider.py @@ -5,17 
+5,15 @@ class CountriesSpider(scrapy.Spider): start_urls = [ 'https://www.scrapethissite.com/pages/simple/' ] - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', - 'Accept-Language': 'pt-BR' - } def start_requests(self): for url in self.start_urls: - yield scrapy.Request(url, headers=self.headers, callback=self.parse) + yield scrapy.Request(url, callback=self.parse) def parse(self, response): for country in response.css('.country'): + # Exibe user-agent + self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8')) yield { 'name': country.css('h3.country-name').xpath('normalize-space(.)').get(), 'capital': country.css('.country-info .country-capital::text').get(), From ec3ca5bc4c00026fe4ee400acb0117be8545c204 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Tue, 10 Oct 2023 04:04:27 -0300 Subject: [PATCH 04/16] Feat: Adicionando uso de proxies rotativos --- scraping/scraping/middlewares.py | 24 ++++++++++++++++++++++++ scraping/scraping/settings.py | 10 +++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py index 77d38ae..cf19f95 100644 --- a/scraping/scraping/middlewares.py +++ b/scraping/scraping/middlewares.py @@ -4,6 +4,7 @@ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html import random +import logging from scrapy import signals # useful for handling different item types with a single interface @@ -20,3 +21,26 @@ def from_crawler(cls, crawler): def process_request(self, request, spider): request.headers.setdefault('User-Agent', random.choice(self.user_agents)) + + +class RandomProxyMiddleware: + def __init__(self, proxies): + self.proxies = proxies + self.logger = logging.getLogger(__name__) + + @classmethod + def from_crawler(cls, crawler): + return cls(proxies=crawler.settings.getlist('PROXIES')) + + def process_request(self, request, 
spider): + proxy = random.choice(self.proxies) + request.meta['proxy'] = proxy + + def process_exception(self, request, exception, spider): + if isinstance(exception, (ConnectionRefusedError, TimeoutError, )): + self.logger.warning(f"Failed to connect using proxy {request.meta['proxy']}, retrying a different proxy...") + new_request = request.copy() + new_request.dont_filter = True + new_request.priority = request.priority + 1 + return new_request + return None \ No newline at end of file diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py index 5fcbe07..46f64a4 100644 --- a/scraping/scraping/settings.py +++ b/scraping/scraping/settings.py @@ -51,7 +51,8 @@ # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { - "scraping.middlewares.RandomUserAgentMiddleware": 100, + "scraping.middlewares.RandomUserAgentMiddleware": 200, + "scraping.middlewares.RandomProxyMiddleware": 100, } # Enable or disable extensions @@ -101,4 +102,11 @@ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", # iPhone X Safari "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1" +] + +PROXIES = [ + 'http://20.206.106.192:8123', + 'http://177.12.118.160:80', + 'http://138.204.95.166:8080', + 'http://191.243.46.162:43241' ] \ No newline at end of file From 80fba962fb9fc961f9a92800b85e8f6fa4257908 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Wed, 11 Oct 2023 09:13:11 -0300 Subject: [PATCH 05/16] =?UTF-8?q?Feat:=20Implementando=20integra=C3=A7?= =?UTF-8?q?=C3=A3o=20com=20o=20kafka?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker-compose.yml | 20 ++++++++ scraping/scraping/items.py | 7 +-- scraping/scraping/pipelines.py | 46 ++++++++++++++++++- scraping/scraping/settings.py | 20 
++++---- scraping/scraping/spiders/countries_spider.py | 21 +++++---- 5 files changed, 93 insertions(+), 21 deletions(-) create mode 100644 docker-compose.yml diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..6797d8b --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,20 @@ +version: '3.8' + +services: + zookeeper: + image: confluentinc/cp-zookeeper:latest + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + + kafka: + image: confluentinc/cp-kafka:latest + depends_on: + - zookeeper + ports: + - "9092:9092" + environment: + KAFKA_BROKER_ID: 1 + KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true" \ No newline at end of file diff --git a/scraping/scraping/items.py b/scraping/scraping/items.py index 0fe9a20..a79daf0 100644 --- a/scraping/scraping/items.py +++ b/scraping/scraping/items.py @@ -7,6 +7,7 @@ class ScrapingItem(scrapy.Item): - # define the fields for your item here like: - # name = scrapy.Field() - pass + nameCountry = scrapy.Field() + capitalCountry = scrapy.Field() + populationCountry = scrapy.Field() + areaCountry = scrapy.Field() diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py index db116b6..87b0fe3 100644 --- a/scraping/scraping/pipelines.py +++ b/scraping/scraping/pipelines.py @@ -2,12 +2,54 @@ # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - +import json # useful for handling different item types with a single interface from itemadapter import ItemAdapter +from scrapy.exceptions import DropItem +from confluent_kafka import Producer + +class DataProcessingPipeline: -class ScrapingPipeline: def process_item(self, item, spider): + if not item.get('populationCountry') or not item.get('areaCountry'): + raise DropItem("Item faltando campos necessários") + + try: + 
item['populationCountry'] = int(item['populationCountry']) + item['areaCountry'] = float(item['areaCountry']) + except ValueError: + raise DropItem("Não foi possível converter os dados") return item + + +class KafkaPipeline: + + def __init__(self, kafka_broker, kafka_topic): + self.kafka_broker = kafka_broker + self.kafka_topic = kafka_topic + self.items = [] + + @classmethod + def from_crawler(cls, crawler): + return cls( + kafka_broker=crawler.settings.get('KAFKA_BROKER'), + kafka_topic=crawler.settings.get('KAFKA_TOPIC') + ) + + def open_spider(self, spider): + self.producer = Producer({'bootstrap.servers': self.kafka_broker}) + + def close_spider(self, spider): + self.process_all_items() + self.producer.flush() + + def process_item(self, item, spider): + self.items.append(dict(item)) + return item + + def process_all_items(self): + if self.items: + content = json.dumps(self.items) + self.producer.produce(self.kafka_topic, content) diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py index 46f64a4..6618a33 100644 --- a/scraping/scraping/settings.py +++ b/scraping/scraping/settings.py @@ -63,9 +63,10 @@ # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# "scraping.pipelines.ScrapingPipeline": 300, -#} +ITEM_PIPELINES = { + "scraping.pipelines.DataProcessingPipeline": 300, + "scraping.pipelines.KafkaPipeline": 400, +} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html @@ -105,8 +106,11 @@ ] PROXIES = [ - 'http://20.206.106.192:8123', - 'http://177.12.118.160:80', - 'http://138.204.95.166:8080', - 'http://191.243.46.162:43241' -] \ No newline at end of file + 'http://181.191.94.126:8999', + 'http://201.91.82.155:3128', + 'http://191.243.46.162:43241', + 'http://20.206.106.192:80', +] + +KAFKA_BROKER = 'localhost:9092' +KAFKA_TOPIC = 'scraping' \ No newline at end of file diff --git 
a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py index 28a58e1..d257081 100644 --- a/scraping/scraping/spiders/countries_spider.py +++ b/scraping/scraping/spiders/countries_spider.py @@ -1,4 +1,5 @@ import scrapy +from ..items import ScrapingItem class CountriesSpider(scrapy.Spider): name = "countries" @@ -11,12 +12,16 @@ def start_requests(self): yield scrapy.Request(url, callback=self.parse) def parse(self, response): + self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8')) + self.logger.info("Proxy: " + response.request.meta['proxy']) + self.logger.info("Response status: " + str(response.status)) + self.logger.info("Coletando dados...") + for country in response.css('.country'): - # Exibe user-agent - self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8')) - yield { - 'name': country.css('h3.country-name').xpath('normalize-space(.)').get(), - 'capital': country.css('.country-info .country-capital::text').get(), - 'population': country.css('.country-info .country-population::text').get(), - 'area': country.css('.country-info .country-area::text').get(), - } + content = ScrapingItem( + nameCountry=country.css('h3.country-name').xpath('normalize-space(.)').get(), + capitalCountry=country.css('.country-info .country-capital::text').get(), + populationCountry=country.css('.country-info .country-population::text').get(), + areaCountry=country.css('.country-info .country-area::text').get(), + ) + yield content From 8b8f07b7a6e7a65e847f06768dc11573c23442f3 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:38:32 -0300 Subject: [PATCH 06/16] =?UTF-8?q?Feat:=20Implementando=20frontend=20para?= =?UTF-8?q?=20visualiza=C3=A7=C3=A3o=20basica=20dos=20dados=20com=20stream?= =?UTF-8?q?lit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- streamlit-frontend.py | 62 
+++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 streamlit-frontend.py diff --git a/streamlit-frontend.py b/streamlit-frontend.py new file mode 100644 index 0000000..f4a13c2 --- /dev/null +++ b/streamlit-frontend.py @@ -0,0 +1,62 @@ +import streamlit as st +import pandas as pd +import json +from confluent_kafka import Consumer + + +def kafka_config(): + return { + 'bootstrap.servers': 'localhost:9092', + 'group.id': 'streamlit111-group', + 'auto.offset.reset': 'earliest' + } + + +def consume_message(consumer): + msg = consumer.poll(1.0) + if msg is None: + return None, False + elif msg.error(): + st.write(f"Error: {msg.error()}") + return None, False + else: + return json.loads(msg.value().decode('utf-8')), True + + +def process_dataframe(record): + df = pd.DataFrame(record) + df["Densidade demográfica"] = df['populationCountry'] / df['areaCountry'] + df.rename(columns={'nameCountry': 'País', 'populationCountry': 'População', 'areaCountry': 'Área'}, inplace=True) + return df.sort_values(by='População', ascending=False).head(10), df + + +def main(): + st.title("Dashboard de dados do Kafka") + df_null = pd.DataFrame() + st.write("Top 10 Países por Densidade Demográfica") + chart = st.bar_chart(df_null) + + if 'msg_recived' not in st.session_state: + st.session_state['msg_recived'] = False + + if not st.session_state['msg_recived']: + consumer = Consumer(kafka_config()) + consumer.subscribe(['scraping']) + + while not st.session_state['msg_recived']: + record, is_msg_received = consume_message(consumer) + if is_msg_received: + df_highest_population, df = process_dataframe(record) + col1, col2 = st.columns(2) + with col1: + st.write("Tabela Completa") + st.write(df) + with col2: + chart.bar_chart(df_highest_population, y='Densidade demográfica', x='País', width=1000, height=700) + + consumer.close() + st.session_state['msg_recived'] = True + + +if __name__ == "__main__": + main() From 
1c2d5351e9dca05eee5101d6b6ab2f9f76d2f88b Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:39:51 -0300 Subject: [PATCH 07/16] =?UTF-8?q?Feat:=20Adicionando=20sistema=20de=20envi?= =?UTF-8?q?roments=20para=20obten=C3=A7=C3=A3o=20de=20variaveis=20de=20amb?= =?UTF-8?q?iente.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env | 2 ++ config.py | 13 +++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 .env create mode 100644 config.py diff --git a/.env b/.env new file mode 100644 index 0000000..f642015 --- /dev/null +++ b/.env @@ -0,0 +1,2 @@ +KAFKA_BROKER=localhost:9092 +KAFKA_TOPIC=scraping \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000..66116e9 --- /dev/null +++ b/config.py @@ -0,0 +1,13 @@ +import os +from dotenv import find_dotenv, load_dotenv + +load_dotenv(find_dotenv()) + + +class GeneralConfig: + KAFKA_TOPIC: str = os.getenv('KAFKA_TOPIC') + KAFKA_BROKER: str = os.getenv('KAFKA_BROKER') + + +def get_config() -> GeneralConfig: + return GeneralConfig() From 496d52ea15529c355a140f73502652b4ce04fef3 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:41:01 -0300 Subject: [PATCH 08/16] build: Ajustando docker compose --- docker-compose.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 6797d8b..1499032 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,4 +17,5 @@ services: KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 - KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true" \ No newline at end of file + KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true" + restart: on-failure \ No newline at end of file From 4895171999783df3c9afbd85a0e5f76cdb773916 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:42:45 -0300 Subject: [PATCH 09/16] 
=?UTF-8?q?refactor:=20refatorando=20comentarios=20p?= =?UTF-8?q?adr=C3=A3o=20do=20scrapy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scraping/scraping/items.py | 5 ----- scraping/scraping/middlewares.py | 2 -- scraping/scraping/pipelines.py | 6 ------ 3 files changed, 13 deletions(-) diff --git a/scraping/scraping/items.py b/scraping/scraping/items.py index a79daf0..a0a859c 100644 --- a/scraping/scraping/items.py +++ b/scraping/scraping/items.py @@ -1,8 +1,3 @@ -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - import scrapy diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py index cf19f95..c509e80 100644 --- a/scraping/scraping/middlewares.py +++ b/scraping/scraping/middlewares.py @@ -6,8 +6,6 @@ import random import logging from scrapy import signals - -# useful for handling different item types with a single interface from itemadapter import is_item, ItemAdapter diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py index 87b0fe3..d671f2e 100644 --- a/scraping/scraping/pipelines.py +++ b/scraping/scraping/pipelines.py @@ -1,10 +1,4 @@ -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html import json - -# useful for handling different item types with a single interface from itemadapter import ItemAdapter from scrapy.exceptions import DropItem from confluent_kafka import Producer From a20921e0e67073963a4c9718b3ebbe5de93ebcb0 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:43:56 -0300 Subject: [PATCH 10/16] refactor: Ajustando dados para obter da env --- scraping/scraping/settings.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py index 6618a33..d488b21 100644 
--- a/scraping/scraping/settings.py +++ b/scraping/scraping/settings.py @@ -6,6 +6,7 @@ # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html +from config import get_config BOT_NAME = "scraping" @@ -52,7 +53,7 @@ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { "scraping.middlewares.RandomUserAgentMiddleware": 200, - "scraping.middlewares.RandomProxyMiddleware": 100, + # "scraping.middlewares.RandomProxyMiddleware": 100, } # Enable or disable extensions @@ -112,5 +113,5 @@ 'http://20.206.106.192:80', ] -KAFKA_BROKER = 'localhost:9092' -KAFKA_TOPIC = 'scraping' \ No newline at end of file +KAFKA_BROKER = get_config().KAFKA_BROKER +KAFKA_TOPIC = get_config().KAFKA_TOPIC \ No newline at end of file From 5d093ff48d8cfd0774994d88b45e1c95bbb35687 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:44:17 -0300 Subject: [PATCH 11/16] feat: Adicionando script que executa o crawler --- runCrawler.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 runCrawler.py diff --git a/runCrawler.py b/runCrawler.py new file mode 100644 index 0000000..bba6340 --- /dev/null +++ b/runCrawler.py @@ -0,0 +1,7 @@ +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings +from scraping.scraping.spiders.countries_spider import CountriesSpider + +process = CrawlerProcess(get_project_settings()) +process.crawl(CountriesSpider) +process.start() From 75a52731dfca784b16ea9e4a9ee327798a63b647 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:45:21 -0300 Subject: [PATCH 12/16] refactor: Ajusando logger --- scraping/scraping/spiders/countries_spider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py index d257081..672d682 100644 --- 
a/scraping/scraping/spiders/countries_spider.py +++ b/scraping/scraping/spiders/countries_spider.py @@ -13,7 +13,6 @@ def start_requests(self): def parse(self, response): self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8')) - self.logger.info("Proxy: " + response.request.meta['proxy']) self.logger.info("Response status: " + str(response.status)) self.logger.info("Coletando dados...") From e420226c0cb521d223ca376edf5aeeb9c19632ed Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:46:17 -0300 Subject: [PATCH 13/16] feat: Reativando middleware controlador dos proxies --- scraping/scraping/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py index d488b21..e175ea5 100644 --- a/scraping/scraping/settings.py +++ b/scraping/scraping/settings.py @@ -53,7 +53,7 @@ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { "scraping.middlewares.RandomUserAgentMiddleware": 200, - # "scraping.middlewares.RandomProxyMiddleware": 100, + "scraping.middlewares.RandomProxyMiddleware": 100, } # Enable or disable extensions From d32ae5455e5296efb8ef2f4116d736a730697a41 Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 14:47:57 -0300 Subject: [PATCH 14/16] docs: Adicionando requirements --- requirements.txt | Bin 0 -> 3246 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6a58d59c32b88ab8106c7cb1187c5dcd0333c31 GIT binary patch literal 3246 zcmZve-EJFK41~{hfIi9wmgJ=EMJ@^yC{VOW0O!7t^%vR7db3)^u^+zee4M4u$_a#( z43mc(4#_$D-@nUpC_`D6ed)_PecqJUWu|9dek!l@|Ig)1=`Jj@(K6^Ex<> zl9gN!@-`Z3Fdk}H#pywLwt9@^N@(!-Sr5qUw>^81XBL@^l`Nn(Vf$Xk{DJg(#BfwZm_O)Q=?r;2qnVe^Vx^ko8|G$#&2e_Jf7JsnBgf;{ z@Paeg9Mp%~u)+Ztm{_cTg!O|k4nj(m#BdI}* 
zv$HV|?Z`0aUBO;m>Y`sTyBE?X<^{IbdiVtu8^W|@tX7-3{I zZj^^z>BgD9oPobb@008|`g<#s#>2eTm}{Y6POzDi)a5pmBBOg3=(<^lPqtt=D#UJ# z4hmO-M;kMf`t8i-0IRjmxruM}q9-3z8@f%UAWKar0`^X4;gIQR^jMkigAx3bi-q`e zJp7rboqn@cd)0Wvp)1?)fc?3)A~ypjz)Yk>m3fjAJdUb*cFsX}zqhhe58~#e8aLxq z!~PV}llZMH`Q~Jo*kfaUimga3%I%q{24_S`eitV>{hsd{i4)coJa@w6-JSM}j@Nwe zg$SP;eg29XeaBUGhNV<5ZXNbvqdM6NcN$+hol|n@?=H1cQTN@OS{WR46JnXGM(2hz z6pI_dNWUl_IninqEF)2iv!Yboh^~|#*U84lw(yIpp`e~_cZlyA#kt5`Sz8E4cAY3w zpC)vIt|uzL4bjnFx#*EC)HOxb3Gs2V@8IxWSh;U>mRpt%m?7*H3AbA>Udgcu^*~e6 zYIG#ekac=yFFi`6)3=qQrUgZ+?`_6I9k4;&-1u~0dYk%f2ruS>`v-4-9B3RT32jthgVw=gbI5{qFev8|HA~gC_HK z(CHM)n>T4P2m*JDiaU8TTmOtT!pAWpBF_9C=e7BPz2E)7~a{Cc1a~ zQK(^it?b?aXQqB?dzKA8$?F+ql6>}AG0~;hdZuq*ReP%k=;>##J!GO>34OPA{13eS B)7Jn1 literal 0 HcmV?d00001 From 5658ae8dd04c7bd9581c4b77fc7149329ff0178f Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 16:20:41 -0300 Subject: [PATCH 15/16] refactor: --- scraping/scraping/pipelines.py | 19 ++++++++++++++++--- scraping/scraping/settings.py | 6 ++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py index d671f2e..6b77685 100644 --- a/scraping/scraping/pipelines.py +++ b/scraping/scraping/pipelines.py @@ -2,7 +2,9 @@ from itemadapter import ItemAdapter from scrapy.exceptions import DropItem from confluent_kafka import Producer +import logging +logger = logging.getLogger('MyPipelineLogger') class DataProcessingPipeline: @@ -33,9 +35,13 @@ def from_crawler(cls, crawler): ) def open_spider(self, spider): + + self.file = open("items.jsonl", "w") + self.file.write("ab re") self.producer = Producer({'bootstrap.servers': self.kafka_broker}) def close_spider(self, spider): + self.file.close() self.process_all_items() self.producer.flush() @@ -44,6 +50,13 @@ def process_item(self, item, spider): return item def process_all_items(self): - if self.items: - content = json.dumps(self.items) - self.producer.produce(self.kafka_topic, content) + try: + 
if self.items: + content = json.dumps(self.items) + teste = self.producer.produce(self.kafka_topic, content) + logger.info(teste) + logger.info(f"Enviando dados para o Kafka: {content}") + self.file.write("Envio para o Kafka") + except Exception as e: + self.file.write(str(e)) + logger.error(f"Erro ao enviar dados para o Kafka: {e}") diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py index e175ea5..4c4fa60 100644 --- a/scraping/scraping/settings.py +++ b/scraping/scraping/settings.py @@ -113,5 +113,7 @@ 'http://20.206.106.192:80', ] -KAFKA_BROKER = get_config().KAFKA_BROKER -KAFKA_TOPIC = get_config().KAFKA_TOPIC \ No newline at end of file +KAFKA_BROKER = 'localhost:9092' +KAFKA_TOPIC = 'scraping' +LOG_LEVEL = 'INFO' +LOG_FILE = 'my_spider.log' \ No newline at end of file From 00135cd912e42eac1ce3ec4d08d4da75d48e2f2d Mon Sep 17 00:00:00 2001 From: mauro tony Date: Fri, 13 Oct 2023 16:41:24 -0300 Subject: [PATCH 16/16] docs: Adicionando readme --- README.md | 91 +++++++++++-------------- runCrawler.py => scraping/runCrawler.py | 2 +- scraping/scraping/pipelines.py | 9 +-- scraping/scraping/settings.py | 7 +- streamlit-frontend.py | 79 +++++++++------------ 5 files changed, 76 insertions(+), 112 deletions(-) rename runCrawler.py => scraping/runCrawler.py (73%) diff --git a/README.md b/README.md index 9eaef99..ea42406 100644 --- a/README.md +++ b/README.md @@ -3,68 +3,55 @@ # Desafio Backend Python -**Objetivo:** Implementar um scraper web em Python para coletar dados da página web "Scrape This Site", estruturar esses dados em JSON, e enviá-los para uma fila Kafka. +**Objetivo:** O sistema consiste em um crawler que coleta dados de países de um site e os envia para uma fila no Kafka, no qual pode ser visualizado através de uma aplicação feita com streamlit. -**Requisitos:** +## Detalhes tecnicos -1. Coleta de Dados: +**Funcionamento do Crawler** - - Faça o scraping do site https://www.scrapethissite.com/pages/simple/. 
- Colete os dados de todos os países listados, focando especificamente nos dados de população. + - O crawler é executado manualmente, coletando os dados de países do site https://www.scrapethissite.com/pages/simple/ e os envia para uma fila no Kafka. + - O crawler conta com um sistema de proxies rotativos, que são utilizados para evitar o bloqueio do site. (OBS: O crawler pode apresentar lentidão devido à utilização de proxies gratuitos) + - O crawler conta com um sistema de User-Agents rotativos, que também são utilizados para evitar o bloqueio do site. -2. Estruturação dos Dados: +**Integração com Kafka**: - - Estruture os dados coletados em JSON. - - Utilize classes ou dicionários em Python para representar a estrutura dos dados. A estrutura deve conter, no mínimo, os campos: "País" e "População". + - O crawler envia os dados para uma fila no Kafka, que é consumida pela aplicação feita com streamlit. + - O Kafka foi configurado utilizando o docker-compose, para facilitar a execução do projeto. -3. Integração com Kafka: +**Aplicação com streamlit**: - - Envie os dados estruturados para uma fila no Kafka. - - Providencie o arquivo Docker (Dockerfile e docker-compose, se aplicável) do Kafka utilizado no teste. + - A aplicação feita com streamlit consome os dados da fila no Kafka e os exibe em uma tabela além de apresentar um gráfico demonstrando os países com maiores densidades demográficas. +## Instalação -**Diferenciais:** +**Pré-requisitos:** -- Implemente lógicas e algoritmos para evitar o bloqueio do scraper, como: - - Uso de proxies rotativos. - - Intervals variáveis entre as requisições. - - Identificação e manipulação de headers (User-Agent) para simular diferentes browsers ou dispositivos. + - Docker Compose + - Python 3 + - Pip + - Git -**O que será avaliado:** +**Instalação:** -1. Qualidade do código e organização. -2. Capacidade de definir e utilizar classes ou dicionários em Python. -3.
Integração com Kafka e a correta configuração do ambiente Docker para o Kafka. -4. Implementação dos diferenciais (se aplicável). -5. Documentação do código e instruções para execução. + - Clone o repositório + - `git clone git@github.com:MauroTony/Teste-Backend-Python.git` + - `cd Teste-Backend-Python` + - `git checkout main` + - Execute o docker-compose + - `docker-compose up -d` + - Instale as dependências do projeto + - `pip install -r requirements.txt` + - Configure as variáveis de ambiente + - Valide que a .env existe na raiz do projeto + - Valide a existência das variáveis de ambiente KAFKA_HOST e KAFKA_PORT e configure-as caso necessário + +**Execução:** -**Instruções para a entrega:** - -1. O candidato deve dar fork neste repositório e após o termino do desenvolvimento, realizar um pull request para análise do time. -2. Inclua um README com instruções claras sobre como executar e testar o projeto. - ---- -#### LICENSE -``` -MIT License - -Copyright (c) 2016 ZenoX IA - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -``` + - Inicialize o kafka + - `docker-compose up -d` + - Inicialize o streamlit + - `streamlit run streamlit-frontend.py` + - Execute o crawler + - `cd scraping` + - `python runCrawler.py` + diff --git a/runCrawler.py b/scraping/runCrawler.py similarity index 73% rename from runCrawler.py rename to scraping/runCrawler.py index bba6340..6547117 100644 --- a/runCrawler.py +++ b/scraping/runCrawler.py @@ -1,6 +1,6 @@ from scrapy.crawler import CrawlerProcess from scrapy.utils.project import get_project_settings -from scraping.scraping.spiders.countries_spider import CountriesSpider +from scraping.spiders.countries_spider import CountriesSpider process = CrawlerProcess(get_project_settings()) process.crawl(CountriesSpider) diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py index 6b77685..ef7900e 100644 --- a/scraping/scraping/pipelines.py +++ b/scraping/scraping/pipelines.py @@ -35,13 +35,9 @@ def from_crawler(cls, crawler): ) def open_spider(self, spider): - - self.file = open("items.jsonl", "w") - self.file.write("ab re") self.producer = Producer({'bootstrap.servers': self.kafka_broker}) def close_spider(self, spider): - self.file.close() self.process_all_items() self.producer.flush() @@ -53,10 +49,7 @@ def process_all_items(self): try: if self.items: content = json.dumps(self.items) - teste = self.producer.produce(self.kafka_topic, content) - logger.info(teste) + self.producer.produce(self.kafka_topic, content) logger.info(f"Enviando dados para o Kafka: {content}") - self.file.write("Envio para o Kafka") except Exception as e: - self.file.write(str(e)) logger.error(f"Erro ao enviar dados para o Kafka: {e}") diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py 
index 4c4fa60..16a6b90 100644 --- a/scraping/scraping/settings.py +++ b/scraping/scraping/settings.py @@ -6,7 +6,7 @@ # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html -from config import get_config +# from config import get_config BOT_NAME = "scraping" @@ -110,10 +110,7 @@ 'http://181.191.94.126:8999', 'http://201.91.82.155:3128', 'http://191.243.46.162:43241', - 'http://20.206.106.192:80', ] KAFKA_BROKER = 'localhost:9092' -KAFKA_TOPIC = 'scraping' -LOG_LEVEL = 'INFO' -LOG_FILE = 'my_spider.log' \ No newline at end of file +KAFKA_TOPIC = 'scraping' \ No newline at end of file diff --git a/streamlit-frontend.py b/streamlit-frontend.py index f4a13c2..c8136c7 100644 --- a/streamlit-frontend.py +++ b/streamlit-frontend.py @@ -3,60 +3,47 @@ import json from confluent_kafka import Consumer +st.title("Dashboard de dados do Kafka") -def kafka_config(): - return { +df_null = pd.DataFrame() +chart = st.bar_chart(df_null) + +if 'msg_recived' not in st.session_state: + st.session_state['msg_recived'] = False + +if not st.session_state['msg_recived']: + conf = { 'bootstrap.servers': 'localhost:9092', - 'group.id': 'streamlit111-group', + 'group.id': 'streamlit252-group', 'auto.offset.reset': 'earliest' } + consumer = Consumer(conf) + consumer.subscribe(['scraping']) -def consume_message(consumer): +while not st.session_state['msg_recived']: msg = consumer.poll(1.0) if msg is None: - return None, False + continue elif msg.error(): st.write(f"Error: {msg.error()}") - return None, False + continue else: - return json.loads(msg.value().decode('utf-8')), True - - -def process_dataframe(record): - df = pd.DataFrame(record) - df["Densidade demográfica"] = df['populationCountry'] / df['areaCountry'] - df.rename(columns={'nameCountry': 'País', 'populationCountry': 'População', 'areaCountry': 'Área'}, inplace=True) - return 
df.sort_values(by='População', ascending=False).head(10), df - - -def main(): - st.title("Dashboard de dados do Kafka") - df_null = pd.DataFrame() - st.write("Top 10 Países por Densidade Demográfica") - chart = st.bar_chart(df_null) - - if 'msg_recived' not in st.session_state: - st.session_state['msg_recived'] = False - - if not st.session_state['msg_recived']: - consumer = Consumer(kafka_config()) - consumer.subscribe(['scraping']) - - while not st.session_state['msg_recived']: - record, is_msg_received = consume_message(consumer) - if is_msg_received: - df_highest_population, df = process_dataframe(record) - col1, col2 = st.columns(2) - with col1: - st.write("Tabela Completa") - st.write(df) - with col2: - chart.bar_chart(df_highest_population, y='Densidade demográfica', x='País', width=1000, height=700) - - consumer.close() - st.session_state['msg_recived'] = True - - -if __name__ == "__main__": - main() + record = json.loads(msg.value().decode('utf-8')) + df = pd.DataFrame(record) + df_highest_population = df.sort_values(by='populationCountry', ascending=False).head(10) + df_highest_population["Densidade demográfica"] = df_highest_population['populationCountry'] / \ + df_highest_population['areaCountry'] + df_highest_population.rename( + columns={'nameCountry': 'País', 'populationCountry': 'População', 'areaCountry': 'Área'}, inplace=True) + #menu = st.selectbox("Escolha uma visualização:", ["Tabela Completa", "Top 10 Países por População"]) + col1, col2 = st.columns(2) + with col1: + print("Tabela Completa") + st.write(df) + with col2: + print("Top 10 Países por População") + chart.bar_chart(df_highest_population, y='Densidade demográfica', x='País', width=1000, height=700) + consumer.close() + st.session_state['msg_recived'] = True + break \ No newline at end of file