From c966788ec3ef7328a67fc475f113d0470aa71449 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Tue, 10 Oct 2023 03:10:39 -0300
Subject: [PATCH 01/16] Feat: Iniciando projeto com Scrapy

---
 scraping/scraping/__init__.py         |   0
 scraping/scraping/items.py            |  12 +++
 scraping/scraping/middlewares.py      | 103 ++++++++++++++++++++++++++
 scraping/scraping/pipelines.py        |  13 ++++
 scraping/scraping/settings.py         |  93 +++++++++++++++++++++++
 scraping/scraping/spiders/__init__.py |   4 +
 scraping/scrapy.cfg                   |  11 +++
 7 files changed, 236 insertions(+)
 create mode 100644 scraping/scraping/__init__.py
 create mode 100644 scraping/scraping/items.py
 create mode 100644 scraping/scraping/middlewares.py
 create mode 100644 scraping/scraping/pipelines.py
 create mode 100644 scraping/scraping/settings.py
 create mode 100644 scraping/scraping/spiders/__init__.py
 create mode 100644 scraping/scrapy.cfg

diff --git a/scraping/scraping/__init__.py b/scraping/scraping/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scraping/scraping/items.py b/scraping/scraping/items.py
new file mode 100644
index 0000000..0fe9a20
--- /dev/null
+++ b/scraping/scraping/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class ScrapingItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py
new file mode 100644
index 0000000..8445d6c
--- /dev/null
+++ b/scraping/scraping/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class ScrapingSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class ScrapingDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py
new file mode 100644
index 0000000..db116b6
--- /dev/null
+++ b/scraping/scraping/pipelines.py
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class ScrapingPipeline:
+    def process_item(self, item, spider):
+        return item
diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
new file mode 100644
index 0000000..8a28084
--- /dev/null
+++ b/scraping/scraping/settings.py
@@ -0,0 +1,93 @@
+# Scrapy settings for scraping project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "scraping"
+
+SPIDER_MODULES = ["scraping.spiders"]
+NEWSPIDER_MODULE = "scraping.spiders"
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = "scraping (+http://www.yourdomain.com)"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "scraping.middlewares.ScrapingSpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    "scraping.middlewares.ScrapingDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    "scraping.pipelines.ScrapingPipeline": 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
diff --git a/scraping/scraping/spiders/__init__.py b/scraping/scraping/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/scraping/scraping/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/scraping/scrapy.cfg b/scraping/scrapy.cfg
new file mode 100644
index 0000000..0e0aaa1
--- /dev/null
+++ b/scraping/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = scraping.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = scraping

From a0631c0421fc36bf4f67b6e4b8d9345ee690985c Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Tue, 10 Oct 2023 03:35:55 -0300
Subject: [PATCH 02/16] Feat: Adicionando scraping do site
 https://www.scrapethissite.com/pages/simple/

---
 scraping/scraping/spiders/countries_spider.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 scraping/scraping/spiders/countries_spider.py

diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py
new file mode 100644
index 0000000..4001b74
--- /dev/null
+++ b/scraping/scraping/spiders/countries_spider.py
@@ -0,0 +1,24 @@
+import scrapy
+
+class CountriesSpider(scrapy.Spider):
+    name = "countries"
+    start_urls = [
+        'https://www.scrapethissite.com/pages/simple/'
+    ]
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+        'Accept-Language': 'pt-BR'
+    }
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url, headers=self.headers, callback=self.parse)
+
+    def parse(self, response):
+        for country in response.css('.country'):
+            yield {
+                'name': country.css('h3.country-name').xpath('normalize-space(.)').get(),
+                'capital': country.css('.country-info .country-capital::text').get(),
+                'population': country.css('.country-info .country-population::text').get(),
+                'area': country.css('.country-info .country-area::text').get(),
+            }

From d60c7f1e8a4c86a32cf989d7c5a15819a14112c4 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Tue, 10 Oct 2023 03:44:34 -0300
Subject: [PATCH 03/16] =?UTF-8?q?Feat:=20Identifica=C3=A7=C3=A3o=20e=20man?=
 =?UTF-8?q?ipula=C3=A7=C3=A3o=20de=20headers=20(User-Agent)=20para=20simul?=
 =?UTF-8?q?ar=20diferentes=20browsers=20ou=20dispositivos.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scraping/scraping/middlewares.py              | 93 ++-----------------
 scraping/scraping/settings.py                 | 17 +++-
 scraping/scraping/spiders/countries_spider.py |  8 +-
 3 files changed, 23 insertions(+), 95 deletions(-)

diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py
index 8445d6c..77d38ae 100644
--- a/scraping/scraping/middlewares.py
+++ b/scraping/scraping/middlewares.py
@@ -3,101 +3,20 @@
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
+import random
 from scrapy import signals
 
 # useful for handling different item types with a single interface
 from itemadapter import is_item, ItemAdapter
 
 
-class ScrapingSpiderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
+class RandomUserAgentMiddleware:
+    def __init__(self, user_agents):
+        self.user_agents = user_agents
 
     @classmethod
     def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, or item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Request or item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
-
-
-class ScrapingDownloaderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the downloader middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
+        return cls(user_agents=crawler.settings.getlist('USER_AGENTS'))
 
     def process_request(self, request, spider):
-        # Called for each request that goes through the downloader
-        # middleware.
-
-        # Must either:
-        # - return None: continue processing this request
-        # - or return a Response object
-        # - or return a Request object
-        # - or raise IgnoreRequest: process_exception() methods of
-        #   installed downloader middleware will be called
-        return None
-
-    def process_response(self, request, response, spider):
-        # Called with the response returned from the downloader.
-
-        # Must either;
-        # - return a Response object
-        # - return a Request object
-        # - or raise IgnoreRequest
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Called when a download handler or a process_request()
-        # (from other downloader middleware) raises an exception.
-
-        # Must either:
-        # - return None: continue processing this exception
-        # - return a Response object: stops process_exception() chain
-        # - return a Request object: stops process_exception() chain
-        pass
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
+        request.headers.setdefault('User-Agent', random.choice(self.user_agents))
diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
index 8a28084..5fcbe07 100644
--- a/scraping/scraping/settings.py
+++ b/scraping/scraping/settings.py
@@ -50,9 +50,9 @@
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    "scraping.middlewares.ScrapingDownloaderMiddleware": 543,
-#}
+DOWNLOADER_MIDDLEWARES = {
+    "scraping.middlewares.RandomUserAgentMiddleware": 100,
+}
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
@@ -91,3 +91,14 @@
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"
+
+USER_AGENTS = [
+    # Chrome 91 Windows 10
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    # Firefox 89 Windows 10
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+    # Safari 14 macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
+    # iPhone X Safari
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
+]
\ No newline at end of file
diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py
index 4001b74..28a58e1 100644
--- a/scraping/scraping/spiders/countries_spider.py
+++ b/scraping/scraping/spiders/countries_spider.py
@@ -5,17 +5,15 @@ class CountriesSpider(scrapy.Spider):
     start_urls = [
         'https://www.scrapethissite.com/pages/simple/'
     ]
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-        'Accept-Language': 'pt-BR'
-    }
 
     def start_requests(self):
         for url in self.start_urls:
-            yield scrapy.Request(url, headers=self.headers, callback=self.parse)
+            yield scrapy.Request(url, callback=self.parse)
 
     def parse(self, response):
         for country in response.css('.country'):
+            # Exibe user-agent
+            self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8'))
             yield {
                 'name': country.css('h3.country-name').xpath('normalize-space(.)').get(),
                 'capital': country.css('.country-info .country-capital::text').get(),

From ec3ca5bc4c00026fe4ee400acb0117be8545c204 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Tue, 10 Oct 2023 04:04:27 -0300
Subject: [PATCH 04/16] Feat: Adicionando uso de proxies rotativos

---
 scraping/scraping/middlewares.py | 24 ++++++++++++++++++++++++
 scraping/scraping/settings.py    | 10 +++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py
index 77d38ae..cf19f95 100644
--- a/scraping/scraping/middlewares.py
+++ b/scraping/scraping/middlewares.py
@@ -4,6 +4,7 @@
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
 import random
+import logging
 from scrapy import signals
 
 # useful for handling different item types with a single interface
@@ -20,3 +21,63 @@ def from_crawler(cls, crawler):
 
     def process_request(self, request, spider):
         request.headers.setdefault('User-Agent', random.choice(self.user_agents))
+
+
+class RandomProxyMiddleware:
+    """Downloader middleware that sends each request through a random proxy.
+
+    Proxies come from the ``PROXIES`` setting. A request that fails with a
+    connection-level error is rescheduled (at most ``max_retries`` times) so
+    that ``process_request`` draws a proxy for it again.
+    """
+
+    def __init__(self, proxies, max_retries=3):
+        self.proxies = proxies
+        self.max_retries = max_retries
+        self.logger = logging.getLogger(__name__)
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware from the crawler's ``PROXIES`` setting."""
+        return cls(proxies=crawler.settings.getlist('PROXIES'))
+
+    def process_request(self, request, spider):
+        """Store a randomly chosen proxy in ``request.meta['proxy']``."""
+        # random.choice() raises IndexError on an empty sequence, so with no
+        # proxies configured the request is left untouched.
+        if self.proxies:
+            request.meta['proxy'] = random.choice(self.proxies)
+
+    def process_exception(self, request, exception, spider):
+        """Reschedule ``request`` after a proxy connection failure.
+
+        Returns a copy of the request to be downloaded again, or ``None`` to
+        let other middlewares handle the exception.
+        """
+        # Scrapy's downloader reports connection failures with Twisted's
+        # exception classes, which do not inherit from the built-ins of the
+        # same name, so both families are checked.
+        from twisted.internet import error as tx_error
+
+        retryable = (
+            ConnectionRefusedError,
+            TimeoutError,
+            tx_error.ConnectionRefusedError,
+            tx_error.TimeoutError,
+            tx_error.TCPTimedOutError,
+        )
+        if not isinstance(exception, retryable):
+            return None
+
+        # Bound the retries: without a cap, a pool of dead proxies would
+        # reschedule the same request forever.
+        attempts = request.meta.get('proxy_retry_times', 0)
+        if attempts >= self.max_retries:
+            return None
+
+        self.logger.warning(f"Failed to connect using proxy {request.meta.get('proxy')}, retrying a different proxy...")
+        new_request = request.copy()
+        new_request.meta['proxy_retry_times'] = attempts + 1
+        new_request.dont_filter = True
+        new_request.priority = request.priority + 1
+        return new_request
\ No newline at end of file
diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
index 5fcbe07..46f64a4 100644
--- a/scraping/scraping/settings.py
+++ b/scraping/scraping/settings.py
@@ -51,7 +51,8 @@
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    "scraping.middlewares.RandomUserAgentMiddleware": 100,
+    "scraping.middlewares.RandomUserAgentMiddleware": 200,
+    "scraping.middlewares.RandomProxyMiddleware": 100,
 }
 
 # Enable or disable extensions
@@ -101,4 +102,11 @@
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
     # iPhone X Safari
     "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
+]
+
+PROXIES = [
+    'http://20.206.106.192:8123',
+    'http://177.12.118.160:80',
+    'http://138.204.95.166:8080',
+    'http://191.243.46.162:43241'
 ]
\ No newline at end of file

From 80fba962fb9fc961f9a92800b85e8f6fa4257908 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Wed, 11 Oct 2023 09:13:11 -0300
Subject: [PATCH 05/16] =?UTF-8?q?Feat:=20Implementando=20integra=C3=A7?=
 =?UTF-8?q?=C3=A3o=20com=20o=20kafka?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docker-compose.yml                            | 20 ++++++++
 scraping/scraping/items.py                    |  7 +--
 scraping/scraping/pipelines.py                | 46 ++++++++++++++++++-
 scraping/scraping/settings.py                 | 20 ++++----
 scraping/scraping/spiders/countries_spider.py | 21 +++++----
 5 files changed, 93 insertions(+), 21 deletions(-)
 create mode 100644 docker-compose.yml

diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..6797d8b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,20 @@
+version: '3.8'
+
+services:
+  zookeeper:
+    image: confluentinc/cp-zookeeper:7.5.0  # pinned: ":latest" changes between pulls
+    environment:
+      ZOOKEEPER_CLIENT_PORT: 2181
+
+  kafka:
+    image: confluentinc/cp-kafka:7.5.0  # pinned: this service uses ZooKeeper mode (KAFKA_ZOOKEEPER_CONNECT), which newer major releases dropped
+    depends_on:
+      - zookeeper
+    ports:
+      - "9092:9092"
+    environment:
+      KAFKA_BROKER_ID: 1
+      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
+      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092
+      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
+      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
\ No newline at end of file
diff --git a/scraping/scraping/items.py b/scraping/scraping/items.py
index 0fe9a20..a79daf0 100644
--- a/scraping/scraping/items.py
+++ b/scraping/scraping/items.py
@@ -7,6 +7,7 @@
 
 
 class ScrapingItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+    nameCountry = scrapy.Field()
+    capitalCountry = scrapy.Field()
+    populationCountry = scrapy.Field()
+    areaCountry = scrapy.Field()
diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py
index db116b6..87b0fe3 100644
--- a/scraping/scraping/pipelines.py
+++ b/scraping/scraping/pipelines.py
@@ -2,12 +2,77 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
+import json
 
 # useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
+from scrapy.exceptions import DropItem
+from confluent_kafka import Producer
+
 
+class DataProcessingPipeline:
+    """Validate country items and convert their numeric fields."""
 
-class ScrapingPipeline:
     def process_item(self, item, spider):
+        """Convert ``populationCountry`` to int and ``areaCountry`` to float.
+
+        Raises:
+            DropItem: if either field is missing/empty or is not numeric.
+        """
+        if not item.get('populationCountry') or not item.get('areaCountry'):
+            raise DropItem("Item faltando campos necessários")
+
+        try:
+            item['populationCountry'] = int(item['populationCountry'])
+            item['areaCountry'] = float(item['areaCountry'])
+        except ValueError:
+            raise DropItem("Não foi possível converter os dados")
         return item
+
+
+class KafkaPipeline:
+    """Buffer every item of a crawl and publish them as one JSON message."""
+
+    def __init__(self, kafka_broker, kafka_topic):
+        self.kafka_broker = kafka_broker
+        self.kafka_topic = kafka_topic
+        self.items = []
+        self.delivery_error = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from the KAFKA_BROKER and KAFKA_TOPIC settings."""
+        return cls(
+            kafka_broker=crawler.settings.get('KAFKA_BROKER'),
+            kafka_topic=crawler.settings.get('KAFKA_TOPIC')
+        )
+
+    def open_spider(self, spider):
+        """Create the Kafka producer when the spider starts."""
+        self.producer = Producer({'bootstrap.servers': self.kafka_broker})
+
+    def close_spider(self, spider):
+        """Publish the buffered items and wait for the delivery report."""
+        self.process_all_items()
+        # flush() waits for queued messages and serves delivery callbacks.
+        self.producer.flush()
+        if self.delivery_error is not None:
+            spider.logger.error("Kafka delivery failed: %s", self.delivery_error)
+
+    def process_item(self, item, spider):
+        """Buffer the item as a plain dict and pass it on unchanged."""
+        self.items.append(dict(item))
+        return item
+
+    def process_all_items(self):
+        """Send all buffered items to the topic as a single JSON array."""
+        if self.items:
+            content = json.dumps(self.items)
+            self.producer.produce(self.kafka_topic, content, on_delivery=self._on_delivery)
+
+    def _on_delivery(self, err, msg):
+        """Record a failed delivery so close_spider() can report it."""
+        # produce() only enqueues the message; the broker's verdict arrives
+        # here, and without this callback a failure would go unnoticed.
+        if err is not None:
+            self.delivery_error = err
diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
index 46f64a4..6618a33 100644
--- a/scraping/scraping/settings.py
+++ b/scraping/scraping/settings.py
@@ -63,9 +63,10 @@
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    "scraping.pipelines.ScrapingPipeline": 300,
-#}
+ITEM_PIPELINES = {
+    "scraping.pipelines.DataProcessingPipeline": 300,
+    "scraping.pipelines.KafkaPipeline": 400,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
@@ -105,8 +106,11 @@
 ]
 
 PROXIES = [
-    'http://20.206.106.192:8123',
-    'http://177.12.118.160:80',
-    'http://138.204.95.166:8080',
-    'http://191.243.46.162:43241'
-]
\ No newline at end of file
+    'http://181.191.94.126:8999',
+    'http://201.91.82.155:3128',
+    'http://191.243.46.162:43241',
+    'http://20.206.106.192:80',
+]
+
+KAFKA_BROKER = 'localhost:9092'
+KAFKA_TOPIC = 'scraping'
\ No newline at end of file
diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py
index 28a58e1..d257081 100644
--- a/scraping/scraping/spiders/countries_spider.py
+++ b/scraping/scraping/spiders/countries_spider.py
@@ -1,4 +1,5 @@
 import scrapy
+from ..items import ScrapingItem
 
 class CountriesSpider(scrapy.Spider):
     name = "countries"
@@ -11,12 +12,19 @@ def start_requests(self):
             yield scrapy.Request(url, callback=self.parse)
 
     def parse(self, response):
+        """Log request details and yield one ScrapingItem per country."""
+        self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8'))
+        # meta['proxy'] exists only when a proxy was assigned to the request;
+        # .get() keeps parse() from raising KeyError on unproxied requests.
+        self.logger.info("Proxy: " + str(response.request.meta.get('proxy')))
+        self.logger.info("Response status: " + str(response.status))
+        self.logger.info("Coletando dados...")
+
         for country in response.css('.country'):
-            # Exibe user-agent
-            self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8'))
-            yield {
-                'name': country.css('h3.country-name').xpath('normalize-space(.)').get(),
-                'capital': country.css('.country-info .country-capital::text').get(),
-                'population': country.css('.country-info .country-population::text').get(),
-                'area': country.css('.country-info .country-area::text').get(),
-            }
+            content = ScrapingItem(
+                nameCountry=country.css('h3.country-name').xpath('normalize-space(.)').get(),
+                capitalCountry=country.css('.country-info .country-capital::text').get(),
+                populationCountry=country.css('.country-info .country-population::text').get(),
+                areaCountry=country.css('.country-info .country-area::text').get(),
+            )
+            yield content

From 8b8f07b7a6e7a65e847f06768dc11573c23442f3 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:38:32 -0300
Subject: [PATCH 06/16] =?UTF-8?q?Feat:=20Implementando=20frontend=20para?=
 =?UTF-8?q?=20visualiza=C3=A7=C3=A3o=20basica=20dos=20dados=20com=20stream?=
 =?UTF-8?q?lit?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 streamlit-frontend.py | 62 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 streamlit-frontend.py

diff --git a/streamlit-frontend.py b/streamlit-frontend.py
new file mode 100644
index 0000000..f4a13c2
--- /dev/null
+++ b/streamlit-frontend.py
@@ -0,0 +1,80 @@
+import streamlit as st
+import pandas as pd
+import json
+from confluent_kafka import Consumer
+
+
+def kafka_config():
+    """Return the confluent_kafka consumer configuration for the dashboard."""
+    return {
+        'bootstrap.servers': 'localhost:9092',
+        'group.id': 'streamlit111-group',
+        'auto.offset.reset': 'earliest'
+    }
+
+
+def consume_message(consumer):
+    """Poll the consumer once (1 second timeout) and decode the message.
+
+    Returns:
+        (payload, True) with the JSON-decoded message value when a message
+        arrived; (None, False) when the poll timed out or reported an error.
+    """
+    msg = consumer.poll(1.0)
+    if msg is None:
+        return None, False
+    elif msg.error():
+        st.write(f"Error: {msg.error()}")
+        return None, False
+    else:
+        return json.loads(msg.value().decode('utf-8')), True
+
+
+def process_dataframe(record):
+    """Build the dashboard tables from the decoded payload.
+
+    Returns:
+        (top10, full): the ten rows with the largest population and the
+        complete DataFrame, both including the density column.
+    """
+    df = pd.DataFrame(record)
+    df["Densidade demográfica"] = df['populationCountry'] / df['areaCountry']
+    df.rename(columns={'nameCountry': 'País', 'populationCountry': 'População', 'areaCountry': 'Área'}, inplace=True)
+    # NOTE(review): rows are ranked by population, while the header written
+    # in main() announces a top 10 by density - confirm which is intended.
+    return df.sort_values(by='População', ascending=False).head(10), df
+
+
+def main():
+    """Render the dashboard: wait for one Kafka message, then display it."""
+    st.title("Dashboard de dados do Kafka")
+    df_null = pd.DataFrame()
+    st.write("Top 10 Países por Densidade Demográfica")
+    chart = st.bar_chart(df_null)
+
+    # Once the flag is set, later runs of main() skip the consumer and the
+    # polling loop below.
+    if 'msg_recived' not in st.session_state:
+        st.session_state['msg_recived'] = False
+
+    if not st.session_state['msg_recived']:
+        consumer = Consumer(kafka_config())
+        consumer.subscribe(['scraping'])
+
+    while not st.session_state['msg_recived']:
+        record, is_msg_received = consume_message(consumer)
+        if is_msg_received:
+            df_highest_population, df = process_dataframe(record)
+            col1, col2 = st.columns(2)
+            with col1:
+                st.write("Tabela Completa")
+                st.write(df)
+            with col2:
+                chart.bar_chart(df_highest_population, y='Densidade demográfica', x='País', width=1000, height=700)
+
+            consumer.close()
+            st.session_state['msg_recived'] = True
+
+
+if __name__ == "__main__":
+    main()

From 1c2d5351e9dca05eee5101d6b6ab2f9f76d2f88b Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:39:51 -0300
Subject: [PATCH 07/16] =?UTF-8?q?Feat:=20Adicionando=20sistema=20de=20envi?=
 =?UTF-8?q?roments=20para=20obten=C3=A7=C3=A3o=20de=20variaveis=20de=20amb?=
 =?UTF-8?q?iente.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .env      |  2 ++
 config.py | 13 +++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 .env
 create mode 100644 config.py

diff --git a/.env b/.env
new file mode 100644
index 0000000..f642015
--- /dev/null
+++ b/.env
@@ -0,0 +1,2 @@
+KAFKA_BROKER=localhost:9092
+KAFKA_TOPIC=scraping
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..66116e9
--- /dev/null
+++ b/config.py
@@ -0,0 +1,13 @@
+import os
+from dotenv import find_dotenv, load_dotenv
+
+load_dotenv(find_dotenv())
+
+
+class GeneralConfig:
+    KAFKA_TOPIC: str = os.getenv('KAFKA_TOPIC')
+    KAFKA_BROKER: str = os.getenv('KAFKA_BROKER')
+
+
+def get_config() -> GeneralConfig:
+    return GeneralConfig()

From 496d52ea15529c355a140f73502652b4ce04fef3 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:41:01 -0300
Subject: [PATCH 08/16] build: Ajustando docker compose

---
 docker-compose.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 6797d8b..1499032 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,4 +17,5 @@ services:
       KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
       KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092
       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
\ No newline at end of file
+      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
+    restart: on-failure
\ No newline at end of file

From 4895171999783df3c9afbd85a0e5f76cdb773916 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:42:45 -0300
Subject: [PATCH 09/16] =?UTF-8?q?refactor:=20refatorando=20comentarios=20p?=
 =?UTF-8?q?adr=C3=A3o=20do=20scrapy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scraping/scraping/items.py       | 5 -----
 scraping/scraping/middlewares.py | 2 --
 scraping/scraping/pipelines.py   | 6 ------
 3 files changed, 13 deletions(-)

diff --git a/scraping/scraping/items.py b/scraping/scraping/items.py
index a79daf0..a0a859c 100644
--- a/scraping/scraping/items.py
+++ b/scraping/scraping/items.py
@@ -1,8 +1,3 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
-
 import scrapy
 
 
diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py
index cf19f95..c509e80 100644
--- a/scraping/scraping/middlewares.py
+++ b/scraping/scraping/middlewares.py
@@ -6,8 +6,6 @@
 import random
 import logging
 from scrapy import signals
-
-# useful for handling different item types with a single interface
 from itemadapter import is_item, ItemAdapter
 
 
diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py
index 87b0fe3..d671f2e 100644
--- a/scraping/scraping/pipelines.py
+++ b/scraping/scraping/pipelines.py
@@ -1,10 +1,4 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 import json
-
-# useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
 from scrapy.exceptions import DropItem
 from confluent_kafka import Producer

From a20921e0e67073963a4c9718b3ebbe5de93ebcb0 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:43:56 -0300
Subject: [PATCH 10/16] refactor: Ajustando dados para obter da env

---
 scraping/scraping/settings.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
index 6618a33..d488b21 100644
--- a/scraping/scraping/settings.py
+++ b/scraping/scraping/settings.py
@@ -6,6 +6,7 @@
 #     https://docs.scrapy.org/en/latest/topics/settings.html
 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+from config import get_config
 
 BOT_NAME = "scraping"
 
@@ -52,7 +53,7 @@
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
     "scraping.middlewares.RandomUserAgentMiddleware": 200,
-    "scraping.middlewares.RandomProxyMiddleware": 100,
+   # "scraping.middlewares.RandomProxyMiddleware": 100,
 }
 
 # Enable or disable extensions
@@ -112,5 +113,5 @@
     'http://20.206.106.192:80',
 ]
 
-KAFKA_BROKER = 'localhost:9092'
-KAFKA_TOPIC = 'scraping'
\ No newline at end of file
+KAFKA_BROKER = get_config().KAFKA_BROKER
+KAFKA_TOPIC = get_config().KAFKA_TOPIC
\ No newline at end of file

From 5d093ff48d8cfd0774994d88b45e1c95bbb35687 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:44:17 -0300
Subject: [PATCH 11/16] feat: Adicionando script que executa o crawler

---
 runCrawler.py | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 runCrawler.py

diff --git a/runCrawler.py b/runCrawler.py
new file mode 100644
index 0000000..bba6340
--- /dev/null
+++ b/runCrawler.py
@@ -0,0 +1,7 @@
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+from scraping.scraping.spiders.countries_spider import CountriesSpider
+
+process = CrawlerProcess(get_project_settings())
+process.crawl(CountriesSpider)
+process.start()

From 75a52731dfca784b16ea9e4a9ee327798a63b647 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:45:21 -0300
Subject: [PATCH 12/16] refactor: Ajusando logger

---
 scraping/scraping/spiders/countries_spider.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py
index d257081..672d682 100644
--- a/scraping/scraping/spiders/countries_spider.py
+++ b/scraping/scraping/spiders/countries_spider.py
@@ -13,7 +13,6 @@ def start_requests(self):
 
     def parse(self, response):
         self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8'))
-        self.logger.info("Proxy: " + response.request.meta['proxy'])
         self.logger.info("Response status: " + str(response.status))
         self.logger.info("Coletando dados...")
 

From e420226c0cb521d223ca376edf5aeeb9c19632ed Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:46:17 -0300
Subject: [PATCH 13/16] feat: Reativando middleware controlador dos proxies

---
 scraping/scraping/settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
index d488b21..e175ea5 100644
--- a/scraping/scraping/settings.py
+++ b/scraping/scraping/settings.py
@@ -53,7 +53,7 @@
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
     "scraping.middlewares.RandomUserAgentMiddleware": 200,
-   # "scraping.middlewares.RandomProxyMiddleware": 100,
+   "scraping.middlewares.RandomProxyMiddleware": 100,
 }
 
 # Enable or disable extensions

From d32ae5455e5296efb8ef2f4116d736a730697a41 Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 14:47:57 -0300
Subject: [PATCH 14/16] docs: Adicionando requirements

---
 requirements.txt | Bin 0 -> 3246 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e6a58d59c32b88ab8106c7cb1187c5dcd0333c31
GIT binary patch
literal 3246
zcmZve-EJFK41~{hfIi9wmgJ=EMJ@^yC{VOW0O!7t^%vR7db3)^u^+zee4M4u$_a#(
z43mc(4#_$D-@nUpC_`D6ed)_PecqJUWu|9dek!l@|Ig)1=`Jj@(K6^<Uyfm$m&JJQ
zDy+YildKOqz0@yp=lT%&O5gUmRnCJxd-?Qb9bVTLQEtQ9miw|9(Shu(o<(F?>Ex<>
zl9gN!@-`Z3Fdk}H#pywLwt9@^N@(!-Sr5qUw>^81XBL@^l`Nn<My{r`%GO79;D0{j
zBPLIt!nzWEvi=e=$?0rqYgGGQ&y!-F6mz3YZJdCsA4O}_k%N49I@gXh8!@O~M(-67
ze=xj^2xnA6pO<G3y6|?4^Pmjd$hOg6LupjaQvc!GT*6?j82_XEsA15#r|_Dc=PH@$
zW$B`CyU5>(Vf$Xk{DJg(#BfwZm_O)Q=?r;2qnVe^Vx^ko8|G$#&2e_Jf7JsnBgf;{
z@Paeg9Mp%~u)+Ztm{_cTg!O|k4nj(m#BdI}<gQZ7CUStrdECK7N|l{#y|B_f&&K>*
zv$HV|?Z`0aUBO;m>Y`sTyBE?X<^{IbdiVtu8^W|@tX7<Wi`k-jcHRuU+({U~&++I_
zZXR4rW7-eC@x{x#mviXce@2aW@|j1*<jWg(7l@9k3GAQ3gY)Jc*6$S66XiY6DYZ`2
z-u=k0(OH;Y2?O(iTA%`s=lZ8Q;LW+0{j-=o1)5Y6<2{sbvY9&&kl#uKu3o)d>-3{I
zZj^^z>BgD9oPobb@008|`g<#s#>2eTm}{Y6POzDi)a5pmBBOg3=(<^lPqtt=D#UJ#
z4hmO-M;kMf`t8i-0IRjmxruM}q9-3z8@f%UAWKar0`^X4;gIQR^jMkigAx3bi-q`e
zJp7rboqn@cd)0Wvp)1?)fc?3)A~ypjz)Yk>m3fjAJdUb*cFsX}zqhhe58~#e8aLxq
z!~PV}llZMH`Q~Jo*kfaUimga3%I%q{24_S`eitV>{hsd{i4)coJa@w6-JSM}j@Nwe
zg$SP;eg29XeaBUGhNV<5ZXNbvqdM6NcN$+hol|n@?=H1cQTN@OS{WR46JnXGM(2hz
z6pI_dNWUl_IninqEF)2iv!Yboh^~|#*U84lw(yIpp`e~_cZlyA#kt5`Sz8E4cAY3w
zpC)vIt|uzL4bjnFx#*EC)HOxb3Gs2V@8IxWSh;U>mRpt%m?7*H3AbA>Udgcu^*~e6
zYIG#ekac=yFFi`6)3=qQrUgZ+?`_6I9k4;&-1u~0dYk%f2ruS>`v-<Izvm*=i5m!v
z-y$BK^|q_b_#JSnV6{W344xSyth0)>4-9B3RT32jthgVw=gbI5{qFev8|HA~gC_HK
z(CHM)n>T4P2m*JDiaU8TTmOtT!pAWp<Qo^0S#LyUe|iT`cfgE3i!t)&?OSKdD(`(*
zoaX~Pehd2=XLu)IW8uZ~Y%VKj%ug%4RAg<-Nq-b;b>BF_9C=e7BPz2E)7~a{Cc1a~
zQK(^it?b?aXQqB?dzKA8$?F+ql6>}AG0~;hdZuq*ReP%k=;>##J!GO>34OPA{13eS
B)7Jn1

literal 0
HcmV?d00001


From 5658ae8dd04c7bd9581c4b77fc7149329ff0178f Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 16:20:41 -0300
Subject: [PATCH 15/16] refactor:

---
 scraping/scraping/pipelines.py | 19 ++++++++++++++++---
 scraping/scraping/settings.py  |  6 ++++--
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py
index d671f2e..6b77685 100644
--- a/scraping/scraping/pipelines.py
+++ b/scraping/scraping/pipelines.py
@@ -2,7 +2,9 @@
 from itemadapter import ItemAdapter
 from scrapy.exceptions import DropItem
 from confluent_kafka import Producer
+import logging
 
+logger = logging.getLogger('MyPipelineLogger')
 
 class DataProcessingPipeline:
 
@@ -33,9 +35,13 @@ def from_crawler(cls, crawler):
         )
 
     def open_spider(self, spider):
+
+        self.file = open("items.jsonl", "w")
+        self.file.write("ab re")
         self.producer = Producer({'bootstrap.servers': self.kafka_broker})
 
     def close_spider(self, spider):
+        self.file.close()
         self.process_all_items()
         self.producer.flush()
 
@@ -44,6 +50,13 @@ def process_item(self, item, spider):
         return item
 
     def process_all_items(self):
-        if self.items:
-            content = json.dumps(self.items)
-            self.producer.produce(self.kafka_topic, content)
+        try:
+            if self.items:
+                content = json.dumps(self.items)
+                teste = self.producer.produce(self.kafka_topic, content)
+                logger.info(teste)
+                logger.info(f"Enviando dados para o Kafka: {content}")
+                self.file.write("Envio para o Kafka")
+        except Exception as e:
+            self.file.write(str(e))
+            logger.error(f"Erro ao enviar dados para o Kafka: {e}")
diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
index e175ea5..4c4fa60 100644
--- a/scraping/scraping/settings.py
+++ b/scraping/scraping/settings.py
@@ -113,5 +113,7 @@
     'http://20.206.106.192:80',
 ]
 
-KAFKA_BROKER = get_config().KAFKA_BROKER
-KAFKA_TOPIC = get_config().KAFKA_TOPIC
\ No newline at end of file
+KAFKA_BROKER = 'localhost:9092'
+KAFKA_TOPIC = 'scraping'
+LOG_LEVEL = 'INFO'
+LOG_FILE = 'my_spider.log'
\ No newline at end of file

From 00135cd912e42eac1ce3ec4d08d4da75d48e2f2d Mon Sep 17 00:00:00 2001
From: mauro tony <maurotony23@gmail.com>
Date: Fri, 13 Oct 2023 16:41:24 -0300
Subject: [PATCH 16/16] docs: Adicionando readme

---
 README.md                               | 91 +++++++++++--------------
 runCrawler.py => scraping/runCrawler.py |  2 +-
 scraping/scraping/pipelines.py          |  9 +--
 scraping/scraping/settings.py           |  7 +-
 streamlit-frontend.py                   | 79 +++++++++------------
 5 files changed, 76 insertions(+), 112 deletions(-)
 rename runCrawler.py => scraping/runCrawler.py (73%)

diff --git a/README.md b/README.md
index 9eaef99..ea42406 100644
--- a/README.md
+++ b/README.md
@@ -3,68 +3,55 @@
 
 # Desafio Backend Python
 
-**Objetivo:** Implementar um scraper web em Python para coletar dados da página web "Scrape This Site", estruturar esses dados em JSON, e enviá-los para uma fila Kafka.
+**Objetivo:** O sistema consiste em um crawler que coleta dados de países de um site e os envia para uma fila no Kafka, de onde podem ser visualizados por meio de uma aplicação feita com streamlit.
 
-**Requisitos:**
+## Detalhes técnicos
 
-1. Coleta de Dados:
+**Funcionamento do Crawler**
 
- - Faça o scraping do site https://www.scrapethissite.com/pages/simple/.
- - Colete os dados de todos os países listados, focando especificamente nos dados de população.
+ - O crawler é executado manualmente, coleta os dados de países do site https://www.scrapethissite.com/pages/simple/ e os envia para uma fila no Kafka.
+ - O crawler conta com um sistema de proxies rotativos, que são utilizados para evitar o bloqueio do site. (OBS: O crawler pode apresentar lentidão devido à utilização de proxies gratuitos)
+ - O crawler conta com um sistema de User-Agent rotativos, que também são utilizados para evitar o bloqueio do site.
 
-2. Estruturação dos Dados:
+**Integração com Kafka**:
 
- - Estruture os dados coletados em JSON.
- - Utilize classes ou dicionários em Python para representar a estrutura dos dados. A estrutura deve conter, no mínimo, os campos: "País" e "População".
+ - O crawler envia os dados para uma fila no Kafka, que é consumida pela aplicação feita com streamlit.
+ - O Kafka foi configurado utilizando o docker-compose, para facilitar a execução do projeto.
 
-3. Integração com Kafka:
+**Aplicação com streamlit**:
 
- - Envie os dados estruturados para uma fila no Kafka.
- - Providencie o arquivo Docker (Dockerfile e docker-compose, se aplicável) do Kafka utilizado no teste.
+ - A aplicação feita com streamlit consome os dados da fila no Kafka e os exibe em uma tabela, além de apresentar um gráfico com a densidade demográfica dos 10 países mais populosos.
 
+## Instalação
 
-**Diferenciais:**
+**Pré-requisitos:**
 
-- Implemente lógicas e algoritmos para evitar o bloqueio do scraper, como:
-   - Uso de proxies rotativos.
-   - Intervals variáveis entre as requisições.
-   - Identificação e manipulação de headers (User-Agent) para simular diferentes browsers ou dispositivos.
+ - Docker Compose
+ - Python 3
+ - Pip
+ - Git
 
-**O que será avaliado:**
+**Instalação:**
 
-1. Qualidade do código e organização.
-2. Capacidade de definir e utilizar classes ou dicionários em Python.
-3. Integração com Kafka e a correta configuração do ambiente Docker para o Kafka.
-4. Implementação dos diferenciais (se aplicável).
-5. Documentação do código e instruções para execução.
+ - Clone o repositório
+    - `git clone git@github.com:MauroTony/Teste-Backend-Python.git`
+    - `cd Teste-Backend-Python`
+    - `git checkout main`
+ - Execute o docker-compose
+    - `docker-compose up -d`
+ - Instale as dependências do projeto
+    - `pip install -r requirements.txt`
+ - Configure as variáveis de ambiente
+    - Valide que a .env existe na raiz do projeto
+    - Valide a existência das variáveis de ambiente KAFKA_BROKER e KAFKA_TOPIC e configure-as caso necessário
+   
+**Execução:**
 
-**Instruções para a entrega:**
-
-1. O candidato deve dar fork neste repositório e após o termino do desenvolvimento, realizar um pull request para análise do time.
-2. Inclua um README com instruções claras sobre como executar e testar o projeto.
-
----
-#### LICENSE
-```
-MIT License
-
-Copyright (c) 2016 ZenoX IA
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-```
+ - Inicialize o kafka
+    - `docker-compose up -d`
+ - Inicialize o streamlit
+    - `streamlit run streamlit-frontend.py`
+ - Execute o crawler
+    - `cd scraping`
+    - `python runCrawler.py`
+ 
diff --git a/runCrawler.py b/scraping/runCrawler.py
similarity index 73%
rename from runCrawler.py
rename to scraping/runCrawler.py
index bba6340..6547117 100644
--- a/runCrawler.py
+++ b/scraping/runCrawler.py
@@ -1,6 +1,6 @@
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
-from scraping.scraping.spiders.countries_spider import CountriesSpider
+from scraping.spiders.countries_spider import CountriesSpider
 
 process = CrawlerProcess(get_project_settings())
 process.crawl(CountriesSpider)
diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py
index 6b77685..ef7900e 100644
--- a/scraping/scraping/pipelines.py
+++ b/scraping/scraping/pipelines.py
@@ -35,13 +35,9 @@ def from_crawler(cls, crawler):
         )
 
     def open_spider(self, spider):
-
-        self.file = open("items.jsonl", "w")
-        self.file.write("ab re")
         self.producer = Producer({'bootstrap.servers': self.kafka_broker})
 
     def close_spider(self, spider):
-        self.file.close()
         self.process_all_items()
         self.producer.flush()
 
@@ -53,10 +49,7 @@ def process_all_items(self):
         try:
             if self.items:
                 content = json.dumps(self.items)
-                teste = self.producer.produce(self.kafka_topic, content)
-                logger.info(teste)
+                self.producer.produce(self.kafka_topic, content)
                 logger.info(f"Enviando dados para o Kafka: {content}")
-                self.file.write("Envio para o Kafka")
         except Exception as e:
-            self.file.write(str(e))
             logger.error(f"Erro ao enviar dados para o Kafka: {e}")
diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
index 4c4fa60..16a6b90 100644
--- a/scraping/scraping/settings.py
+++ b/scraping/scraping/settings.py
@@ -6,7 +6,7 @@
 #     https://docs.scrapy.org/en/latest/topics/settings.html
 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-from config import get_config
+# from config import get_config
 
 BOT_NAME = "scraping"
 
@@ -110,10 +110,7 @@
     'http://181.191.94.126:8999',
     'http://201.91.82.155:3128',
     'http://191.243.46.162:43241',
-    'http://20.206.106.192:80',
 ]
 
 KAFKA_BROKER = 'localhost:9092'
-KAFKA_TOPIC = 'scraping'
-LOG_LEVEL = 'INFO'
-LOG_FILE = 'my_spider.log'
\ No newline at end of file
+KAFKA_TOPIC = 'scraping'
\ No newline at end of file
diff --git a/streamlit-frontend.py b/streamlit-frontend.py
index f4a13c2..c8136c7 100644
--- a/streamlit-frontend.py
+++ b/streamlit-frontend.py
@@ -3,60 +3,47 @@
 import json
 from confluent_kafka import Consumer
 
+st.title("Dashboard de dados do Kafka")
 
-def kafka_config():
-    return {
+df_null = pd.DataFrame()
+chart = st.bar_chart(df_null)
+
+if 'msg_recived' not in st.session_state:
+    st.session_state['msg_recived'] = False
+
+if not st.session_state['msg_recived']:
+    conf = {
         'bootstrap.servers': 'localhost:9092',
-        'group.id': 'streamlit111-group',
+        'group.id': 'streamlit252-group',
         'auto.offset.reset': 'earliest'
     }
 
+    consumer = Consumer(conf)
+    consumer.subscribe(['scraping'])
 
-def consume_message(consumer):
+while not st.session_state['msg_recived']:
     msg = consumer.poll(1.0)
     if msg is None:
-        return None, False
+        continue
     elif msg.error():
         st.write(f"Error: {msg.error()}")
-        return None, False
+        continue
     else:
-        return json.loads(msg.value().decode('utf-8')), True
-
-
-def process_dataframe(record):
-    df = pd.DataFrame(record)
-    df["Densidade demográfica"] = df['populationCountry'] / df['areaCountry']
-    df.rename(columns={'nameCountry': 'País', 'populationCountry': 'População', 'areaCountry': 'Área'}, inplace=True)
-    return df.sort_values(by='População', ascending=False).head(10), df
-
-
-def main():
-    st.title("Dashboard de dados do Kafka")
-    df_null = pd.DataFrame()
-    st.write("Top 10 Países por Densidade Demográfica")
-    chart = st.bar_chart(df_null)
-
-    if 'msg_recived' not in st.session_state:
-        st.session_state['msg_recived'] = False
-
-    if not st.session_state['msg_recived']:
-        consumer = Consumer(kafka_config())
-        consumer.subscribe(['scraping'])
-
-    while not st.session_state['msg_recived']:
-        record, is_msg_received = consume_message(consumer)
-        if is_msg_received:
-            df_highest_population, df = process_dataframe(record)
-            col1, col2 = st.columns(2)
-            with col1:
-                st.write("Tabela Completa")
-                st.write(df)
-            with col2:
-                chart.bar_chart(df_highest_population, y='Densidade demográfica', x='País', width=1000, height=700)
-
-            consumer.close()
-            st.session_state['msg_recived'] = True
-
-
-if __name__ == "__main__":
-    main()
+        record = json.loads(msg.value().decode('utf-8'))
+        df = pd.DataFrame(record)
+        df_highest_population = df.sort_values(by='populationCountry', ascending=False).head(10)
+        df_highest_population["Densidade demográfica"] = df_highest_population['populationCountry'] / \
+                                                         df_highest_population['areaCountry']
+        df_highest_population.rename(
+            columns={'nameCountry': 'País', 'populationCountry': 'População', 'areaCountry': 'Área'}, inplace=True)
+        #menu = st.selectbox("Escolha uma visualização:", ["Tabela Completa", "Top 10 Países por População"])
+        col1, col2 = st.columns(2)
+        with col1:
+            print("Tabela Completa")
+            st.write(df)
+        with col2:
+            print("Top 10 Países por População")
+            chart.bar_chart(df_highest_population, y='Densidade demográfica', x='País', width=1000, height=700)
+        consumer.close()
+        st.session_state['msg_recived'] = True
+    break
\ No newline at end of file