diff --git a/.env b/.env
new file mode 100644
index 0000000..f642015
--- /dev/null
+++ b/.env
@@ -0,0 +1,2 @@
+KAFKA_BROKER=localhost:9092
+KAFKA_TOPIC=scraping
\ No newline at end of file
diff --git a/README.md b/README.md
index 9eaef99..ea42406 100644
--- a/README.md
+++ b/README.md
@@ -3,68 +3,55 @@
 
 # Desafio Backend Python
 
-**Objetivo:** Implementar um scraper web em Python para coletar dados da página web "Scrape This Site", estruturar esses dados em JSON, e enviá-los para uma fila Kafka.
+**Goal:** The system consists of a crawler that collects country data from a website and sends it to a Kafka topic, from which the data can be visualized through a Streamlit application.
 
-**Requisitos:**
+## Technical details
 
-1. Coleta de Dados:
+**How the crawler works:**
 
-   - Faça o scraping do site https://www.scrapethissite.com/pages/simple/.
-   - Colete os dados de todos os países listados, focando especificamente nos dados de população.
+   - The crawler is run manually: it collects the country data from https://www.scrapethissite.com/pages/simple/ and sends it to a Kafka topic.
+   - The crawler rotates proxies to avoid being blocked by the site. (Note: it may be slow because the proxies used are free.)
+   - The crawler also rotates User-Agent headers, again to avoid being blocked by the site.
 
-2. Estruturação dos Dados:
+**Kafka integration:**
 
-   - Estruture os dados coletados em JSON.
-   - Utilize classes ou dicionários em Python para representar a estrutura dos dados. A estrutura deve conter, no mínimo, os campos: "País" e "População".
+   - The crawler sends the data to a Kafka topic, which is consumed by the Streamlit application.
+   - Kafka is configured through docker-compose to make the project easy to run.
 
-3. Integração com Kafka:
+**Streamlit application:**
 
-   - Envie os dados estruturados para uma fila no Kafka.
-   - Providencie o arquivo Docker (Dockerfile e docker-compose, se aplicável) do Kafka utilizado no teste.
+   - The Streamlit application consumes the data from the Kafka topic and displays it in a table, along with a chart of the population density of the ten most populous countries.
 
+## Installation
 
-**Diferenciais:**
+**Prerequisites:**
 
-- Implemente lógicas e algoritmos para evitar o bloqueio do scraper, como:
-  - Uso de proxies rotativos.
-  - Intervals variáveis entre as requisições.
-  - Identificação e manipulação de headers (User-Agent) para simular diferentes browsers ou dispositivos.
+   - Docker Compose
+   - Python 3
+   - Pip
+   - Git
 
-**O que será avaliado:**
+**Installation:**
 
-1. Qualidade do código e organização.
-2. Capacidade de definir e utilizar classes ou dicionários em Python.
-3. Integração com Kafka e a correta configuração do ambiente Docker para o Kafka.
-4. Implementação dos diferenciais (se aplicável).
-5. Documentação do código e instruções para execução.
+   - Clone the repository
+     - `git clone git@github.com:MauroTony/Teste-Backend-Python.git`
+     - `cd Teste-Backend-Python`
+     - `git checkout main`
+   - Start the Docker services
+     - `docker-compose up -d`
+   - Install the project dependencies
+     - `pip install -r requirements.txt`
+   - Configure the environment variables
+     - Make sure the .env file exists at the project root
+     - Make sure the KAFKA_BROKER and KAFKA_TOPIC environment variables are set, and adjust them if necessary
+
+**Running:**
 
-**Instruções para a entrega:**
-
-1. O candidato deve dar fork neste repositório e após o termino do desenvolvimento, realizar um pull request para análise do time.
-2. Inclua um README com instruções claras sobre como executar e testar o projeto.
-
----
-#### LICENSE
-```
-MIT License
-
-Copyright (c) 2016 ZenoX IA
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-```
+   - Start Kafka
+     - `docker-compose up -d`
+   - Start the Streamlit app
+     - `streamlit run streamlit-frontend.py`
+   - Run the crawler
+     - `cd scraping`
+     - `python runCrawler.py`
+
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..66116e9
--- /dev/null
+++ b/config.py
@@ -0,0 +1,13 @@
+import os
+from dotenv import find_dotenv, load_dotenv
+
+load_dotenv(find_dotenv())
+
+
+class GeneralConfig:
+    KAFKA_TOPIC: str = os.getenv('KAFKA_TOPIC')
+    KAFKA_BROKER: str = os.getenv('KAFKA_BROKER')
+
+
+def get_config() -> GeneralConfig:
+    return GeneralConfig()
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..1499032
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,21 @@
+version: '3.8'
+
+services:
+  zookeeper:
+    image: confluentinc/cp-zookeeper:latest
+    environment:
+      ZOOKEEPER_CLIENT_PORT: 2181
+
+  kafka:
+    image: confluentinc/cp-kafka:latest
+    depends_on:
+      - zookeeper
+    ports:
+      - "9092:9092"
+    environment:
+      KAFKA_BROKER_ID: 1
+      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
+      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092
+      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
+      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
+    restart: on-failure
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e6a58d5
Binary files /dev/null and b/requirements.txt differ
diff --git a/scraping/runCrawler.py b/scraping/runCrawler.py
new file mode 100644
index 0000000..6547117
--- /dev/null
+++ b/scraping/runCrawler.py
@@ -0,0 +1,7 @@
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+from scraping.spiders.countries_spider import CountriesSpider
+
+process = CrawlerProcess(get_project_settings())
+process.crawl(CountriesSpider)
+process.start()
diff --git a/scraping/scraping/__init__.py b/scraping/scraping/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scraping/scraping/items.py b/scraping/scraping/items.py
new file mode 100644
index 0000000..a0a859c
--- /dev/null
+++ b/scraping/scraping/items.py
@@ -0,0 +1,8 @@
+import scrapy
+
+
+class ScrapingItem(scrapy.Item):
+    nameCountry = scrapy.Field()
+    capitalCountry = scrapy.Field()
+    populationCountry = scrapy.Field()
+    areaCountry = scrapy.Field()
diff --git a/scraping/scraping/middlewares.py b/scraping/scraping/middlewares.py
new file mode 100644
index 0000000..c509e80
--- /dev/null
+++ b/scraping/scraping/middlewares.py
@@ -0,0 +1,44 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+import random
+import logging
+from scrapy import signals
+from itemadapter import is_item, ItemAdapter
+
+
+class RandomUserAgentMiddleware:
+    def __init__(self, user_agents):
+        self.user_agents = user_agents
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(user_agents=crawler.settings.getlist('USER_AGENTS'))
+
+    def process_request(self, request, spider):
+        request.headers.setdefault('User-Agent', random.choice(self.user_agents))
+
+
+class RandomProxyMiddleware:
+    def __init__(self, proxies):
+        self.proxies = proxies
+        self.logger = logging.getLogger(__name__)
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(proxies=crawler.settings.getlist('PROXIES'))
+
+    def process_request(self, request, spider):
+        proxy = random.choice(self.proxies)
+        request.meta['proxy'] = proxy
+
+    def process_exception(self, request, exception, spider):
+        if isinstance(exception, (ConnectionRefusedError, TimeoutError, )):
+            self.logger.warning(f"Failed to connect using proxy {request.meta['proxy']}, retrying a different proxy...")
+            new_request = request.copy()
+            new_request.dont_filter = True
+            new_request.priority = request.priority + 1
+            return new_request
+        return None
\ No newline at end of file
diff --git a/scraping/scraping/pipelines.py b/scraping/scraping/pipelines.py
new file mode 100644
index 0000000..ef7900e
--- /dev/null
+++ b/scraping/scraping/pipelines.py
@@ -0,0 +1,55 @@
+import json
+from itemadapter import ItemAdapter
+from scrapy.exceptions import DropItem
+from confluent_kafka import Producer
+import logging
+
+logger = logging.getLogger('MyPipelineLogger')
+
+class DataProcessingPipeline:
+
+    def process_item(self, item, spider):
+        if not item.get('populationCountry') or not item.get('areaCountry'):
+            raise DropItem("Item is missing required fields")
+
+        try:
+            item['populationCountry'] = int(item['populationCountry'])
+            item['areaCountry'] = float(item['areaCountry'])
+        except ValueError:
+            raise DropItem("Could not convert the item's numeric fields")
+        return item
+
+
+class KafkaPipeline:
+
+    def __init__(self, kafka_broker, kafka_topic):
+        self.kafka_broker = kafka_broker
+        self.kafka_topic = kafka_topic
+        self.items = []
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            kafka_broker=crawler.settings.get('KAFKA_BROKER'),
+            kafka_topic=crawler.settings.get('KAFKA_TOPIC')
+        )
+
+    def open_spider(self, spider):
+        self.producer = Producer({'bootstrap.servers': self.kafka_broker})
+
+    def close_spider(self, spider):
+        self.process_all_items()
+        self.producer.flush()
+
+    def process_item(self, item, spider):
+        self.items.append(dict(item))
+        return item
+
+    def process_all_items(self):
+        try:
+            if self.items:
+                content = json.dumps(self.items)
+                self.producer.produce(self.kafka_topic, content)
+                logger.info(f"Sending data to Kafka: {content}")
+        except Exception as e:
+            logger.error(f"Error sending data to Kafka: {e}")
diff --git a/scraping/scraping/settings.py b/scraping/scraping/settings.py
new file mode 100644
index 0000000..16a6b90
--- /dev/null
+++ b/scraping/scraping/settings.py
@@ -0,0 +1,116 @@
+# Scrapy settings for scraping 
project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# from config import get_config + +BOT_NAME = "scraping" + +SPIDER_MODULES = ["scraping.spiders"] +NEWSPIDER_MODULE = "scraping.spiders" + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "scraping (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "scraping.middlewares.ScrapingSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +DOWNLOADER_MIDDLEWARES = { + "scraping.middlewares.RandomUserAgentMiddleware": 200, + "scraping.middlewares.RandomProxyMiddleware": 100, +} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + "scraping.pipelines.DataProcessingPipeline": 300, + "scraping.pipelines.KafkaPipeline": 400, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" + +USER_AGENTS = [ + # Chrome 91 Windows 10 + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    # Firefox 89 Windows 10
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+    # Safari 14 macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
+    # iPhone X Safari
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
+]
+
+PROXIES = [
+    'http://181.191.94.126:8999',
+    'http://201.91.82.155:3128',
+    'http://191.243.46.162:43241',
+]
+
+KAFKA_BROKER = 'localhost:9092'
+KAFKA_TOPIC = 'scraping'
\ No newline at end of file
diff --git a/scraping/scraping/spiders/__init__.py b/scraping/scraping/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/scraping/scraping/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/scraping/scraping/spiders/countries_spider.py b/scraping/scraping/spiders/countries_spider.py
new file mode 100644
index 0000000..672d682
--- /dev/null
+++ b/scraping/scraping/spiders/countries_spider.py
@@ -0,0 +1,26 @@
+import scrapy
+from ..items import ScrapingItem
+
+class CountriesSpider(scrapy.Spider):
+    name = "countries"
+    start_urls = [
+        'https://www.scrapethissite.com/pages/simple/'
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse)
+
+    def parse(self, response):
+        self.logger.info("User-Agent: " + response.request.headers['User-Agent'].decode('utf-8'))
+        self.logger.info("Response status: " + str(response.status))
+        self.logger.info("Collecting data...")
+
+        for country in response.css('.country'):
+            content = ScrapingItem(
+                nameCountry=country.css('h3.country-name').xpath('normalize-space(.)').get(),
+                capitalCountry=country.css('.country-info .country-capital::text').get(),
+                populationCountry=country.css('.country-info .country-population::text').get(),
+                areaCountry=country.css('.country-info .country-area::text').get(),
+            )
+            yield content
diff --git a/scraping/scrapy.cfg b/scraping/scrapy.cfg
new file mode 100644
index 0000000..0e0aaa1
--- /dev/null
+++ b/scraping/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = scraping.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = scraping
diff --git a/streamlit-frontend.py b/streamlit-frontend.py
new file mode 100644
index 0000000..c8136c7
--- /dev/null
+++ b/streamlit-frontend.py
@@ -0,0 +1,49 @@
+import streamlit as st
+import pandas as pd
+import json
+from confluent_kafka import Consumer
+
+st.title("Kafka data dashboard")
+
+df_null = pd.DataFrame()
+chart = st.bar_chart(df_null)
+
+if 'msg_received' not in st.session_state:
+    st.session_state['msg_received'] = False
+
+if not st.session_state['msg_received']:
+    conf = {
+        'bootstrap.servers': 'localhost:9092',
+        'group.id': 'streamlit252-group',
+        'auto.offset.reset': 'earliest'
+    }
+
+    consumer = Consumer(conf)
+    consumer.subscribe(['scraping'])
+
+while not st.session_state['msg_received']:
+    msg = consumer.poll(1.0)
+    if msg is None:
+        continue
+    elif msg.error():
+        st.write(f"Error: {msg.error()}")
+        continue
+    else:
+        record = json.loads(msg.value().decode('utf-8'))
+        df = pd.DataFrame(record)
+        df_highest_population = df.sort_values(by='populationCountry', ascending=False).head(10).copy()
+        df_highest_population["Population density"] = df_highest_population['populationCountry'] / \
+                                                      df_highest_population['areaCountry']
+        df_highest_population.rename(
+            columns={'nameCountry': 'Country', 'populationCountry': 'Population', 'areaCountry': 'Area'}, inplace=True)
+        #menu = st.selectbox("Choose a view:", ["Full table", "Top 10 countries by population"])
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("Full table")
+            st.write(df)
+        with col2:
+            st.subheader("Top 10 countries by population")
+            st.bar_chart(df_highest_population, y='Population density', x='Country', width=1000, height=700)
+        consumer.close()
+        st.session_state['msg_received'] = True
+        break
\ No newline at end of file
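
The crawler publishes the whole scrape as a single JSON array on the `scraping` topic (see `KafkaPipeline.process_all_items`), so a quick way to check the pipeline end to end is to read that topic back with a throwaway consumer. The sketch below is an illustrative assumption, not part of the project: it reuses the same `confluent_kafka` settings as `streamlit-frontend.py`, assumes the docker-compose broker is reachable on `localhost:9092`, and the group id `debug-consumer-group` is arbitrary.

```python
# Minimal sketch of a throwaway consumer to confirm that the crawler's batch
# reached the `scraping` topic. Assumes the docker-compose broker is reachable
# at localhost:9092; the group id "debug-consumer-group" is an arbitrary choice.
import json

from confluent_kafka import Consumer

consumer = Consumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'debug-consumer-group',
    'auto.offset.reset': 'earliest',
})
consumer.subscribe(['scraping'])

try:
    while True:
        msg = consumer.poll(1.0)  # wait up to 1 second for a message
        if msg is None:
            continue
        if msg.error():
            print(f"Consumer error: {msg.error()}")
            continue
        countries = json.loads(msg.value().decode('utf-8'))
        print(f"Received a batch of {len(countries)} countries from the 'scraping' topic")
        break  # the crawler publishes one batch per run
finally:
    consumer.close()
```

Run it after `python runCrawler.py`; because `auto.offset.reset` is set to `earliest`, it will also pick up batches produced before the consumer group existed.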
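`settings.py` hardcodes `KAFKA_BROKER` and `KAFKA_TOPIC` even though `.env` and `config.py` define the same values; the `from config import get_config` import is left commented out, likely because the repository root is not on `sys.path` when the crawler is started from the `scraping/` directory. A hedged alternative, sketched below under those assumptions, is to load the `.env` directly in `settings.py` with python-dotenv (which `config.py` already imports); `find_dotenv()` walks up from the calling file toward the repository root, and the fallbacks keep the current defaults.

```python
# Sketch of replacing the hardcoded values at the bottom of
# scraping/scraping/settings.py with values read from the project's .env.
# Assumes python-dotenv is installed (config.py already depends on it).
import os

from dotenv import find_dotenv, load_dotenv

# find_dotenv() searches parent directories until it finds the .env at the repo root.
load_dotenv(find_dotenv())

# Fall back to the values currently hardcoded in settings.py when the
# environment variables are not set.
KAFKA_BROKER = os.getenv('KAFKA_BROKER', 'localhost:9092')
KAFKA_TOPIC = os.getenv('KAFKA_TOPIC', 'scraping')
```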