/
trilhas_tdc.py
32 lines (28 loc) · 1.14 KB
/
trilhas_tdc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import scrapy
from scrapy import Request
class TrilhasTDC(scrapy.Spider):
    """Spider that walks the track listing page and scrapes each track page.

    The crawl begins at the single URL in ``start_urls``. ``parse`` follows
    every track link found on that page, and ``parse_trilha`` turns each
    track page into one item.
    """

    name = "trilhas_tdc"
    start_urls = [
        "http://www.thedevelopersconference.com.br/tdc/2018/saopaulo/trilhas"
    ]

    def parse(self, response):
        """Yield one ``Request`` per track link on the listing page.

        Every ``div`` whose class contains ``col-sp`` is treated as one
        column: the text of its direct ``h4`` child is kept as ``dia`` and
        each direct ``a`` child contributes one link. The ``dia`` value is
        handed to ``parse_trilha`` through the request ``meta``.
        """
        for column in response.xpath('//div[contains(@class, "col-sp")]'):
            # First text node of the column heading; None when there is no h4.
            day = column.xpath('./h4/text()').extract_first()
            hrefs = column.xpath('./a/@href').extract()
            for href in hrefs:
                # urljoin resolves relative hrefs against the listing page URL.
                target = response.urljoin(href)
                yield Request(
                    url=target,
                    callback=self.parse_trilha,
                    meta={'dia': day},
                )

    def parse_trilha(self, response):
        """Yield a single item built from the track page in ``response``.

        The item carries the ``dia`` value forwarded by ``parse`` (None if
        absent), the first text node of the ``titulo-trilha`` heading, the
        first text node of its ``small`` child, the list of paragraph text
        nodes found under the ``lead`` div, and the URL of the page itself.
        """
        day = response.meta.get('dia')
        title = response.xpath(
            '//h1[@class="titulo-trilha"]/text()'
        ).extract_first()
        subtitle = response.xpath(
            '//h1[@class="titulo-trilha"]/small/text()'
        ).extract_first()
        # Kept as a list of text fragments, one entry per matched text node.
        description = response.xpath('//div[@class="lead"]//p/text()').extract()

        yield {
            'dia': day,
            'titulo': title,
            'subtitulo': subtitle,
            'descricao': description,
            'link': response.url,
        }