In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import json

class PoemLinkSpider(scrapy.Spider):
    """Crawl the paginated listing in ``start_urls`` and scrape each linked poem.

    Listing pages are handled by :meth:`parse`; every link found inside an
    ``article.block`` element is fetched and handed to :meth:`parse_poem`,
    which yields one item per poem page.
    """

    name = "poem_link_spider"
    start_urls = ["https://www.diwanalarab.com/-%D8%AF%D9%8A%D9%88%D8%A7%D9%86-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1-"]

    def parse(self, response):
        """Yield a request per poem link on a listing page, then the next page.

        Poem links are dispatched to :meth:`parse_poem`; the pagination link
        (``span.next a``), when present, is dispatched back to this method.
        """
        # One request for every link found on this listing page.
        for href in response.css('article.block a::attr(href)').getall():
            yield response.follow(href, callback=self.parse_poem)

        # Continue to the following listing page only if one is advertised.
        following_page = response.css('span.next a::attr(href)').get()
        if not following_page:
            return
        yield response.follow(following_page, callback=self.parse)

    def parse_poem(self, response):
        """Yield a dict with ``date``, ``title``, ``author`` and ``poem``.

        ``poem`` is the newline-joined, whitespace-stripped text of the
        ``<p>`` elements under ``div.texte`` (blank fragments dropped). The
        other three fields are stripped strings, or ``None`` when the
        selector matched nothing or matched an empty string.
        """

        def first_text(selector):
            # First match of ``selector``, stripped; ``None`` if missing/empty.
            raw = response.css(selector).get()
            if raw:
                return raw.strip()
            return None

        # Strip every text fragment once, then keep only the non-empty ones.
        fragments = (chunk.strip() for chunk in response.css('div.texte p::text').getall())
        body = "\n".join(line for line in fragments if line)

        item = {}
        item['date'] = first_text('div.date::text')
        item['title'] = first_text('h1::text')
        # The page's subtitle element is exported under the 'author' key.
        item['author'] = first_text('div.soustitre::text')
        item['poem'] = body
        yield item

# Initialize a CrawlerProcess.
# The feed is configured through ``FEEDS`` rather than the deprecated
# ``FEED_FORMAT`` / ``FEED_URI`` pair. ``overwrite`` is set explicitly because
# local-file feeds are otherwise opened in append mode: running this cell a
# second time would append a second JSON array to output.json and the
# ``json.load`` below would fail on the resulting invalid document.
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'FEEDS': {
        'output.json': {  # File to save the scraped data
            'format': 'json',
            'encoding': 'utf-8',  # Write real UTF-8 instead of \uXXXX escapes
            'overwrite': True,  # Replace the file on every run
        },
    },
})

# Start the crawling process with the spider; start() blocks until the
# crawl has finished, so the feed file is complete when it returns.
process.crawl(PoemLinkSpider)
process.start()

# Read the exported JSON back, decoding it with the same encoding it was
# written in.
with open('output.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

2024-04-07 16:53:03 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-04-07 16:53:03 [scrapy.utils.log] INFO: Versions: lxml 5.2.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.0, w3lib 2.1.2, Twisted 24.3.0, Python 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Windows-10-10.0.19045-SP0
2024-04-07 16:53:03 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-04-07 16:53:03 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-04-07 16:53:03 [scrapy.extensions.telnet] INFO: Telnet Password: 22163d23d2e20212
  exporter = cls(crawler)

2024-04-07 16:53:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsol