In [1]:
import scrapy
from rank_bm25 import BM25Okapi
from googlesearch import search

In [2]:
class SoilWaterCapacitySpider(scrapy.Spider):
    """Crawl the top search hits for a query and emit them ranked by BM25.

    The spider looks up ``query`` with ``googlesearch.search``, downloads each
    returned URL, tokenizes the page body text and, once every request has
    either succeeded or failed, scores the collected pages against the query
    with ``BM25Okapi`` and yields one item per page, best score first.

    Spider arguments:
        query: Search phrase; also used as the BM25 query.
        num_results: Number of search hits to request (coerced with ``int``
            so it can be passed as a string from the command line).

    Yielded items are dicts with the keys ``title``, ``url``, ``score`` and
    ``text``.
    """

    name = "soil_water_capacity"

    def __init__(self, query="tools or resources to help farmers assess their soil's water-holding capacity", num_results=10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.query = query
        self.num_results = int(num_results)
        # NOTE: the search is performed eagerly, at construction time.
        # dict.fromkeys drops exact duplicate URLs while keeping hit order.
        self.urls = list(dict.fromkeys(search(self.query, num_results=self.num_results)))
        self.corpus = []   # one token list per successfully parsed page
        self.results = []  # page metadata, index-aligned with self.corpus
        # Requests that have not yet reached parse() or the errback.
        self._pending = len(self.urls)

    def start_requests(self):
        """Schedule one request per search hit.

        ``dont_filter=True`` keeps the duplicate filter from silently dropping
        a request, which would leave the pending counter above zero forever.
        """
        for url in self.urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                errback=self._on_error,
                dont_filter=True,
            )

    def parse(self, response):
        """Record the page's text and title, then rank if this was the last page."""
        if isinstance(response, scrapy.http.TextResponse):
            text = response.css("body ::text").extract()
            full_text = " ".join(text)
            # Grab the title here, while the response is available, instead of
            # re-downloading the page at ranking time.
            title = response.css("title::text").get() or "No Title Found"
            self.corpus.append(full_text.lower().split())
            self.results.append({"url": response.url, "title": title, "full_text": full_text})
        else:
            # Selectors are unavailable on non-text bodies (e.g. PDFs, images).
            self.logger.warning("Skipping non-text response: %s", response.url)

        yield from self._request_done()

    def _on_error(self, failure):
        """Count a failed request as finished so ranking is not blocked by it."""
        self.logger.warning("Request failed: %s", failure.value)
        yield from self._request_done()

    def _request_done(self):
        """Mark one request as resolved; emit the ranking after the last one."""
        self._pending -= 1
        if self._pending == 0:
            # rank_and_yield is a generator: it must be iterated for the items
            # to reach Scrapy's item pipeline.
            yield from self.rank_and_yield()

    def rank_and_yield(self):
        """Yield the collected pages as items, ordered by descending BM25 score."""
        # BM25Okapi divides by the corpus size and by the vocabulary size, so
        # it cannot be built from an empty or token-less corpus.
        if not any(self.corpus):
            self.logger.warning("No page text collected; nothing to rank.")
            return

        bm25 = BM25Okapi(self.corpus)
        tokenized_query = self.query.lower().split()
        doc_scores = bm25.get_scores(tokenized_query)
        ranked_results = sorted(zip(doc_scores, self.results), key=lambda x: x[0], reverse=True)

        for score, result in ranked_results:
            yield {
                "title": result.get("title", "No Title Found"),
                "url": result["url"],
                "score": score,
                "text": result["full_text"],
            }

In [3]:
# Usage example: run the spider in-process with a custom query and result count.
if __name__ == "__main__":
    from scrapy.utils.project import get_project_settings
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings=get_project_settings())
    # Keyword arguments after the spider class are passed on as spider arguments.
    process.crawl(
        SoilWaterCapacitySpider,
        num_results=5,
        query="soil water holding capacity practical guide",
    )
    process.start()

2025-03-01 09:11:23 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-03-01 09:11:23 [scrapy.utils.log] INFO: Versions: lxml 5.3.1.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.13.2 (tags/v3.13.2:4f8bb39, Feb  4 2025, 15:23:48) [MSC v.1942 64 bit (AMD64)], pyOpenSSL 25.0.0 (OpenSSL 3.4.1 11 Feb 2025), cryptography 44.0.1, Platform Windows-11-10.0.22631-SP0
