In [1]:
# Locations to search for on Booking.com, in the order they will be queried.
villes = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

In [2]:
import scrapy
import os
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
import pandas as pd

# Load the locations to scrape from the first column of villes.csv.
# NOTE: this rebinds ``villes``, superseding any list assigned in an earlier cell.
# (The former mid-cell ``df.head()`` was dropped: an expression that is not the
# last statement of a cell is evaluated and discarded, so it displayed nothing.)
df = pd.read_csv('villes.csv')
villes = df.iloc[:, 0].to_list()

class BookingSpiderTest(scrapy.Spider):
    """Scrape Booking.com search results for every location in ``locs``.

    For each location the spider submits the search form of the start page,
    visits result pages 0 to 2, and follows each hotel link to read its GPS
    coordinates. Every scraped item is a dict with the keys ``location``,
    ``url``, ``name``, ``score``, ``description``, ``lat`` and ``lon``; a field
    that could not be extracted is set to ``None``.

    NOTE(review): the CSS class names used below look like generated,
    build-specific identifiers — presumably they must be refreshed whenever
    the site's markup changes. Confirm against the live pages.
    """

    locs = villes
    name = "booking"
    # URL of result page 0 per location; base for building paginated URLs.
    # Declared on the class, so the dict is shared by all instances.
    init_url = dict()
    start_urls = ["https://www.booking.com/"]

    def parse(self, response):
        """Submit the start page's search form once per location.

        Yields:
            One ``FormRequest`` per entry of ``self.locs``, routed to
            ``after_search`` with ``page_no`` 0.
        """
        for loc in self.locs:
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'ss': loc},
                callback=self.after_search,
                cb_kwargs={'location': loc, 'page_no': 0},
            )

    def after_search(self, response, location, page_no):
        """Extract the hotels listed on one result page and request the next.

        Args:
            response: The search-result page.
            location: The location string that was searched for.
            page_no: Zero-based index of this result page.

        Yields:
            For each hotel container, either a request to the hotel page
            (handled by ``parse_hotel``) or, when no request can be built,
            the item itself with ``lat``/``lon`` set to ``None``. For
            ``page_no`` 0 and 1, also a request for the following page.
        """
        print(location, page_no, end=' ')
        if page_no == 0:
            self.init_url[location] = response.url

        containers = response.css('div.a826ba81c4.fe821aea6c.fa2f36ad22.afd256fc79.d08f526e0d.ed11e24d01.da89aeb942')

        for container in containers:
            # ``.get()`` returns None instead of raising when nothing matches,
            # so missing values are detected with explicit ``is None`` checks.
            name = container.css('div.fcab3ed991.a23c043802::text').get()
            if name is None:
                print(f"Probleme avec le nom de l'hotel à {location} ")

            # ``.attrib`` is an empty dict when no element matches; ``.get``
            # avoids the KeyError that indexing would raise.
            url = container.css('a.e13098a59f').attrib.get('href')
            if url is None:
                print(f"Probleme avec l'url de {name} à {location}")

            description = container.css('div.d8eab2cf7f::text').get()
            if description is None:
                print(f"Probleme avec la description de {name} à {location}")

            try:
                score = float(container.css('div.b5cd09854e.d10a6220b4::text').get())
            except (TypeError, ValueError):
                # TypeError: no score element (None); ValueError: non-numeric text.
                score = None
                print(f"Probleme avec le score de {name} à {location}")

            dic = {'location': location,
                   'url': url,
                   'name': name,
                   'score': score,
                   'description': description}

            # Build the request inside the try but yield outside of it: a
            # ``yield`` wrapped in a broad ``except`` would intercept the
            # GeneratorExit raised when the generator is closed.
            hotel_request = None
            if url is not None:
                try:
                    hotel_request = response.follow(
                        url=url,
                        callback=self.parse_hotel,
                        cb_kwargs={'dic': dic},
                    )
                except (TypeError, ValueError):
                    hotel_request = None

            if hotel_request is None:
                print(f'\n getting to {name}, {location} webpage did not work')
                dic['lat'] = None
                dic['lon'] = None
                yield dic
            else:
                yield hotel_request

        # Pages 0, 1 and 2 are fetched. The offset grows by 25 per page —
        # presumably the number of results per page; confirm against the site.
        if page_no <= 1:
            next_page = self.init_url[location] + "&offset=" + str(25 * (page_no + 1))
            yield response.follow(
                next_page,
                callback=self.after_search,
                cb_kwargs={'location': location, 'page_no': page_no + 1},
            )

    def parse_hotel(self, response, dic):
        """Add GPS coordinates to an item; this callback is used only for that.

        Args:
            response: The hotel page.
            dic: The item built in ``after_search``; it is updated in place.

        Yields:
            ``dic`` with ``lat`` and ``lon`` set to floats, or both set to
            ``None`` when the coordinates cannot be read or parsed.
        """
        lat = None
        lon = None

        raw_latlng = response.css('a#hotel_sidebar_static_map').attrib.get('data-atlas-latlng')
        if raw_latlng is None:
            print("mais pas réussi à récupérer le selecteur")
        else:
            try:
                parts = raw_latlng.split(',')
                lat = float(parts[0])
                lon = float(parts[1])
            except (IndexError, ValueError):
                print("mais pas réussi à séparer latitude et longitude")
                # Keep the pair consistent: never emit only one coordinate.
                lat = None
                lon = None

        dic['lat'] = lat
        dic['lon'] = lon
        yield dic

In [3]:
# JSON feed written by the crawl, relative to the current working directory.
filename = "booking_results.json"

# Remove any feed left over from a previous run; nothing to do if it is absent.
try:
    os.remove(filename)
except FileNotFoundError:
    pass

# Crawler configuration: browser-like user agent, INFO-level logging, and a
# single JSON feed export to ``filename``.
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        filename: {"format": "json"},
    },
}

process = CrawlerProcess(settings=crawler_settings)
process.crawl(BookingSpiderTest)
# NOTE(review): per the Scrapy docs this call blocks until the crawl ends, and
# the underlying Twisted reactor cannot be restarted — re-running this cell
# presumably requires a fresh kernel. Confirm.
process.start()

2022-04-10 23:04:14 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-04-10 23:04:14 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.2.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:46) - [GCC 9.4.0], pyOpenSSL 22.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 36.0.1, Platform Linux-5.4.170+-x86_64-with-glibc2.31
2022-04-10 23:04:14 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:98.0) '
               'Gecko/20100101 Firefox/98.0'}
2022-04-10 23:04:14 [scrapy.extensions.telnet] INFO: Telnet Password: 91bdb6d506c5d925
2022-04-10 23:04:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-

Mont Saint Michel 0 Le Havre 0 Amiens 0 Probleme avec le score de Le Saint Vincent - 4/6P - 300m de la plage à Le Havre
Bayeux 0 Probleme avec le score de Bonvallet - Studio paisible et chaleureux avec balcon à Amiens
St Malo 0 Lille 0 Rouen 0 Paris 0 Strasbourg 0 Chateau du Haut Koenigsbourg 0 Eguisheim 0 Colmar 0 Besancon 0 Grenoble 0 Dijon 0 Annecy 0 La Rochelle 0 Mont Saint Michel 1 Paris 1 Strasbourg 1 Annecy 1 La Rochelle 1 Probleme avec le score de COSY FLAT MARKET à La Rochelle
Mont Saint Michel 2 Paris 2 Strasbourg 2 Annecy 2 Probleme avec le score de Les Mouettes du Lac n2 - Grand Studio à 100 m du Lac d'Annecy à Annecy
La Rochelle 2 Probleme avec le score de Studio La Rochelle, 2 pièces, 3 personnes - FR-1-246-210 à La Rochelle
Probleme avec le score de Studio La Rochelle, 1 pièce, 2 personnes - FR-1-246-265 à La Rochelle
Probleme avec le score de PLEIN CHARME AUX MINIMES CALME + PARKING + WIFI à La Rochelle


2022-04-10 23:04:37 [scrapy.crawler] INFO: Received SIGINT, shutting down gracefully. Send again to force 
2022-04-10 23:04:37 [scrapy.core.engine] INFO: Closing spider (shutdown)
2022-04-10 23:04:41 [scrapy.extensions.feedexport] INFO: Stored json feed (74 items) in: booking_results.json
2022-04-10 23:04:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 167036,
 'downloader/request_count': 103,
 'downloader/request_method_count/GET': 103,
 'downloader/response_bytes': 19165990,
 'downloader/response_count': 103,
 'downloader/response_status_count/200': 102,
 'downloader/response_status_count/302': 1,
 'elapsed_time_seconds': 27.058711,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'shutdown',
 'finish_time': datetime.datetime(2022, 4, 10, 23, 4, 41, 908647),
 'httpcompression/response_bytes': 131450905,
 'httpcompression/response_count': 102,
 'item_scraped_count': 74,
 'log_count/INFO': 12,
 'memusage/max': 133009408,
 'memusage/