### Open file and prepare Data

In [1]:
from urllib.parse import quote_plus

import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("515k-hotel-reviews-data-in-europe.zip")

In [3]:
def get_country(adress):
    """Return the country named at the end of a hotel address.

    The country is taken to be the last whitespace-separated token of
    ``adress``; a trailing "Kingdom" is expanded to "United Kingdom".
    """
    last_token = adress.split()[-1]
    return "United Kingdom" if last_token == "Kingdom" else last_token
    
# Derive each review's country from its hotel address, then show the row count per country.
df['Country'] = df['Hotel_Address'].apply(get_country)
df['Country'].value_counts()

United Kingdom    262301
Spain              60149
France             59928
Netherlands        57214
Austria            38939
Italy              37207
Name: Country, dtype: int64

In [4]:
def get_city(adress, country):
    """Return the city token of a hotel address.

    For ``country == "United Kingdom"`` the fifth whitespace-separated token
    from the end is returned; for every other country, the second from the end.
    (Presumably the UK offset skips a two-token postcode plus the two-word
    country name -- verify against the address data.)
    """
    tokens = adress.split()
    position = -5 if country == "United Kingdom" else -2
    return tokens[position]

# Derive each review's city from its address and country, then show the row count per city.
# The two columns are walked in lockstep rather than through a row-wise ``apply`` with
# integer keys (x[0], x[1]): integer lookups on a label-indexed row rely on a positional
# fallback that recent pandas versions deprecate.
df['City'] = [
    get_city(address, country)
    for address, country in zip(df['Hotel_Address'], df['Country'])
]
df['City'].value_counts()

London       262301
Barcelona     60149
Paris         59928
Amsterdam     57214
Vienna        38939
Milan         37207
Name: City, dtype: int64

### Scraping Hotel Prices
A lot of information that we think is important for better insights is still missing. 
To get it, the first thing I need is the exact address of the Booking.com page that contains the information for each hotel. 
This first step is done by scraping the links from a Google search.
Once the final Booking.com address has been scraped, a second scraping pass is needed to get the desired information for each hotel.

In [83]:
# One row per distinct (hotel name, city) pair: grouping on both columns and resetting
# the index leaves just those two key columns.
hotel_city = df[['Hotel_Name','City']].groupby(['Hotel_Name','City']).count().reset_index()

# Search phrase "Hotel <name> <city>" for every hotel. Built with column-wise string
# concatenation instead of a row-wise ``apply`` using x[0]/x[1], which depends on the
# deprecated positional fallback for integer keys on a label-indexed row.
scrap = 'Hotel ' + hotel_city['Hotel_Name'] + ' ' + hotel_city['City']
scrap.head()

0                    Hotel 11 Cadogan Gardens London
1                               Hotel 1K Hotel Paris
2    Hotel 25hours Hotel beim MuseumsQuartier Vienna
3                                    Hotel 41 London
4    Hotel 45 Park Lane Dorchester Collection London
dtype: object

This is the search phrase I will use in Google. The idea is to pick the first booking.com link in the results as the final address of the hotel in the system. Once I have gathered all of the addresses, I will try to scrape the prices from each one of them.

In [114]:
# Google search URL for every search phrase, prefixed with "booking.com" in the query.
# quote_plus percent-encodes characters such as "&", "#" or accented letters that would
# otherwise cut off or corrupt the q= parameter; spaces still become "+".
google_address = ['https://www.google.com/search?sxsrf=&q=booking.com+' + quote_plus(i) for i in scrap]
# DuckDuckGo alternative:
#google_address = ['https://duckduckgo.com/?q=booking.com+' + quote_plus(i) + '&ia=web' for i in scrap]

Once I have all the Google search addresses, I start scraping the Booking.com addresses from them.

#### Scraping of Booking addresses

In [115]:
import requests
from bs4 import BeautifulSoup
import time

In [116]:
# Resolve each Google search to the first booking.com hotel URL found in its result page.
bookingadress = []

# Only the first 5 searches are processed here.
for i, hotel in enumerate(google_address[:5]):
    # A timeout keeps a single stalled request from hanging the whole cell.
    response = requests.get(hotel, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Matching hrefs look like "/url?q=<target>&<other params>": drop the 7-character
    # "/url?q=" prefix and everything from the first "&" onwards.
    bkng = [
        link['href'][7:].split('&', 1)[0]
        for link in soup.find_all('a', href=True)
        if link['href'].startswith('/url?q=https://www.booking.com/hotel/')
    ]

    if bkng:
        first_hit = bkng[0]
        # Point at the ".es.html" variant of the page. The last 5 characters are replaced,
        # which assumes the URL ends in ".html" -- TODO confirm for all results.
        if not first_hit.endswith('.es.html'):
            first_hit = first_hit[:-5] + '.es.html'
        bookingadress.append(first_hit)
        print(i, first_hit)
    else:
        # Previously this case raised IndexError on bkng[0] and aborted the run.
        print(i, 'no booking.com link found for', hotel)

    # Pause between searches.
    time.sleep(3)

0 https://www.booking.com/hotel/gb/number-eleven.es.html
1 https://www.booking.com/hotel/fr/1-k-hotel.es.html
2 https://www.booking.com/hotel/at/25hours-wien.es.html
3 https://www.booking.com/hotel/gb/41clubredcarnations.es.html
4 https://www.booking.com/hotel/gb/parklane.es.html


#### Scraping of other Booking information (sample of 1)

In [43]:
import json
# Index into bookingadress of the hotel used as the worked example in the next cells.
# NOTE(review): the scraping cell above only processes google_address[:5], so index 9
# exists only if bookingadress was filled by a longer run -- confirm before "Run All".
i = 9

In [44]:
# Browser-like User-Agent header sent with the request.
head = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"}
URL = bookingadress[i]
# Without a timeout, requests can wait indefinitely on an unresponsive server.
page = requests.get(URL, headers=head, timeout=30)
soup = BeautifulSoup(page.content, 'html.parser')
print(URL)

https://www.booking.com/hotel/es/achotelsbarcelona.es.html


Get the header of the page (the embedded `application/ld+json` block)

In [45]:
# First element on the page whose type attribute is "application/ld+json";
# the following cells parse its text as JSON. Raises IndexError if none is present.
data = soup.select("[type='application/ld+json']")[0]

Get the price from the Header

In [46]:
# Read the 'priceRange' text from the JSON header, then keep the integer formed by the
# first whitespace-separated token that follows the first "€" sign.
pricetag = json.loads(data.text)['priceRange']
price = int(pricetag.partition("€")[2].split()[0])

Get the stars of the Hotel

In [47]:
# Star rating text: the 'title' attribute of the first element with class 'star_track'.
stars = soup.find(class_='star_track')['title']

Get the name of the Hotel

In [48]:
# Hotel name as given by the 'name' entry of the JSON header.
name = json.loads(data.text)['name']

Get the facilities from Booking

In [49]:
# Map each facilities section title to the list of entries shown under it.
facilities = {}
raw = soup.find_all(class_='facilitiesChecklistSection')
# The loop variable is deliberately not called `i`: that name holds the index of the
# sample hotel and must survive this cell so the fetch cell can be re-run.
for section in raw:
    # Non-empty text lines of the section: the first is its title, the rest its entries.
    facilities_dept = [x for x in section.getText().split('\n') if x]
    if not facilities_dept:
        # A section without any text has no title to use as a key; skip it.
        continue
    facilities[facilities_dept[0]] = facilities_dept[1:]

Create the Dictionary with the data

In [50]:
# Result dictionary holding the sample hotel, keyed by its scraped name.
hotels = {
    name: {
        'Price': price,
        'Stars': stars,
        'Facilities': facilities,
        'Header': json.loads(data.text),
    }
}

#### Scraping of other Booking information (complete dataset for loop)

In [62]:
import json

In [63]:
# Browser-like User-Agent header sent with every request.
head = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"}


def _scrape_booking_hotel(url, headers):
    """Download one Booking hotel page and extract its details.

    Parameters
    ----------
    url : str
        Address of the hotel page.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    tuple
        ``(name, info)`` where ``info`` is a dict with the keys 'Price',
        'Stars', 'Facilities' and 'Header'.

    Raises
    ------
    requests.RequestException
        If the download fails or exceeds the timeout.
    IndexError, KeyError, TypeError, ValueError
        If an expected element (JSON header, price, star rating) is missing
        or cannot be parsed.
    """
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    page = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Parse the JSON header once and reuse it for the name, the price and the stored copy.
    header = json.loads(soup.select("[type='application/ld+json']")[0].text)

    name = header['name']
    # Integer formed by the first token after the first "€" sign of the price range text.
    price = int(header['priceRange'].split("€", 1)[1].split()[0])
    stars = soup.find(class_='star_track')['title']

    # Section title -> entries, from the non-empty text lines of each facilities section.
    facilities = {}
    for section in soup.find_all(class_='facilitiesChecklistSection'):
        lines = [x for x in section.getText().split('\n') if x]
        if lines:
            facilities[lines[0]] = lines[1:]

    return name, {'Price': price, 'Stars': stars, 'Facilities': facilities, 'Header': header}


hotels = {}
name_scrap = []

for num, url in enumerate(bookingadress):

    print(num, url)

    try:
        name, info = _scrape_booking_hotel(url, head)
    except (requests.RequestException, IndexError, KeyError, TypeError, ValueError) as err:
        # One unreachable or malformed page must not discard everything scraped so far.
        print('  skipped:', type(err).__name__, err)
        continue

    name_scrap.append(name)
    # Append information to the dictionary (a repeated hotel name overwrites its entry).
    hotels[name] = info

0 https://www.booking.com/hotel/gb/number-eleven.es.html
1 https://www.booking.com/hotel/fr/1-k-hotel.es.html
2 https://www.booking.com/hotel/at/25hours-wien.es.html
3 https://www.booking.com/hotel/gb/41clubredcarnations.es.html
4 https://www.booking.com/hotel/gb/parklane.es.html
5 https://www.booking.com/hotel/gb/88-studios.es.html
6 https://www.booking.com/hotel/fr/9hotel-republique.es.html
7 https://www.booking.com/hotel/fr/a-la-villa-madame.es.html
8 https://www.booking.com/hotel/es/abac-barcelona.es.html
9 https://www.booking.com/hotel/es/achotelsbarcelona.es.html
10 https://www.booking.com/hotel/es/ac-marriott-diagonal-lilla.es.html
11 https://www.booking.com/hotel/es/acirla.es.html
12 https://www.booking.com/hotel/it/ac-milano.es.html
13 https://www.booking.com/hotel/fr/ac-paris-porte-maillot-by-marriott.es.html
14 https://www.booking.com/hotel/es/ac-sants.es.html


In [65]:
# Hotel names collected by the scraping loop, in the order the pages were requested.
name_scrap

['11 Cadogan Gardens',
 '1K Paris',
 '25hours Hotel beim MuseumsQuartier',
 '41',
 '45 Park Lane - Dorchester Collection',
 '88 Studios',
 '9Hotel Republique',
 'La Villa Madame',
 'ABaC Restaurant Hotel Barcelona GL Monumento',
 'AC Hotel Barcelona Forum',
 'AC Hotel Diagonal LÂ´Illa',
 'AC Hotel Irla',
 'AC Hotel Milano',
 'AC Hotel Paris Porte Maillot by Marriott',
 'AC Hotel Sants']

In [53]:
# Show the full scraped dictionary (hotel name -> Price / Stars / Facilities / Header).
hotels

{'11 Cadogan Gardens': {'Price': 268,
  'Stars': 'hotel de 5 estrellas',
  'Facilities': {'Exteriores': ['Mobiliario exterior', 'Terraza', 'Jardín'],
   'Servicios y extras': ['Entradas para lugares de interés o espectáculos',
    'De pago'],
   'Mascotas': ['No se admiten.'],
   'Actividades': ['Eventos deportivos en directo (emisión)',
    'Fuera del alojamiento',
    'De pago',
    'Música / espectáculos en directo',
    'Fuera del alojamiento',
    'De pago',
    'Tours a pie',
    'De pago',
    'Noches de cine',
    'Fuera del alojamiento',
    'De pago',
    'Comedia en vivo',
    'Fuera del alojamiento',
    'De pago',
    'Ruta de bares',
    'De pago',
    'Galerías de arte temporales',
    'Fuera del alojamiento',
    'Biblioteca'],
   'Comida y bebida': ['Bombones o galletas',
    'De pago',
    'Fruta',
    'De pago',
    'Botella de agua',
    'De pago',
    'Vino / champán',
    'De pago',
    'Menú para niños',
    'De pago',
    'Menús dietéticos (bajo petición)',
    

In [54]:
# Keys of the scraped dictionary, i.e. the hotel names, in insertion order.
list(hotels)

['11 Cadogan Gardens',
 '1K Paris',
 '25hours Hotel beim MuseumsQuartier',
 '41',
 '45 Park Lane - Dorchester Collection',
 '88 Studios',
 '9Hotel Republique',
 'La Villa Madame',
 'ABaC Restaurant Hotel Barcelona GL Monumento',
 'AC Hotel Barcelona Forum',
 'AC Hotel Diagonal LÂ´Illa',
 'AC Hotel Irla',
 'AC Hotel Milano',
 'AC Hotel Paris Porte Maillot by Marriott',
 'AC Hotel Sants']

In [61]:
# Print the 'url' entry stored in each hotel's JSON header.
for i, details in hotels.items():
    print(details['Header']['url'])

https://www.booking.com/hotel/gb/number-eleven.es.html
https://www.booking.com/hotel/fr/1-k-hotel.es.html
https://www.booking.com/hotel/at/25hours-wien.es.html
https://www.booking.com/hotel/gb/41clubredcarnations.es.html
https://www.booking.com/hotel/gb/parklane.es.html
https://www.booking.com/hotel/gb/88-studios.es.html
https://www.booking.com/hotel/fr/9hotel-republique.es.html
https://www.booking.com/hotel/fr/a-la-villa-madame.es.html
https://www.booking.com/hotel/es/abac-barcelona.es.html
https://www.booking.com/hotel/es/achotelsbarcelona.es.html
https://www.booking.com/hotel/es/ac-marriott-diagonal-lilla.es.html
https://www.booking.com/hotel/es/acirla.es.html
https://www.booking.com/hotel/it/ac-milano.es.html
https://www.booking.com/hotel/fr/ac-paris-porte-maillot-by-marriott.es.html
https://www.booking.com/hotel/es/ac-sants.es.html
