In [1]:
# Installing dotenv library to manage confidential keys saved as environment variables
#!pip install python-dotenv

In [2]:
import json
import pandas as pd
import re
import datetime as dt

import boto3

import os
from urllib.request import urlopen
from dotenv import load_dotenv

load_dotenv()

True

### Creating a table with information about hotels

In [3]:
# Loading the .json file with hotel urls from the public S3 bucket
url = "https://kayak-booking-bucket-12-12-2022.s3.eu-west-3.amazonaws.com/booking_search_page.json"

# The context manager closes the HTTP response as soon as the payload has been parsed,
# instead of leaving the connection open for the lifetime of the kernel.
with urlopen(url) as response_hotel_urls:
    hotel_urls = json.load(response_hotel_urls)

In [4]:
# Building a dataframe out of the parsed .json records
hotel_urls_df = pd.DataFrame.from_records(hotel_urls)

# Previewing the first rows
display(hotel_urls_df.head())

# Reporting how many rows the dataframe holds
print(f"Length of the dataframe: {len(hotel_urls_df)}")

Unnamed: 0,hotel_name,hotel_url,booking_city_url
0,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,<HtmlResponse 200 https://www.booking.com/sear...
1,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,<HtmlResponse 200 https://www.booking.com/sear...
2,Les gîtes de Beille,https://www.booking.com/hotel/fr/les-gites-de-...,<HtmlResponse 200 https://www.booking.com/sear...
3,Le petit nid,https://www.booking.com/hotel/fr/le-petit-nid-...,<HtmlResponse 200 https://www.booking.com/sear...
4,Appartement Fontargente,https://www.booking.com/hotel/fr/appartement-f...,<HtmlResponse 200 https://www.booking.com/sear...


Length of the dataframe: 1000


The "booking_city_url" column contains the url that was used to obtain a list of hotels in a given city. In order to know the city where each of the hotels is situated, let us extract the name of the city from the url.

In [5]:
def _city_from_search_url(search_url):
    """Return the text between the first "=" or "&" and the next one, with "+" turned into spaces."""
    first_value = re.split(r"=|&", search_url)[1]
    return first_value.replace("+", " ")


# Showing one raw value of the "booking_city_url" column as an example
print(hotel_urls_df.iloc[0]["booking_city_url"])

# Deriving the city name from the search url (multi-word names use "+" as separator in the url)
hotel_urls_df["city_name"] = hotel_urls_df["booking_city_url"].map(_city_from_search_url)

# The raw search url is no longer needed once the city name has been extracted
hotel_urls_df = hotel_urls_df.drop(columns="booking_city_url")

hotel_urls_df.head()

<HtmlResponse 200 https://www.booking.com/searchresults.en-us.html?ss=Ariege&ssne=Ariege&ssne_untouched=Ariege&order=distance_from_search>


Unnamed: 0,hotel_name,hotel_url,city_name
0,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,Ariege
1,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,Annecy
2,Les gîtes de Beille,https://www.booking.com/hotel/fr/les-gites-de-...,Ariege
3,Le petit nid,https://www.booking.com/hotel/fr/le-petit-nid-...,Ariege
4,Appartement Fontargente,https://www.booking.com/hotel/fr/appartement-f...,Ariege


We will now join the table containing hotel urls with the table containing the other data for each hotel (score, address, description, etc.).

In [6]:
# Loading the .json file with hotels data from the public S3 bucket
url = "https://kayak-booking-bucket-12-12-2022.s3.eu-west-3.amazonaws.com/booking_hotels_data.json"

# The context manager closes the HTTP response as soon as the payload has been parsed,
# instead of leaving the connection open for the lifetime of the kernel.
with urlopen(url) as response_hotels_data:
    hotels = json.load(response_hotels_data)

In [7]:
# Turning the parsed hotel records into a dataframe
hotels_df = pd.DataFrame.from_records(data=hotels)

# Previewing the first five rows
hotels_df.head(5)

Unnamed: 0,hotel_name,score,description,location,latitide,longtitude,hotel_url
0,GuestReady - Cozy perfection in the city centre,8.0,"Located in the centre of Annecy, 37 km from Ro...","\n8 Rue Camille Dunant, 74000 Annecy, France\n",45.8999407,6.1270546,<HtmlResponse 200 https://www.booking.com/hote...
1,Appartement Fontargente,9.2,Appartement Fontargente is located in Ax-les-T...,\npremier étage 19 Avenue Docteur François Gom...,42.71847547,1.84064515,<HtmlResponse 200 https://www.booking.com/hote...
2,Résidence Néméa Les Balcons d'Ax,7.1,Résidence Néméa Les Balcons d'Ax is located in...,"\nStation De Bonascre, 09110 Ax-les-Thermes, F...",42.70237131,1.81530745,<HtmlResponse 200 https://www.booking.com/hote...
3,Lagrange Vacances Les Chalets d’Ax,7.6,You're eligible for a Genius discount at Lagra...,\nQuartier De Castel Maou - Chemin d'Aouredou ...,42.71383156,1.83920681,<HtmlResponse 200 https://www.booking.com/hote...
4,Terres de France - Domaine du Palais,7.5,Located in Saint-Lizier in the Midi-Pyrénées r...,"\nChemin du Parc Le Palais des Evêques, 09190 ...",43.003051,1.13713,<HtmlResponse 200 https://www.booking.com/hote...


In [8]:
# Giving the coordinate columns explicit, hotel-specific names in a single rename call
hotels_df = hotels_df.rename(
    columns={"latitide": "hotel_latitude", "longtitude": "hotel_longtitude"}
)

# Addresses are wrapped in newline characters ("\n"); trimming the surrounding
# whitespace removes them
hotels_df["location"] = hotels_df["location"].map(str.strip)

# Checking the resulting dataframe
hotels_df.head()

Unnamed: 0,hotel_name,score,description,location,hotel_latitude,hotel_longtitude,hotel_url
0,GuestReady - Cozy perfection in the city centre,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546,<HtmlResponse 200 https://www.booking.com/hote...
1,Appartement Fontargente,9.2,Appartement Fontargente is located in Ax-les-T...,premier étage 19 Avenue Docteur François Gomma...,42.71847547,1.84064515,<HtmlResponse 200 https://www.booking.com/hote...
2,Résidence Néméa Les Balcons d'Ax,7.1,Résidence Néméa Les Balcons d'Ax is located in...,"Station De Bonascre, 09110 Ax-les-Thermes, France",42.70237131,1.81530745,<HtmlResponse 200 https://www.booking.com/hote...
3,Lagrange Vacances Les Chalets d’Ax,7.6,You're eligible for a Genius discount at Lagra...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681,<HtmlResponse 200 https://www.booking.com/hote...
4,Terres de France - Domaine du Palais,7.5,Located in Saint-Lizier in the Midi-Pyrénées r...,"Chemin du Parc Le Palais des Evêques, 09190 Sa...",43.003051,1.13713,<HtmlResponse 200 https://www.booking.com/hote...


Some hotel descriptions begin with promotional information about a discount. We will remove these phrases from descriptions.

In [9]:
# Sentence that closes the promotional preamble (the double space and the " . " are
# present in the scraped text, so they are part of the marker)
GENIUS_PROMO_MARKER = "To save at this property, all you have to do is  . "


def _remove_genius_promo(description):
    """Return the description without its leading "Genius discount" preamble.

    The text is returned unchanged when it is not a string, when it does not mention
    "Genius discount", or when the closing marker sentence is missing (the previous
    `split(...)[1]` raised an IndexError in that last case).
    """
    if not isinstance(description, str) or "Genius discount" not in description:
        return description
    _, marker, remainder = description.partition(GENIUS_PROMO_MARKER)
    return remainder if marker else description


# Example of hotel description with promotional information
print("Description before cleaning:")
print("'", hotels_df.iloc[3]["description"], "'")
print()

# Keeping only the text that follows the promotional preamble
hotels_df["description"] = hotels_df["description"].map(_remove_genius_promo)

print("Description after cleaning: ")
print("'", hotels_df.iloc[3]["description"], "'")

Description before cleaning:
' You're eligible for a Genius discount at Lagrange Vacances Les Chalets d’Ax! To save at this property, all you have to do is  . Lagrange Vacances Les Chalets d’Ax is located in Ax-Les-Thermes, 800 m from the town centre and ski lifts. It offers WiFi in the apartments at an extra cost and an outdoor seasonal heated swimming pool. Lagrange Vacances Les Chalets d’Ax provides studios and apartments with a private balcony or terrace. They each have a kitchen equipped with a dishwasher and microwave. Local leisure activities include fishing in the Ariege River, cycling and hiking. In winter, during holidays and week-ends, free shuttles run from the property to Ax-les-Thermes ski lift. Tarascon Prehistoric Art Park and the Niaux caves can be visited a 30-minute drive from Lagrange Vacances Les Chalets d’Ax which provides free private parking.  '

Description after cleaning: 
' Lagrange Vacances Les Chalets d’Ax is located in Ax-Les-Thermes, 800 m from the town c

In [10]:
def _extract_hotel_url(raw_response):
    """Return the url found in a '<HtmlResponse 200 https://...>' string, without its query string.

    `str.strip("<HtmlResponse 200 ")` removes a *set of characters* from both ends rather
    than a prefix: it leaves the trailing ">" in place when the url has no "?", and it turns
    ".html" into ".h" when applied to an already-clean url (e.g. when the cell is re-run).
    Matching the url explicitly avoids both problems and makes the cell idempotent.
    """
    match = re.search(r"https?://[^\s?>]+", raw_response)
    # Values that contain no url at all are left untouched
    return match.group(0) if match else raw_response


# Removing the "<HtmlResponse 200 ...>" wrapper and the query string, keeping only the cleaned url
hotels_df["hotel_url"] = hotels_df["hotel_url"].map(_extract_hotel_url)

# Some hotel urls contain an "en-gb." part indicating that the url corresponds to the
# English-language version of the page.
print("Example of url:", hotels_df.iloc[5]["hotel_url"])

# The hotel url remains functional without the part that indicates the language version
# of the page, so we remove it for uniformity.
hotels_df["hotel_url"] = hotels_df["hotel_url"].str.replace("en-gb.", "", regex=False)

hotels_df.head()

Example of url: https://www.booking.com/hotel/fr/chalet-bois-au-milieu-des-pyrenees.en-gb.html


Unnamed: 0,hotel_name,score,description,location,hotel_latitude,hotel_longtitude,hotel_url
0,GuestReady - Cozy perfection in the city centre,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546,https://www.booking.com/hotel/fr/guestready-to...
1,Appartement Fontargente,9.2,Appartement Fontargente is located in Ax-les-T...,premier étage 19 Avenue Docteur François Gomma...,42.71847547,1.84064515,https://www.booking.com/hotel/fr/appartement-f...
2,Résidence Néméa Les Balcons d'Ax,7.1,Résidence Néméa Les Balcons d'Ax is located in...,"Station De Bonascre, 09110 Ax-les-Thermes, France",42.70237131,1.81530745,https://www.booking.com/hotel/fr/residence-les...
3,Lagrange Vacances Les Chalets d’Ax,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681,https://www.booking.com/hotel/fr/residence-les...
4,Terres de France - Domaine du Palais,7.5,Located in Saint-Lizier in the Midi-Pyrénées r...,"Chemin du Parc Le Palais des Evêques, 09190 Sa...",43.003051,1.13713,https://www.booking.com/hotel/fr/terres-de-fra...


Merging the table containing hotel urls and the table containing other information about hotels

In [11]:
# Selecting the hotel columns to carry over, with the url first
hotel_columns = [
    "hotel_url",
    "hotel_name",
    "score",
    "description",
    "location",
    "hotel_latitude",
    "hotel_longtitude",
]
hotels_df = hotels_df[hotel_columns]

# Left join on the two columns shared by both tables (the same keys pandas infers by default),
# so that every row of hotel_urls_df is kept
hotel_info_df = pd.merge(hotel_urls_df, hotels_df, how="left", on=["hotel_name", "hotel_url"])

# Checking the resulting table
hotel_info_df.head()

Unnamed: 0,hotel_name,hotel_url,city_name,score,description,location,hotel_latitude,hotel_longtitude
0,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,Ariege,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
1,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,Annecy,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546
2,Les gîtes de Beille,https://www.booking.com/hotel/fr/les-gites-de-...,Ariege,8.1,Located in Les Cabannes in the Midi-Pyrénées r...,"25 Quartier la Bexane, 09310 Les Cabannes, France",42.7855571,1.68166541
3,Le petit nid,https://www.booking.com/hotel/fr/le-petit-nid-...,Ariege,9.2,"Offering garden views, Le petit nid is an acco...","RDC 1 Rue de la Place, 09330 Montgaillard, France",42.93227802,1.63516787
4,Appartement Fontargente,https://www.booking.com/hotel/fr/appartement-f...,Ariege,9.2,Appartement Fontargente is located in Ax-les-T...,premier étage 19 Avenue Docteur François Gomma...,42.71847547,1.84064515


In [12]:
# Counting the missing values in each column of the merged table
hotel_info_df.isna().sum()

hotel_name           0
hotel_url            0
city_name            0
score               95
description          0
location             0
hotel_latitude       0
hotel_longtitude     0
dtype: int64

It turns out that there are a number of hotels with missing scores. Let's have a closer look at them to see if there's a problem.

In [13]:
# Boolean mask of the hotels that have no score, computed once and reused below
missing_score_mask = hotel_info_df["score"].isna()

display(hotel_info_df[missing_score_mask].head())

# Saving the index labels of the rows where the hotel score is missing
no_score_indexes = hotel_info_df[missing_score_mask].index

Unnamed: 0,hotel_name,hotel_url,city_name,score,description,location,hotel_latitude,hotel_longtitude
41,Le Factory : Duplex en vieille ville côté lac,https://www.booking.com/hotel/fr/le-factory-du...,Annecy,,"Offering free WiFi, Le Factory: Duplex en viei...","9 Rue Grenette, 74000 Annecy, France",45.8991014,6.1270855
48,Appartement de 2 chambres avec wifi a Annecy,https://www.booking.com/hotel/fr/apartment-rue...,Annecy,,"Located in the centre of Annecy, 37 km from Ro...","23 Rue Notre Dame Haute-Savoie, 74000 Annecy, ...",45.8999708,6.1262985
56,Le Saboly,https://www.booking.com/hotel/fr/le-saboly-avi...,Avignon,,Situated less than 1 km from Avignon Central S...,"2B Place Nicolas Saboly, 84000 Avignon, France",43.9487782,4.8073266
62,"Les Logis de Halley, au cœur de la cité des Papes",https://www.booking.com/hotel/fr/au-coeur-de-l...,Avignon,,"Set in the centre of Avignon, just 400 m from ...","2ème étage 2 Rue Edmond Halley, 84000 Avignon,...",43.94883313,4.80700983
66,Loft hyper centre,https://www.booking.com/hotel/fr/loft-hyper-ce...,Avignon,,"Situated in the centre of Avignon, just 400 m ...","4 Rue de la Rappe, 84000 Avignon, France",43.948483,4.8074583


If we print the descriptions of the hotels that have no score, it turns out that most of them are in fact apartments or villas.
If they have not been listed on booking.com for long, it is normal that they do not have a score yet, as they receive far fewer visitors than hotels.

In [14]:
# Optional inspection, disabled to avoid a very long output: uncomment the loop below to
# print the description of every hotel without a score.
# (Real comments are used instead of a triple-quoted string, which would be evaluated and
# echoed as the cell output. `no_score_indexes` holds index labels, hence `.loc`.)
# for index in no_score_indexes:
#     print(hotel_info_df["description"].loc[index])
#     print()

'for index in no_score_indexes:\n    print(hotel_info_df["description"].iloc[index])\n    print()'

### Enriching the table with city coordinates

In [15]:
# Reading the .csv file with city coordinates straight from the S3 bucket
url = 'https://kayak-booking-bucket-12-12-2022.s3.eu-west-3.amazonaws.com/city_coordinates.csv'
city_coord = pd.read_csv(filepath_or_buffer=url)

# Previewing the first five rows
city_coord.head()

Unnamed: 0,place_id,city_name,lat,lon
0,156094680,Mont Saint-Michel,48.635954,-1.51146
1,297756747,Saint-Malo,48.649518,-2.026041
2,297981358,Bayeux,49.276462,-0.702474
3,298137491,Le Havre,49.493898,0.107973
4,297518815,Rouen,49.440459,1.093966


The city names in the "city_coord" table are written with accents and hyphens, and use "Saint" instead of "St" (as in "Saint-Malo"). I would like to keep this spelling in the final table.
I will replace the names of the cities in "hotel_info_df" table with the names of the cities as they are written in "city_coord" table. Then I'll use the city names as keys to merge the two tables.

In [16]:
# Saving the distinct city names of the coordinates table as an alphabetically sorted list
city_names_coord = sorted(city_coord["city_name"].unique().tolist())

# We replace "St " with "Saint " (otherwise, the cities in the two lists will not be sorted
# in identical order)
hotel_info_df["city_name"] = hotel_info_df["city_name"].str.replace("St ", "Saint ", regex=False)

# Saving the distinct city names of the "hotel_info_df" table as an alphabetically sorted list
city_names_hotel = sorted(hotel_info_df["city_name"].unique().tolist())

# The two lists are paired by position, and zip() silently truncates to the shorter one,
# so a length mismatch would pair cities wrongly without any error.
assert len(city_names_hotel) == len(city_names_coord), (
    f"{len(city_names_hotel)} city names in hotel_info_df vs "
    f"{len(city_names_coord)} in city_coord: the lists cannot be paired by position"
)

# Checking visually that each pair refers to the same city
print(list(zip(city_names_hotel, city_names_coord)))

[('Aigues Mortes', 'Aigues-Mortes'), ('Aix en Provence', 'Aix-en-Provence'), ('Amiens', 'Amiens'), ('Annecy', 'Annecy'), ('Ariege', 'Ariège'), ('Avignon', 'Avignon'), ('Bayeux', 'Bayeux'), ('Bayonne', 'Bayonne'), ('Besancon', 'Besançon'), ('Biarritz', 'Biarritz'), ('Bormes les Mimosas', 'Bormes-les-Mimosas'), ('Carcassonne', 'Carcassonne'), ('Cassis', 'Cassis'), ('Chateau du Haut Koenigsbourg', 'Château du Haut-Kœnigsbourg'), ('Collioure', 'Collioure'), ('Colmar', 'Colmar'), ('Dijon', 'Dijon'), ('Eguisheim', 'Eguisheim'), ('Gorges du Verdon', 'Gorges du Verdon'), ('Grenoble', 'Grenoble'), ('La Rochelle', 'La Rochelle'), ('Le Havre', 'Le Havre'), ('Lille', 'Lille'), ('Lyon', 'Lyon'), ('Marseille', 'Marseilla'), ('Mont Saint Michel', 'Mont Saint-Michel'), ('Montauban', 'Montauban'), ('Nimes', 'Nîmes'), ('Paris', 'Paris'), ('Rouen', 'Rouen'), ('Saint Malo', 'Saint-Malo'), ('Saintes Maries de la mer', 'Saintes-Maries-de-la-Mer'), ('Strasbourg', 'Strasbourg'), ('Toulouse', 'Toulouse'), ('Uz

In [17]:
# "Marseille" is spelled "Marseilla" in the coordinates data, so we correct the mistake.
# The fix is applied to the list AND to the "city_coord" table itself: otherwise the hotels
# would be renamed to "Marseille" while the table still says "Marseilla", and a later join
# on "city_name" could not match them.
city_names_coord = [city.replace("Marseilla", "Marseille") for city in city_names_coord]
city_coord["city_name"] = city_coord["city_name"].replace("Marseilla", "Marseille")

# Pairing each spelling used in "hotel_info_df" with the spelling of the coordinates table
# (both lists are sorted alphabetically, so they are paired by position)
city_name_mapping = dict(zip(city_names_hotel, city_names_coord))

# Replacing the city names in one pass; names without an entry in the mapping are left unchanged
hotel_info_df["city_name"] = hotel_info_df["city_name"].replace(city_name_mapping)

# Checking the resulting table
hotel_info_df.head()

Unnamed: 0,hotel_name,hotel_url,city_name,score,description,location,hotel_latitude,hotel_longtitude
0,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,Ariège,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
1,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,Annecy,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546
2,Les gîtes de Beille,https://www.booking.com/hotel/fr/les-gites-de-...,Ariège,8.1,Located in Les Cabannes in the Midi-Pyrénées r...,"25 Quartier la Bexane, 09310 Les Cabannes, France",42.7855571,1.68166541
3,Le petit nid,https://www.booking.com/hotel/fr/le-petit-nid-...,Ariège,9.2,"Offering garden views, Le petit nid is an acco...","RDC 1 Rue de la Place, 09330 Montgaillard, France",42.93227802,1.63516787
4,Appartement Fontargente,https://www.booking.com/hotel/fr/appartement-f...,Ariège,9.2,Appartement Fontargente is located in Ax-les-T...,premier étage 19 Avenue Docteur François Gomma...,42.71847547,1.84064515


In [18]:
# Displaying the current column order of the coordinates table
print(city_coord.columns)

# Moving "city_name" to the last position before the merge
coord_columns = ['place_id', 'lat', 'lon', 'city_name']
city_coord = city_coord[coord_columns]

# Right join on "city_name" (the only column the two tables share), so that every hotel is kept
city_and_hotel_info_df = pd.merge(city_coord, hotel_info_df, how="right", on="city_name")

Index(['place_id', 'city_name', 'lat', 'lon'], dtype='object')


In [19]:
# Displaying the column order produced by the merge
print(city_and_hotel_info_df.columns)

# City-level columns first, hotel-level columns after, for readability
ordered_columns = [
    'city_name', 'place_id', 'lat', 'lon',
    'hotel_name', 'hotel_url', 'score', 'description', 'location',
    'hotel_latitude', 'hotel_longtitude',
]
city_and_hotel_info_df = city_and_hotel_info_df[ordered_columns]

# Checking the resulting table
city_and_hotel_info_df.head()

Index(['place_id', 'lat', 'lon', 'city_name', 'hotel_name', 'hotel_url',
       'score', 'description', 'location', 'hotel_latitude',
       'hotel_longtitude'],
      dtype='object')


Unnamed: 0,city_name,place_id,lat,lon,hotel_name,hotel_url,score,description,location,hotel_latitude,hotel_longtitude
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
1,Annecy,298516920.0,45.899235,6.128885,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546
2,Ariège,297389050.0,42.945537,1.406554,Les gîtes de Beille,https://www.booking.com/hotel/fr/les-gites-de-...,8.1,Located in Les Cabannes in the Midi-Pyrénées r...,"25 Quartier la Bexane, 09310 Les Cabannes, France",42.7855571,1.68166541
3,Ariège,297389050.0,42.945537,1.406554,Le petit nid,https://www.booking.com/hotel/fr/le-petit-nid-...,9.2,"Offering garden views, Le petit nid is an acco...","RDC 1 Rue de la Place, 09330 Montgaillard, France",42.93227802,1.63516787
4,Ariège,297389050.0,42.945537,1.406554,Appartement Fontargente,https://www.booking.com/hotel/fr/appartement-f...,9.2,Appartement Fontargente is located in Ax-les-T...,premier étage 19 Avenue Docteur François Gomma...,42.71847547,1.84064515


### Enriching the table with weather data

We need to add a 7-day weather forecast for each hotel, based on the city where the hotel is situated. To do this, we repeat each line containing information about a hotel 7 times.

In [20]:
# Remembering how many hotels there are before the rows are duplicated (needed later)
nb_hotels = city_and_hotel_info_df.shape[0]

# Each index label repeated 7 times in a row: one copy of the hotel per forecast day
repeated_index = city_and_hotel_info_df.index.repeat(7)
city_and_hotel_info_df = city_and_hotel_info_df.loc[repeated_index]

# Checking the result
city_and_hotel_info_df.head(10)

Unnamed: 0,city_name,place_id,lat,lon,hotel_name,hotel_url,score,description,location,hotel_latitude,hotel_longtitude
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681
1,Annecy,298516920.0,45.899235,6.128885,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546
1,Annecy,298516920.0,45.899235,6.128885,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546
1,Annecy,298516920.0,45.899235,6.128885,GuestReady - Cozy perfection in the city centre,https://www.booking.com/hotel/fr/guestready-to...,8.0,"Located in the centre of Annecy, 37 km from Ro...","8 Rue Camille Dunant, 74000 Annecy, France",45.8999407,6.1270546


We will now create a key that will later help us merge the dataframe with the hotel information and the dataframe with the weather information.

In [21]:
# The numbers 1 to 7 stand for the next seven days
days = list(range(1, 8))

# One 1..7 sequence per hotel, assigned by position (each hotel occupies 7 consecutive rows)
day_numbers = days * nb_hotels
city_and_hotel_info_df["day_in_city"] = day_numbers

# Prefixing the day number with the hotel's city, e.g. "<city> day 3".
# This column will be used to attach the weather forecast of the city to each hotel.
day_labels = city_and_hotel_info_df["day_in_city"].astype(str)
city_and_hotel_info_df["day_in_city"] = city_and_hotel_info_df["city_name"] + " day " + day_labels

# Checking the resulting table
city_and_hotel_info_df.head()

Unnamed: 0,city_name,place_id,lat,lon,hotel_name,hotel_url,score,description,location,hotel_latitude,hotel_longtitude,day_in_city
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681,Ariège day 1
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681,Ariège day 2
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681,Ariège day 3
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681,Ariège day 4
0,Ariège,297389050.0,42.945537,1.406554,Lagrange Vacances Les Chalets d’Ax,https://www.booking.com/hotel/fr/residence-les...,7.6,Lagrange Vacances Les Chalets d’Ax is located ...,"Quartier De Castel Maou - Chemin d'Aouredou -,...",42.71383156,1.83920681,Ariège day 5


In [22]:
# Reading the .csv file with weather data straight from the S3 bucket
url = 'https://kayak-booking-bucket-12-12-2022.s3.eu-west-3.amazonaws.com/weather_forecast.csv'
weather_df = pd.read_csv(filepath_or_buffer=url)

# Each line of the table contains the weather information for one city on one date
# of the next 7 days.
weather_df.head(n=10)

Unnamed: 0,dt,sunrise,sunset,pressure,humidity,dew_point,wind_speed,wind_deg,wind_gust,clouds,...,temp_eve,temp_morn,feels_like_day,feels_like_night,feels_like_eve,feels_like_morn,weather_id,weather_main,weather_description,city_name
0,2023-02-22 13:00:00,2023-02-22 08:02:13,2023-02-22 18:37:18,1014,93,7.63,7.12,330,10.53,94,...,8.39,6.85,6.03,3.88,5.9,5.31,500.0,Rain,light rain,Mont Saint-Michel
1,2023-02-23 13:00:00,2023-02-23 08:00:22,2023-02-23 18:38:55,1017,84,4.75,9.05,41,14.73,100,...,7.08,3.98,3.84,2.39,2.81,1.19,500.0,Rain,light rain,Mont Saint-Michel
2,2023-02-24 13:00:00,2023-02-24 07:58:29,2023-02-24 18:40:31,1012,65,2.63,8.4,44,14.43,78,...,6.79,4.68,6.06,3.78,3.61,0.51,803.0,Clouds,broken clouds,Mont Saint-Michel
3,2023-02-25 13:00:00,2023-02-25 07:56:35,2023-02-25 18:42:07,1016,62,0.45,8.55,49,12.59,69,...,5.15,4.42,3.74,-0.56,0.84,0.63,803.0,Clouds,broken clouds,Mont Saint-Michel
4,2023-02-26 13:00:00,2023-02-26 07:54:41,2023-02-26 18:43:42,1024,48,-4.39,9.55,51,14.8,8,...,3.52,1.85,0.85,-3.96,-1.56,-2.5,800.0,Clear,clear sky,Mont Saint-Michel
5,2023-02-27 13:00:00,2023-02-27 07:52:45,2023-02-27 18:45:17,1029,43,-6.21,7.96,47,12.29,2,...,2.38,-1.3,1.11,-4.65,-2.61,-6.57,800.0,Clear,clear sky,Mont Saint-Michel
6,2023-02-28 13:00:00,2023-02-28 07:50:49,2023-02-28 18:46:52,1022,47,-4.98,5.88,45,9.82,4,...,2.98,-2.49,1.98,-2.56,-0.79,-7.3,800.0,Clear,clear sky,Mont Saint-Michel
7,2023-02-22 13:00:00,2023-02-22 08:04:18,2023-02-22 18:39:21,1014,88,6.97,8.32,335,10.83,98,...,8.13,7.65,5.77,3.65,4.82,5.52,500.0,Rain,light rain,Saint-Malo
8,2023-02-23 13:00:00,2023-02-23 08:02:26,2023-02-23 18:40:57,1017,81,4.37,11.52,40,15.1,100,...,7.57,5.78,3.33,2.29,2.9,2.66,500.0,Rain,light rain,Saint-Malo
9,2023-02-24 13:00:00,2023-02-24 08:00:33,2023-02-24 18:42:33,1013,71,2.73,10.32,43,14.48,79,...,7.47,5.82,3.82,3.83,3.79,1.01,803.0,Clouds,broken clouds,Saint-Malo


In [23]:
# Harmonising the one known misspelling so that the city names (and therefore the keys built
# from them) agree with the hotel table. NOTE(review): this assumes the weather file carries
# the same "Marseilla" typo as the coordinates file; the line is a no-op if it does not.
weather_df["city_name"] = weather_df["city_name"].replace("Marseilla", "Marseille")

# Kept for reference: number of cities and the day numbers used in the keys
nb_cities = weather_df["city_name"].nunique()
days = [1, 2, 3, 4, 5, 6, 7]

# Numbering the rows of each city 1, 2, 3, ... in their order of appearance.
# Unlike the positional `days * nb_cities`, this neither raises nor shifts the numbering
# when a city does not have exactly 7 rows.
# Assumes the rows of each city are already in chronological order, as in the preview above.
day_numbers = weather_df.groupby("city_name").cumcount() + 1

# Creating the column that combines the city name and the number of the day
weather_df["day_in_city"] = weather_df["city_name"] + " day " + day_numbers.astype(str)

# Checking the result
weather_df.tail(10)

Unnamed: 0,dt,sunrise,sunset,pressure,humidity,dew_point,wind_speed,wind_deg,wind_gust,clouds,...,temp_morn,feels_like_day,feels_like_night,feels_like_eve,feels_like_morn,weather_id,weather_main,weather_description,city_name,day_in_city
235,2023-02-26 13:00:00,2023-02-26 07:48:16,2023-02-26 18:49:49,1013,59,-0.85,4.62,332,7.53,87,...,3.77,4.17,0.78,4.9,1.89,804.0,Clouds,overcast clouds,Bayonne,Bayonne day 5
236,2023-02-27 13:00:00,2023-02-27 07:46:38,2023-02-27 18:51:06,1021,49,-5.39,4.2,324,6.91,4,...,-0.9,2.05,0.49,1.54,-3.87,800.0,Clear,clear sky,Bayonne,Bayonne day 6
237,2023-02-28 13:00:00,2023-02-28 07:44:59,2023-02-28 18:52:23,1017,64,-1.44,2.3,317,2.59,82,...,0.31,3.05,2.24,3.14,0.31,803.0,Clouds,broken clouds,Bayonne,Bayonne day 7
238,2023-02-22 13:00:00,2023-02-22 07:57:02,2023-02-22 18:39:37,1016,94,9.3,6.53,349,9.26,94,...,10.0,9.75,5.26,9.55,8.29,500.0,Rain,light rain,La Rochelle,La Rochelle day 1
239,2023-02-23 13:00:00,2023-02-23 07:55:19,2023-02-23 18:41:05,1014,89,3.88,8.72,350,10.69,100,...,5.67,1.79,6.06,3.41,1.95,500.0,Rain,light rain,La Rochelle,La Rochelle day 2
240,2023-02-24 13:00:00,2023-02-24 07:53:35,2023-02-24 18:42:32,1007,93,7.48,9.82,50,15.87,100,...,8.69,6.3,3.6,4.46,4.57,501.0,Rain,moderate rain,La Rochelle,La Rochelle day 3
241,2023-02-25 13:00:00,2023-02-25 07:51:50,2023-02-25 18:43:59,1011,68,1.96,9.13,59,13.54,97,...,6.29,3.06,-0.25,2.9,1.68,500.0,Rain,light rain,La Rochelle,La Rochelle day 4
242,2023-02-26 13:00:00,2023-02-26 07:50:05,2023-02-26 18:45:25,1017,48,-4.64,11.85,45,15.83,5,...,2.49,-0.12,-3.41,-1.48,-3.6,800.0,Clear,clear sky,La Rochelle,La Rochelle day 5
243,2023-02-27 13:00:00,2023-02-27 07:48:18,2023-02-27 18:46:51,1024,45,-6.1,9.92,37,15.48,0,...,-0.05,-0.59,-3.54,-1.02,-6.69,800.0,Clear,clear sky,La Rochelle,La Rochelle day 6
244,2023-02-28 13:00:00,2023-02-28 07:46:31,2023-02-28 18:48:17,1019,52,-4.47,7.99,33,13.48,98,...,-0.35,0.48,-0.45,0.73,-6.33,804.0,Clouds,overcast clouds,La Rochelle,La Rochelle day 7


In [24]:
# Displaying the current column order of the weather table
print(weather_df.columns)

# Putting the two key columns first. 'city_name' is kept in both dataframes so that the
# correctness of the merge can be checked afterwards.
key_columns = ['day_in_city', 'city_name']
forecast_columns = [
    'dt', 'sunrise', 'sunset', 'pressure', 'humidity', 'dew_point',
    'wind_speed', 'wind_deg', 'wind_gust', 'clouds', 'pop', 'uvi', 'rain', 'snow',
    'temp_day', 'temp_min', 'temp_max', 'temp_night', 'temp_eve', 'temp_morn',
    'feels_like_day', 'feels_like_night', 'feels_like_eve', 'feels_like_morn',
    'weather_id', 'weather_main', 'weather_description',
]
weather_df = weather_df[key_columns + forecast_columns]

# Left join on the two shared columns (the same keys pandas infers by default),
# so that every hotel/day row is kept
kayak_df = pd.merge(city_and_hotel_info_df, weather_df, how="left", on=key_columns)

# Checking the result (commented to avoid long output)
#pd.set_option('display.max_columns', None)
#kayak_df.tail(15)

Index(['dt', 'sunrise', 'sunset', 'pressure', 'humidity', 'dew_point',
       'wind_speed', 'wind_deg', 'wind_gust', 'clouds', 'pop', 'rain', 'uvi',
       'snow', 'temp_day', 'temp_min', 'temp_max', 'temp_night', 'temp_eve',
       'temp_morn', 'feels_like_day', 'feels_like_night', 'feels_like_eve',
       'feels_like_morn', 'weather_id', 'weather_main', 'weather_description',
       'city_name', 'day_in_city'],
      dtype='object')


Now we have the dataframe with all the necessary information, but some adjustments have to be made.

In [25]:
# Renaming some columns for clarity.
# One mapping and one non-mutating call replace four separate in-place renames.
kayak_df = kayak_df.rename(
    columns={
        'place_id': 'city_id',
        'lat': 'city_latitude',
        # Spelling kept as is: the column-reordering cell and the exported CSV use this name.
        'lon': 'city_longtitude',
        'dt': 'date',
    }
)

Values in some columns are still in string format while they should be in float or datetime format:

In [26]:
# Checking data types present in the dataframe (commented to avoid long output)
#kayak_df.dtypes

# Converting the values in the columns "date", "sunrise", "sunset" to datetime type
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
datetime_columns = ["date", "sunrise", "sunset"]
for column in datetime_columns:
    kayak_df[column] = pd.to_datetime(kayak_df[column], format=DATETIME_FORMAT)

# Creating a column with the name of the day of the week.
# The vectorised .dt accessor replaces a row-by-row apply(lambda ...) and gives the same names.
kayak_df["day_of_week"] = kayak_df["date"].dt.day_name()

# Checking what are the remaining columns that contain values in string ("object") format.
# This is printed before the float conversion below, so the numeric columns stored as
# strings are still listed here.
print(kayak_df.select_dtypes(include=['object']).columns)

# Converting the values in the columns 'score', 'hotel_latitude', 'hotel_longtitude' to float type
float_columns = ['score', 'hotel_latitude', 'hotel_longtitude']
kayak_df = kayak_df.astype({column: float for column in float_columns})

Index(['city_name', 'hotel_name', 'hotel_url', 'score', 'description',
       'location', 'hotel_latitude', 'hotel_longtitude', 'day_in_city',
       'weather_main', 'weather_description', 'day_of_week'],
      dtype='object')


In [27]:
# Checking data types (commented to avoid long output)
#kayak_df.dtypes

# Showing the current column names before rearranging them
print(kayak_df.columns)

# Final column layout, built from topical groups: city, hotel, date, weather, temperatures
city_columns = ['city_name', 'city_id', 'city_latitude', 'city_longtitude']
hotel_columns = [
    'hotel_name', 'hotel_url', 'score', 'description', 'location',
    'hotel_latitude', 'hotel_longtitude',
]
date_columns = ['date', 'day_of_week', 'day_in_city', 'sunrise', 'sunset']
weather_columns = [
    'weather_main', 'weather_description', 'weather_id',
    'pressure', 'humidity', 'dew_point',
    'wind_speed', 'wind_deg', 'wind_gust',
    'clouds', 'pop', 'uvi', 'rain', 'snow',
]
temperature_columns = [
    'temp_day', 'temp_min', 'temp_max', 'temp_night', 'temp_eve', 'temp_morn',
    'feels_like_day', 'feels_like_night', 'feels_like_eve', 'feels_like_morn',
]

# Rearranging the order of columns for clarity
kayak_df = kayak_df[
    city_columns + hotel_columns + date_columns + weather_columns + temperature_columns
]

Index(['city_name', 'city_id', 'city_latitude', 'city_longtitude',
       'hotel_name', 'hotel_url', 'score', 'description', 'location',
       'hotel_latitude', 'hotel_longtitude', 'day_in_city', 'date', 'sunrise',
       'sunset', 'pressure', 'humidity', 'dew_point', 'wind_speed', 'wind_deg',
       'wind_gust', 'clouds', 'pop', 'uvi', 'rain', 'snow', 'temp_day',
       'temp_min', 'temp_max', 'temp_night', 'temp_eve', 'temp_morn',
       'feels_like_day', 'feels_like_night', 'feels_like_eve',
       'feels_like_morn', 'weather_id', 'weather_main', 'weather_description',
       'day_of_week'],
      dtype='object')


Saving the .csv file locally in the working folder

In [28]:
# Writing the dataframe to "kayak.csv" in the working folder, without the index column
kayak_df.to_csv(path_or_buf="kayak.csv", index=False)

Saving the .csv file to the S3 bucket

In [29]:
# The upload is opt-in to avoid unnecessary rewriting of the file in the bucket:
# set UPLOAD_TO_S3 to True to run it.
# An explicit flag replaces the previous triple-quoted string, which the notebook
# evaluated and echoed as the cell output.
UPLOAD_TO_S3 = False

if UPLOAD_TO_S3:
    # Access key for user with access to write in S3 bucket
    S3_ACCESS_KEY_ID = os.getenv("S3_ACCESS_KEY_ID")
    # Secret key for user with access to write in S3 bucket
    S3_SECRET_ACCESS_KEY = os.getenv("S3_SECRET_ACCESS_KEY")

    # Writing the .csv file to bucket S3
    session = boto3.Session(
        aws_access_key_id=S3_ACCESS_KEY_ID,
        aws_secret_access_key=S3_SECRET_ACCESS_KEY,
    )
    s3 = session.resource("s3")
    bucket = s3.Bucket("kayak-booking-bucket-12-12-2022")
    bucket.upload_file("kayak.csv", Key="kayak.csv")

'# Access key for user with access to write in S3 bucket\nS3_ACCESS_KEY_ID =  os.getenv("S3_ACCESS_KEY_ID")\n# Secret key for user with access to write in S3 bucket \nS3_SECRET_ACCESS_KEY =  os.getenv("S3_SECRET_ACCESS_KEY")\n\n# Writing the .csv file to bucket S3\nsession = boto3.Session(aws_access_key_id=S3_ACCESS_KEY_ID, \n                      aws_secret_access_key=S3_SECRET_ACCESS_KEY)\ns3 = session.resource("s3")\nbucket = s3.Bucket("kayak-booking-bucket-12-12-2022") \nbucket.upload_file("kayak.csv", Key="kayak.csv")'