
# 🌍 Kayak Destination Recommender Project

Ce notebook implémente le pipeline complet pour le projet Kayak :  
- Géocodage des villes avec **Nominatim**
- Récupération des prévisions météo (5 jours, par pas de 3 h) via **OpenWeatherMap**
- Scraping des hôtels depuis **Booking.com**
- Agrégation et enrichissement des données
- Export vers **CSV**, **S3**, et **RDS**
- Visualisation des résultats avec **Plotly**


In [29]:

import os
import time
import requests
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import plotly.express as px
from datetime import datetime
from bs4 import BeautifulSoup
import boto3
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment;
# the returned flag is printed as a quick sanity check.
loaded = load_dotenv()
print("dotenv loaded:", loaded)

# Execution flags: switch individual pipeline stages on or off.
# NOTE: the hotel-scraping cell reassigns DO_SCRAPE_HOTELS to True before using it,
# and DO_UPLOAD_S3 / DO_LOAD_RDS are not read by any cell visible in this notebook.
DO_GEOCODE = True
DO_WEATHER = True
DO_SCRAPE_HOTELS = False
DO_UPLOAD_S3 = False
DO_LOAD_RDS = False

# API keys and credentials come from the environment (never hard-coded)
OWM_API_KEY = os.getenv("OWM_API_KEY")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
S3_BUCKET = os.getenv("S3_BUCKET", "projet-kayak")
S3_REGION = os.getenv("S3_REGION", "eu-west-3")

# PostgreSQL connection settings; the defaults apply when the variable is unset.
# PG_PWD has no default and is None when the variable is missing.
PG_HOST = os.getenv("PG_HOST", "kayakdatabase.cluster-cxissw2oqngk.eu-west-3.rds.amazonaws.com")
PG_DB   = os.getenv("PG_DB", "kayakdatabase")
PG_USER = os.getenv("PG_USER", "postgres")
PG_PWD  = os.getenv("PG_PWD") 

dotenv loaded: True


In [30]:

# Destinations to evaluate; the names are plain ASCII (no accents, no hyphens)
# and are later used verbatim in geocoding and Booking search queries.
cities = [
"Mont Saint Michel","St Malo","Bayeux","Le Havre","Rouen","Paris","Amiens","Lille","Strasbourg",
"Chateau du Haut Koenigsbourg","Colmar","Eguisheim","Besancon","Dijon","Annecy","Grenoble","Lyon",
"Gorges du Verdon","Bormes les Mimosas","Cassis","Marseille","Aix en Provence","Avignon","Uzes",
"Nimes","Aigues Mortes","Saintes Maries de la mer","Collioure","Carcassonne","Ariege","Toulouse",
"Montauban","Biarritz","Bayonne","La Rochelle"
]
# One row per city, with a 1-based sequential identifier in the "id" column
df_cities = pd.DataFrame({"city": cities, "id": range(1, len(cities)+1)})
df_cities.head()

Unnamed: 0,city,id
0,Mont Saint Michel,1
1,St Malo,2
2,Bayeux,3
3,Le Havre,4
4,Rouen,5


In [31]:

if DO_GEOCODE:
    # Resolve every city name to a (lat, lon) pair through the Nominatim search API.
    # Cities that cannot be resolved get (None, None) so the column lengths still match.
    url = "https://nominatim.openstreetmap.org/search"
    coords = []
    for city in df_cities['city']:
        try:
            r = requests.get(
                url,
                # Let requests percent-encode the query instead of hand-building the URL,
                # so spaces, accents or '&' in a city name cannot corrupt the request.
                params={"q": f"{city},France", "format": "json", "limit": 1},
                headers={"User-Agent": "kayak-project"},
                timeout=10,
            )
            r.raise_for_status()
            data = r.json()
            if data:
                coords.append((float(data[0]['lat']), float(data[0]['lon'])))
            else:
                print(f"[WARN] Aucun résultat Nominatim pour {city}")
                coords.append((None, None))
        except Exception as e:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops
            # the loop; the failure is reported instead of being silently dropped.
            print(f"[ERR] {city}: {e}")
            coords.append((None, None))
        # Pause between calls to stay polite with the public geocoding service
        time.sleep(1)
    df_cities['lat'], df_cities['lon'] = zip(*coords)
df_cities.head()


Unnamed: 0,city,id,lat,lon
0,Mont Saint Michel,1,48.635954,-1.51146
1,St Malo,2,48.649518,-2.026041
2,Bayeux,3,49.276462,-0.702474
3,Le Havre,4,49.493898,0.107973
4,Rouen,5,49.440459,1.093966


In [32]:
# Re-read the OpenWeatherMap key and stop immediately if it is missing or blank
OWM_API_KEY = os.getenv("OWM_API_KEY")
assert OWM_API_KEY and OWM_API_KEY not in ("None",""), "OWM_API_KEY est vide/introuvable. Définis-la avant de lancer."


# Detect which coordinate column names df_cities carries; the weather loop below
# reads the coordinates through LAT_COL / LON_COL.
if {"latitude","longitude"}.issubset(df_cities.columns):
    LAT_COL, LON_COL = "latitude", "longitude"
elif {"lat","lon"}.issubset(df_cities.columns):
    LAT_COL, LON_COL = "lat", "lon"
else:
    raise KeyError("df_cities doit contenir (latitude, longitude) ou (lat, lon).")


def compute_weather_score_forecast(forecast_list):
    """Condense a list of forecast entries into a single score.

    The score is the mean temperature minus the mean rain amount, in whatever
    units the entries carry.

    Args:
        forecast_list: iterable of dicts. An entry contributes a temperature
            when its "main" dict holds a "temp" key; every entry contributes a
            rain amount taken from entry["rain"]["3h"], counted as 0 when the
            key is absent or the value is falsy.

    Returns:
        The score as a float, or NaN when no entry carries a temperature.
    """
    temperatures, rainfall = [], []
    for entry in forecast_list:
        main_block = entry.get("main", {})
        if "temp" in main_block:
            temperatures.append(float(main_block["temp"]))
        rain_3h = entry.get("rain", {}).get("3h", 0)
        rainfall.append(float(rain_3h or 0))

    # Without any temperature there is nothing meaningful to score
    if len(temperatures) == 0:
        return np.nan

    mean_temperature = np.mean(temperatures)
    mean_rain = np.mean(rainfall)
    return float(mean_temperature - mean_rain)

def fetch_forecast(lat, lon, api_key, timeout=15):
    """Fetch the OpenWeatherMap forecast for one coordinate pair.

    Args:
        lat: latitude sent as the "lat" query parameter.
        lon: longitude sent as the "lon" query parameter.
        api_key: OpenWeatherMap key sent as "appid".
        timeout: request timeout in seconds.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: when the response status signals an error
            (raised by ``raise_for_status``).
    """
    response = requests.get(
        "https://api.openweathermap.org/data/2.5/forecast",
        params=dict(lat=lat, lon=lon, units="metric", appid=api_key),
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


if DO_WEATHER:
    # One score per city, kept in the same order as df_cities; NaN marks any city
    # for which no score could be computed.
    scores = []
    for _, row in df_cities.iterrows():
        lat, lon = row[LAT_COL], row[LON_COL]
        if pd.isna(lat) or pd.isna(lon):
            # Geocoding failed for this city: nothing to query
            scores.append(np.nan)
            continue
        try:
            js = fetch_forecast(lat, lon, OWM_API_KEY)
            forecast_entries = js.get("list", [])
            if not forecast_entries:
                # Useful log for diagnosing unexpected payloads
                print(f"[WARN] Pas de 'list' pour {row['city']} — clés: {list(js.keys())}")
                scores.append(np.nan)
            else:
                scores.append(compute_weather_score_forecast(forecast_entries))
        except requests.HTTPError as e:
            status = e.response.status_code if e.response is not None else "?"
            body   = e.response.text[:140] if e.response is not None else ""
            print(f"[HTTP {status}] {row['city']}: {body}")
            scores.append(np.nan)
        except Exception as e:
            print(f"[ERR] {row['city']}: {e}")
            scores.append(np.nan)
        # Throttle successive API calls
        time.sleep(1)
    df_cities["weather_score"] = scores

# When DO_WEATHER is off and the column was never created, sorting on it would
# raise KeyError; fall back to a plain preview in that case.
(
    df_cities.sort_values("weather_score", ascending=False).head(10)
    if "weather_score" in df_cities.columns
    else df_cities.head(10)
)


Unnamed: 0,city,id,lat,lon,weather_score
20,Marseille,21,43.296174,5.369953,24.155
21,Aix en Provence,22,43.529842,5.447474,23.68875
24,Nimes,25,43.837425,4.360069,23.30675
22,Avignon,23,43.949249,4.805901,23.2925
19,Cassis,20,43.214036,5.539632,23.15525
27,Collioure,28,42.52505,3.083155,22.91775
25,Aigues Mortes,26,43.566152,4.19154,22.886
18,Bormes les Mimosas,19,43.150697,6.341928,22.77425
26,Saintes Maries de la mer,27,43.451592,4.42772,22.4425
23,Uzes,24,44.012128,4.419672,22.2505


In [33]:
import os, time, re
import pandas as pd
from datetime import datetime, timedelta

# SCRAPING BOOKING AVEC UC (anti-bot)
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Folder receiving the per-city HTML dump and screenshot written by the scraper
# when debug_save is enabled; created up front so the writes cannot fail on a missing directory.
DEBUG_DIR = "debug_booking"
os.makedirs(DEBUG_DIR, exist_ok=True)

def build_booking_url(city: str, nights: int = 2, days_ahead: int = 14):
    """Build the Booking.com search-results URL for a stay in `city`, France.

    Args:
        city: destination name; it is percent-encoded, so accents, apostrophes
            or '&' in the name cannot break the query string.
        nights: length of the stay in nights.
        days_ahead: number of days between today and the check-in date.

    Returns:
        The search URL as a string (2 adults, 1 room, prices in EUR, sorted by
        popularity).
    """
    # Local import so the function does not depend on another cell's imports
    from urllib.parse import quote_plus

    # Take "today" once so check-in and check-out cannot straddle midnight
    first_night = datetime.today() + timedelta(days=days_ahead)
    checkin = first_night.strftime("%Y-%m-%d")
    checkout = (first_night + timedelta(days=nights)).strftime("%Y-%m-%d")
    # Spaces become '+'; the comma is kept literal so plain ASCII names yield
    # exactly the same URL as before.
    destination = quote_plus(f"{city},France", safe=",")
    return (
        "https://www.booking.com/searchresults.fr.html"
        f"?ss={destination}"
        f"&checkin={checkin}&checkout={checkout}"
        "&group_adults=2&no_rooms=1&group_children=0"
        "&selected_currency=EUR&order=popularity"
        "&sb_travel_purpose=leisure"
        "&lang=fr&srpvid=abcd"
    )

def start_uc(headless: bool = True):
    """Create and return an undetected-chromedriver Chrome instance.

    Args:
        headless: when truthy, Chrome is started with ``--headless=new``.

    Returns:
        The driver returned by ``uc.Chrome``; the caller is responsible for
        quitting it.
    """
    options = uc.ChromeOptions()
    # Command-line switches, applied in this exact order
    chrome_flags = ["--headless=new"] if headless else []
    chrome_flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--start-maximized",
        "--disable-blink-features=AutomationControlled",
        "--lang=fr-FR",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36",
    ]
    for flag in chrome_flags:
        options.add_argument(flag)
    return uc.Chrome(options=options)

def accept_cookies(driver, timeout: float = 4):
    """Try to dismiss the cookie-consent banner; best effort.

    The candidate selectors are tried in order, and the first button that
    becomes clickable within `timeout` seconds is clicked.

    Args:
        driver: Selenium-compatible driver showing the page.
        timeout: seconds to wait for each selector. In the worst case (no
            banner at all) the call lasts about ``len(selectors) * timeout``.

    Returns:
        True when a button was clicked, False when none of the selectors matched.
    """
    selectors = [
        "#onetrust-accept-btn-handler",
        "button[aria-label='Accepter']",
        "button[aria-label='Accept']",
        "button[aria-label*='cookies']",
        "[data-testid='accept-all-cookies-button']",
    ]
    for sel in selectors:
        try:
            btn = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, sel))
            )
            btn.click()
        except Exception:
            # Selector absent or not clickable: try the next one. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupts through.
            continue
        # Give the banner a moment to disappear before the page is used
        time.sleep(1)
        return True
    return False

def slow_scroll(driver, steps=4, pause=1.2):
    """Scroll the page from top to bottom in `steps` equal increments.

    The page height is read once through ``document.body.scrollHeight``; the
    function then scrolls to each intermediate offset and sleeps `pause`
    seconds after every scroll.

    Args:
        driver: object exposing ``execute_script``.
        steps: number of scroll increments.
        pause: seconds slept after each increment.
    """
    page_height = driver.execute_script("return document.body.scrollHeight")
    offsets = [int(page_height * step / steps) for step in range(1, steps + 1)]
    for offset in offsets:
        driver.execute_script(f"window.scrollTo(0, {offset});")
        time.sleep(pause)

def _first_card_value(card, selectors, extract, accept=bool):
    """Return the first accepted value extracted from `card`, or None.

    Each CSS selector is tried in order; lookup or extraction failures simply
    move on to the next selector.
    """
    for sel in selectors:
        try:
            value = extract(card.find_element(By.CSS_SELECTOR, sel))
            if accept(value):
                return value
        except Exception:
            # Missing or stale element: fall through to the next selector.
            # Exception (not a bare except) keeps the scrape interruptible.
            continue
    return None


def parse_cards(driver, max_hotels=5):
    """Extract name, link and raw score text from the first result cards.

    Args:
        driver: Selenium-compatible driver positioned on a results page.
        max_hotels: maximum number of cards to read.

    Returns:
        A list of dicts with keys "hotel_name", "url" and "score"; a field is
        None when none of its candidate selectors produced a value. The score
        is the raw element text, not yet converted to a number.
    """
    cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='property-card']")
    if not cards:
        # Fallback selector used when the data-testid cards are not found
        cards = driver.find_elements(By.CSS_SELECTOR, "div.sr_property_block")

    def stripped_text(element):
        return element.text.strip()

    rows = []
    for card in cards[:max_hotels]:
        name = _first_card_value(
            card,
            ["div[data-testid='title']", ".sr-hotel__name", ".fcab3ed991"],
            stripped_text,
        )
        url = _first_card_value(
            card,
            ["a[data-testid='title-link']", "a.sr-hotel__name", "a"],
            lambda element: element.get_attribute("href"),
            # Only keep links that actually point to booking.com
            accept=lambda href: bool(href) and "booking.com" in href,
        )
        score = _first_card_value(
            card,
            ["div[data-testid='review-score']", ".bui-review-score__badge", ".b5cd09854e"],
            stripped_text,
        )
        rows.append({"hotel_name": name, "url": url, "score": score})
    return rows

def _parse_booking_score(text):
    """Return the first number found in a score label as a float, else None.

    Accepts both "8,4" and "8.4" notations.
    """
    if not text:
        return None
    match = re.search(r"(\d+[.,]?\d*)", text)
    if match is None:
        return None
    return float(match.group(1).replace(",", "."))


def scrape_hotels_city_uc(city: str, max_hotels: int = 5, headless: bool = True, debug_save=True):
    """Scrape the first Booking.com search results for one city.

    A fresh browser is started for the call and always closed before returning.

    Args:
        city: destination name passed to ``build_booking_url``.
        max_hotels: maximum number of result cards to read.
        headless: forwarded to ``start_uc``.
        debug_save: when truthy, the page HTML and a screenshot are written to
            DEBUG_DIR (best effort: a failed dump never aborts the scrape).

    Returns:
        A list of dicts with keys "city", "hotel_name", "url" and "score"
        (score as float or None); cards with no usable field are dropped.
    """
    driver = start_uc(headless=headless)
    url = build_booking_url(city)
    out = []
    try:
        driver.get(url)
        time.sleep(3)
        accept_cookies(driver)

        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='property-card'], div.sr_property_block"))
            )
        except Exception:
            # Best effort: if no card shows up in time, parse_cards() below just
            # returns an empty list. Exception (not bare) keeps Ctrl-C working.
            pass

        # Scroll through the page before parsing
        slow_scroll(driver, steps=5, pause=1.0)

        if debug_save:
            # Replace anything that is not a word character or '-' so a city name
            # can never escape DEBUG_DIR or produce an invalid file name.
            safe_name = re.sub(r"[^\w\-]", "_", city)
            html_path = os.path.join(DEBUG_DIR, f"{safe_name}.html")
            png_path  = os.path.join(DEBUG_DIR, f"{safe_name}.png")
            try:
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(driver.page_source)
            except Exception as e:
                print(f"[WARN] debug HTML {city}: {e}")
            try:
                driver.save_screenshot(png_path)
            except Exception as e:
                print(f"[WARN] debug PNG {city}: {e}")

        for card in parse_cards(driver, max_hotels=max_hotels):
            record = {
                "city": city,
                "hotel_name": card.get("hotel_name"),
                "url": card.get("url"),
                "score": _parse_booking_score(card.get("score")),
            }
            # Keep the card only if at least one field carries information
            if record["hotel_name"] or record["url"] or record["score"] is not None:
                out.append(record)

    finally:
        driver.quit()
    return out

# MAIN LOOP
# NOTE: this assignment overrides the DO_SCRAPE_HOTELS value set in the configuration cell.
DO_SCRAPE_HOTELS = True

# Expected schema of df_hotels, shared by both branches below
HOTEL_COLUMNS = ["city", "hotel_name", "url", "score"]

if DO_SCRAPE_HOTELS:
    all_hotels = []
    test_batch = cities
    for c in test_batch:
        print(f"Booking – {c} ...")
        try:
            rows = scrape_hotels_city_uc(c, max_hotels=5, headless=False, debug_save=True)
        except Exception as e:
            # One failing city (browser crash, network error...) must not discard
            # the results already collected for the other cities.
            print(f"[ERR] {c}: {e}")
            rows = []
        print(f"  -> {len(rows)} hôtels")
        all_hotels.extend(rows)
        time.sleep(3)

    # Passing the columns explicitly guarantees the schema even when nothing was
    # scraped; a column-less frame would break the later merge on "city".
    df_hotels = pd.DataFrame(all_hotels, columns=HOTEL_COLUMNS).drop_duplicates(subset=["city","hotel_name"], keep="first")
    print("Total hôtels collectés :", len(df_hotels))
    df_hotels.to_csv("hotels.csv", index=False)
else:
    df_hotels = pd.DataFrame(columns=HOTEL_COLUMNS)

df_hotels.head(10)



Booking – Mont Saint Michel ...
  -> 5 hôtels
Booking – St Malo ...
  -> 5 hôtels
Booking – Bayeux ...
  -> 5 hôtels
Booking – Le Havre ...
  -> 5 hôtels
Booking – Rouen ...
  -> 5 hôtels
Booking – Paris ...
  -> 5 hôtels
Booking – Amiens ...
  -> 5 hôtels
Booking – Lille ...
  -> 5 hôtels
Booking – Strasbourg ...
  -> 5 hôtels
Booking – Chateau du Haut Koenigsbourg ...
  -> 5 hôtels
Booking – Colmar ...
  -> 5 hôtels
Booking – Eguisheim ...
  -> 5 hôtels
Booking – Besancon ...
  -> 5 hôtels
Booking – Dijon ...
  -> 5 hôtels
Booking – Annecy ...
  -> 5 hôtels
Booking – Grenoble ...
  -> 5 hôtels
Booking – Lyon ...
  -> 5 hôtels
Booking – Gorges du Verdon ...
  -> 5 hôtels
Booking – Bormes les Mimosas ...
  -> 5 hôtels
Booking – Cassis ...
  -> 5 hôtels
Booking – Marseille ...
  -> 5 hôtels
Booking – Aix en Provence ...
  -> 5 hôtels
Booking – Avignon ...
  -> 5 hôtels
Booking – Uzes ...
  -> 5 hôtels
Booking – Nimes ...
  -> 5 hôtels
Booking – Aigues Mortes ...
  -> 5 hôtels
Booking – 

Unnamed: 0,city,hotel_name,url,score
0,Mont Saint Michel,Apparthôtel Mont Saint Michel - Résidence Fleu...,https://www.booking.com/hotel/fr/residence-fle...,8.4
1,Mont Saint Michel,Gîtes le Mont Desclos Saint Michel,https://www.booking.com/hotel/fr/gites-le-mont...,9.1
2,Mont Saint Michel,Hotel Rose,https://www.booking.com/hotel/fr/gue-de-beauvo...,8.5
3,Mont Saint Michel,La Greve Saint Michel,https://www.booking.com/hotel/fr/la-greve-sain...,8.9
4,Mont Saint Michel,Résidence Beauvoir le Mont-Saint-Michel (9 gît...,https://www.booking.com/hotel/fr/residence-bea...,9.5
5,St Malo,La Rotonde,https://www.booking.com/hotel/fr/la-rotonde-st...,7.5
6,St Malo,Ker Edouard,https://www.booking.com/hotel/fr/ker-edouard-t...,8.2
7,St Malo,Brit Hotel Le Surcouf,https://www.booking.com/hotel/fr/le-surcouf.fr...,7.8
8,St Malo,Best Western Alexandra,https://www.booking.com/hotel/fr/best-western-...,8.4
9,St Malo,Hotel Ar Terra Nova,https://www.booking.com/hotel/fr/ar-terra-nova...,7.8


In [34]:

# Attach every scraped hotel to its city row; the left join keeps cities without hotels.
# NOTE: df_final is reassigned by the aggregation cell further down.
df_final = df_cities.merge(df_hotels, on="city", how="left")
# Timestamped file name (local time, minute resolution)
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
df_final.to_csv(f"kayak_final_{timestamp}.csv", index=False)
df_final.head()


Unnamed: 0,city,id,lat,lon,weather_score,hotel_name,url,score
0,Mont Saint Michel,1,48.635954,-1.51146,16.905,Apparthôtel Mont Saint Michel - Résidence Fleu...,https://www.booking.com/hotel/fr/residence-fle...,8.4
1,Mont Saint Michel,1,48.635954,-1.51146,16.905,Gîtes le Mont Desclos Saint Michel,https://www.booking.com/hotel/fr/gites-le-mont...,9.1
2,Mont Saint Michel,1,48.635954,-1.51146,16.905,Hotel Rose,https://www.booking.com/hotel/fr/gue-de-beauvo...,8.5
3,Mont Saint Michel,1,48.635954,-1.51146,16.905,La Greve Saint Michel,https://www.booking.com/hotel/fr/la-greve-sain...,8.9
4,Mont Saint Michel,1,48.635954,-1.51146,16.905,Résidence Beauvoir le Mont-Saint-Michel (9 gît...,https://www.booking.com/hotel/fr/residence-bea...,9.5


In [35]:
import pandas as pd
from datetime import datetime

# Starting from:
# df_cities : columns => city, id, lat, lon, (optional: weather_score)
# df_hotels : columns => city, hotel_name, url, score

# A. ID de ville + normalisation de noms pour join robuste
def norm(s):
    """Normalize a city name into a join key.

    The value is converted to ``str`` and lower-cased, hyphens and apostrophes
    become spaces, and every run of whitespace is collapsed to a single space
    with no leading or trailing blanks. For example "Saint - Malo" and
    "saint malo" map to the same key.
    """
    text = str(s).lower().replace("-", " ").replace("'", " ")
    # split()/join collapses runs of any length, unlike one replace("  ", " ") pass
    return " ".join(text.split())

# Needed for the timezone-aware UTC timestamp in section E
from datetime import timezone

# A (cont.) Add a city id and a normalized name used as a robust join key
df_cities = df_cities.reset_index(drop=True).copy()
if "city_id" not in df_cities.columns:
    # Guarded so that re-running the cell does not fail on an existing column
    df_cities.insert(0, "city_id", range(1, len(df_cities) + 1))
df_cities["city_norm"] = df_cities["city"].map(norm)

df_hotels = df_hotels.copy()
df_hotels["city_norm"] = df_hotels["city"].map(norm)

# B. Join hotels to cities to attach city_id and coordinates to each hotel.
# Both frames have a "city" column: the hotel one keeps its name, the city one
# gets the "_city" suffix.
df_hotels_city = df_hotels.merge(
    df_cities[["city_id","city","city_norm","lat","lon"]],
    on="city_norm", how="left", suffixes=("","_city")
)

# C. Hotel KPIs per city (count, mean score, best score)
df_hotels_city["score"] = pd.to_numeric(df_hotels_city["score"], errors="coerce")
agg_hotels = df_hotels_city.groupby(["city_id","city"], as_index=False).agg(
    nb_hotels=("hotel_name","nunique"),
    avg_score=("score","mean"),
    max_score=("score","max")
)

# D. Merge with the weather data (weather_score in df_cities, when present)
cols_keep = ["city_id","city","lat","lon"]
if "weather_score" in df_cities.columns:
    cols_keep.append("weather_score")

df_final = df_cities[cols_keep].merge(agg_hotels, on=["city_id","city"], how="left")

# E. Timestamped CSV exports (UTC). datetime.utcnow() is deprecated; the
# timezone-aware call formats to the same string.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
hotels_csv = f"hotels_{timestamp}.csv"
final_csv  = f"final_dataset_{timestamp}.csv"

df_hotels_city[["city_id","city","hotel_name","url","score"]].to_csv(hotels_csv, index=False)
df_final.to_csv(final_csv, index=False)

print("Fichiers écrits :")
print(" -", hotels_csv)
print(" -", final_csv)

df_final.head()

Fichiers écrits :
 - hotels_20250826_113312.csv
 - final_dataset_20250826_113312.csv


  timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


Unnamed: 0,city_id,city,lat,lon,weather_score,nb_hotels,avg_score,max_score
0,1,Mont Saint Michel,48.635954,-1.51146,16.905,5,8.88,9.5
1,2,St Malo,48.649518,-2.026041,17.459,5,7.94,8.4
2,3,Bayeux,49.276462,-0.702474,16.82375,5,8.44,9.5
3,4,Le Havre,49.493898,0.107973,17.62575,5,8.24,8.6
4,5,Rouen,49.440459,1.093966,17.1915,5,7.86,8.6


In [36]:
import os, io, boto3, pandas as pd
from sqlalchemy import create_engine, text
from urllib.parse import quote_plus

# Parameters, re-read from the environment.
# NOTE: the DO_UPLOAD_S3 / DO_LOAD_RDS flags of the configuration cell are not
# consulted here; running this cell always uploads and loads.
S3_BUCKET = os.getenv("S3_BUCKET", "projet-kayak")
S3_REGION = os.getenv("S3_REGION", "eu-west-3")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

PG_HOST = os.getenv("PG_HOST")
PG_DB   = os.getenv("PG_DB", "kayakdatabase")
PG_USER = os.getenv("PG_USER", "postgres")
PG_PWD  = os.getenv("PG_PWD")


# CSV -> S3
session = boto3.session.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=S3_REGION
)
s3 = session.client("s3")

s3_key_hotels = f"kayak/{os.path.basename(hotels_csv)}"
s3_key_final  = f"kayak/{os.path.basename(final_csv)}"


s3.upload_file(hotels_csv, S3_BUCKET, s3_key_hotels)
s3.upload_file(final_csv , S3_BUCKET, s3_key_final)
print("Upload S3 OK :")
print(f" - s3://{S3_BUCKET}/{s3_key_hotels}")
print(f" - s3://{S3_BUCKET}/{s3_key_final}")

# Read back from S3 -> DataFrame, so the database is loaded from what was actually stored
obj_final  = s3.get_object(Bucket=S3_BUCKET, Key=s3_key_final)
df_final_s3  = pd.read_csv(io.BytesIO(obj_final["Body"].read()))
obj_hotels = s3.get_object(Bucket=S3_BUCKET, Key=s3_key_hotels)
df_hotels_s3 = pd.read_csv(io.BytesIO(obj_hotels["Body"].read()))

# Fail with an explicit message rather than a TypeError inside quote_plus(None)
# or a connection attempt against the literal host "None".
missing_pg = [name for name, value in (("PG_HOST", PG_HOST), ("PG_PWD", PG_PWD)) if not value]
if missing_pg:
    raise RuntimeError(f"Variables d'environnement manquantes : {', '.join(missing_pg)}")

# Create the target database if needed (connected to the default 'postgres' database).
# AUTOCOMMIT is set because CREATE DATABASE cannot run inside a transaction block.
engine_default = create_engine(
    f"postgresql+psycopg2://{quote_plus(PG_USER)}:{quote_plus(PG_PWD)}@{PG_HOST}:5432/postgres",
    isolation_level="AUTOCOMMIT",
)
with engine_default.connect() as con:
    exists = con.execute(text("SELECT 1 FROM pg_database WHERE datname=:d;"), {"d": PG_DB}).scalar()
    if not exists:
        # Identifiers cannot be bound as parameters: quote the name (doubling any
        # embedded double quote) so the created database matches PG_DB exactly,
        # as used by the existence check above and the connection URL below.
        pg_db_ident = '"' + PG_DB.replace('"', '""') + '"'
        con.execute(text(f"CREATE DATABASE {pg_db_ident};"))
        print(f"✅ Base '{PG_DB}' créée.")
    else:
        print(f"ℹ️  Base '{PG_DB}' déjà présente.")
# The bootstrap engine is no longer needed: release its pooled connections
engine_default.dispose()

# Load into RDS (target database)
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(PG_USER)}:{quote_plus(PG_PWD)}@{PG_HOST}:5432/{PG_DB}"
)
with engine.begin() as conn:
    # Minimal tables; if_exists="replace" drops and recreates them on every run
    df_final_s3.to_sql("destinations", conn, if_exists="replace", index=False)
    df_hotels_s3.to_sql("hotels", conn, if_exists="replace", index=False)
print("✅ Chargement RDS OK : tables 'destinations' et 'hotels'")


Upload S3 OK :
 - s3://projet-kayak/kayak/hotels_20250826_113312.csv
 - s3://projet-kayak/kayak/final_dataset_20250826_113312.csv
ℹ️  Base 'kayakdatabase' déjà présente.
✅ Chargement RDS OK : tables 'destinations' et 'hotels'


In [40]:
import plotly.express as px

# Top 5 cities, ranked by weather_score only (hotel data does not affect the ranking)
top5_dest = df_final.sort_values("weather_score", ascending=False).head(5)

# Map of the selected cities: bubble size = number of hotels, colour = weather score
fig1 = px.scatter_map(
    top5_dest,
    lat="lat", lon="lon",
    size="nb_hotels",
    color="weather_score",
    hover_name="city",
    zoom=5,
    map_style="open-street-map",
    title="🌞 Top 5 Destinations en France (Météo + Hôtels)"
)
fig1.show()

In [42]:
# Work on a copy so the plotting clean-up does not alter df_hotels_city
hotels_plot = df_hotels_city.copy()

# Type clean-up: coerce to numeric (errors="coerce", so unparsable values become NaN)
for c in ["lat","lon","score"]:
    if c in hotels_plot.columns:
        hotels_plot[c] = pd.to_numeric(hotels_plot[c], errors="coerce")

# Keep only hotels that have both coordinates and a score
hotels_plot = hotels_plot.dropna(subset=["lat","lon","score"])

# Top 20 by score
top20_hotels = hotels_plot.sort_values("score", ascending=False).head(20).copy()

# Readable bubbles: the lowest score of the selection maps to size 8,
# and each additional score point adds 4.
top20_hotels["size_bubble"] = (top20_hotels["score"] - top20_hotels["score"].min()) * 4 + 8

# NOTE: lat/lon come from the city join, so hotels of the same city share one
# position on the map.
fig2 = px.scatter_map(
    top20_hotels,
    lat="lat", lon="lon",
    size="size_bubble",
    color="score",
    hover_name="hotel_name",
    hover_data={"city": True, "url": False, "score":":.2f"},
    zoom=5,
    map_style="open-street-map",
    title="🏨 Top 20 Hôtels en France (note Booking)"
)
fig2.show()