In [None]:
!echo "TMDB_API_KEY=3cfa35042af236cd2d3d3ad121a6e865" > .env

In [None]:
!cat .env

TMDB_API_KEY=3cfa35042af236cd2d3d3ad121a6e865


In [None]:
!pip install dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.0


In [None]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery, storage

# Charger la clé API de manière sécurisée
load_dotenv(".env")
API_KEY = os.getenv("TMDB_API_KEY")

# Variables GCP à jour
bucket_name = "tmdb-data-bucket"
bq_project = "tmdb-elt-project-460312"
dataset_raw = "tmdb_raw"
dataset_clean = "tmdb_clean"
table_raw = f"{bq_project}.{dataset_raw}.movies"
table_clean = f"{bq_project}.{dataset_clean}.movies"
gcs_path = f"gs://{bucket_name}/movies_raw.csv"

# Étapes d’initialisation (création si besoin)
def create_bucket_if_not_exists(bucket_name, location="EU"):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket = storage_client.create_bucket(bucket_name, location=location)
        print(f"Bucket '{bucket_name}' créé.")
    else:
        print(f"Bucket '{bucket_name}' déjà existant.")

def create_dataset_if_not_exists(dataset_id):
    client = bigquery.Client()
    try:
        client.get_dataset(dataset_id)
        print(f"Dataset '{dataset_id}' déjà existant.")
    except Exception:
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "EU"
        client.create_dataset(dataset)
        print(f"Dataset '{dataset_id}' créé.")

# Pipeline complet
def run_pipeline():
    create_bucket_if_not_exists(bucket_name)
    create_dataset_if_not_exists(f"{bq_project}.{dataset_raw}")
    create_dataset_if_not_exists(f"{bq_project}.{dataset_clean}")

    print("1) Extraction depuis TMDB API...")
    BASE_URL = "https://api.themoviedb.org/3/movie/popular"
    all_movies = []
    for page in range(1, 6):
        url = f"{BASE_URL}?api_key={API_KEY}&language=en-US&page={page}"
        res = requests.get(url)
        if res.status_code == 200:
            all_movies.extend(res.json()["results"])
    df = pd.DataFrame(all_movies)
    df.to_csv("movies_raw.csv", index=False)
    print("Fichier CSV généré")

    print("2) Upload vers GCS...")
    os.system(f"gsutil cp movies_raw.csv {gcs_path}")
    print("OK! Upload GCS terminé")

    print("3) Load brut dans BigQuery...")
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True
    )
    load_job = client.load_table_from_uri(gcs_path, table_raw, job_config=job_config)
    load_job.result()
    print("Données chargées dans BigQuery (brutes)")

    print("4) Transformation SQL...")
    query = f"""
CREATE OR REPLACE TABLE `{table_clean}` AS
SELECT
  id,
  title,
  release_date,
  vote_average,
  vote_count,
  genre_ids,
  popularity,
  original_language,
  backdrop_path,
  CONCAT('https://image.tmdb.org/t/p/w500', poster_path) AS poster_path,
  overview
FROM `{table_raw}`
WHERE release_date IS NOT NULL
"""



    client.query(query).result()
    print(" Pipeline ELT terminé avec succès.")


run_pipeline()


Bucket 'tmdb-data-bucket' déjà existant.
Dataset 'tmdb-elt-project-460312.tmdb_raw' déjà existant.
Dataset 'tmdb-elt-project-460312.tmdb_clean' déjà existant.
1) Extraction depuis TMDB API...
Fichier CSV généré
2) Upload vers GCS...
OK! Upload GCS terminé
3) Load brut dans BigQuery...
Données chargées dans BigQuery (brutes)
4) Transformation SQL...
 Pipeline ELT terminé avec succès.


In [None]:
pd.read_csv("movies_raw.csv").head()


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/2Nti3gYAX513wvhp8IiLL6ZDyOm.jpg,"[10751, 35, 12, 14]",950387,en,A Minecraft Movie,Four misfits find themselves struggling with o...,899.2165,/yFHHfHcUgGAxziP1C3lLt0q2T4s.jpg,2025-03-31,A Minecraft Movie,False,6.511,1280
1,False,/j0NUh5irX7q2jIRtbLo8TZyRn6y.jpg,"[27, 9648]",574475,en,Final Destination Bloodlines,"Plagued by a violent recurring nightmare, coll...",503.2304,/6WxhEvFsauuACfv8HyoVX6mZKFj.jpg,2025-05-09,Final Destination Bloodlines,False,7.152,240
2,False,/fTrQsdMS2MUw00RnzH0r3JWHhts.jpg,"[28, 80, 53]",1197306,en,A Working Man,Levon Cade left behind a decorated military ca...,365.7532,/6FRFIogh3zFnVWn7Z6zcYnIbRcX.jpg,2025-03-26,A Working Man,False,6.586,827
3,False,/cJvUJEEQ86LSjl4gFLkYpdCJC96.jpg,"[10752, 28]",1241436,en,Warfare,A platoon of Navy SEALs embarks on a dangerous...,340.5692,/srj9rYrjefyWqkLc6l2xjTGeBGO.jpg,2025-04-09,Warfare,False,7.158,367
4,False,/tyfO9jHgkhypUFizRVYD0bytPjP.jpg,"[10751, 14]",447273,en,Snow White,"Following the benevolent King's disappearance,...",375.4531,/oLxWocqheC8XbXbxqJ3x422j9PW.jpg,2025-03-12,Snow White,False,4.335,869


In [None]:
!gsutil ls gs://tmdb-data-bucket/


gs://tmdb-data-bucket/movies_raw.csv


In [None]:
client = bigquery.Client()
df_raw = client.query(f"SELECT * FROM `{table_raw}` LIMIT 10").to_dataframe()
df_raw

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/oz4U9eA6ilYf1tyiVuGmkftdLac.jpg,"[10749, 18]",1010581,es,Culpa mía,"Noah must leave her city, boyfriend, and frien...",60.4883,/w46Vw536HwNnEzOa7J24YH9DPRS.jpg,2023-06-08,My Fault,False,7.811,3684
1,False,/oz4U9eA6ilYf1tyiVuGmkftdLac.jpg,"[10749, 18]",1010581,es,Culpa mía,"Noah must leave her city, boyfriend, and frien...",60.4883,/w46Vw536HwNnEzOa7J24YH9DPRS.jpg,2023-06-08,My Fault,False,7.811,3684
2,False,/oz4U9eA6ilYf1tyiVuGmkftdLac.jpg,"[10749, 18]",1010581,es,Culpa mía,"Noah must leave her city, boyfriend, and frien...",60.4883,/w46Vw536HwNnEzOa7J24YH9DPRS.jpg,2023-06-08,My Fault,False,7.811,3684
3,False,/oz4U9eA6ilYf1tyiVuGmkftdLac.jpg,"[10749, 18]",1010581,es,Culpa mía,"Noah must leave her city, boyfriend, and frien...",60.4883,/w46Vw536HwNnEzOa7J24YH9DPRS.jpg,2023-06-08,My Fault,False,7.811,3684
4,False,/oz4U9eA6ilYf1tyiVuGmkftdLac.jpg,"[10749, 18]",1010581,es,Culpa mía,"Noah must leave her city, boyfriend, and frien...",60.4883,/w46Vw536HwNnEzOa7J24YH9DPRS.jpg,2023-06-08,My Fault,False,7.811,3684
5,False,/9qYD7pAOEdYBoQblv5VAMMSVAC8.jpg,"[10749, 18]",1078600,en,Marked Men: Rule + Shaw,Shaw Landon has loved Rule Archer from the mom...,51.8838,/gZQU8BtCH2IbMZq6QETiU4HXQSR.jpg,2025-01-22,Marked Men: Rule + Shaw,False,6.276,67
6,False,/9qYD7pAOEdYBoQblv5VAMMSVAC8.jpg,"[10749, 18]",1078600,en,Marked Men: Rule + Shaw,Shaw Landon has loved Rule Archer from the mom...,51.8838,/gZQU8BtCH2IbMZq6QETiU4HXQSR.jpg,2025-01-22,Marked Men: Rule + Shaw,False,6.276,67
7,False,/9qYD7pAOEdYBoQblv5VAMMSVAC8.jpg,"[10749, 18]",1078600,en,Marked Men: Rule + Shaw,Shaw Landon has loved Rule Archer from the mom...,51.8838,/gZQU8BtCH2IbMZq6QETiU4HXQSR.jpg,2025-01-22,Marked Men: Rule + Shaw,False,6.276,67
8,False,/9qYD7pAOEdYBoQblv5VAMMSVAC8.jpg,"[10749, 18]",1078600,en,Marked Men: Rule + Shaw,Shaw Landon has loved Rule Archer from the mom...,51.8838,/gZQU8BtCH2IbMZq6QETiU4HXQSR.jpg,2025-01-22,Marked Men: Rule + Shaw,False,6.276,67
9,False,/9qYD7pAOEdYBoQblv5VAMMSVAC8.jpg,"[10749, 18]",1078600,en,Marked Men: Rule + Shaw,Shaw Landon has loved Rule Archer from the mom...,51.8838,/gZQU8BtCH2IbMZq6QETiU4HXQSR.jpg,2025-01-22,Marked Men: Rule + Shaw,False,6.276,67


In [None]:
def orchestrate_pipeline():
    print("Début de l'orchestration ELT")
    try:
        create_bucket_if_not_exists(bucket_name)
        create_dataset_if_not_exists(f"{bq_project}.{dataset_raw}")
        create_dataset_if_not_exists(f"{bq_project}.{dataset_clean}")
        run_pipeline()
        print("Orchestration terminée avec succès")
    except Exception as e:
        print("Orchestration échouée :", e)

# Lancer la version orchestrée
orchestrate_pipeline()


Début de l'orchestration ELT
Bucket 'tmdb-data-bucket' déjà existant.
Dataset 'tmdb-elt-project-460312.tmdb_raw' déjà existant.
Dataset 'tmdb-elt-project-460312.tmdb_clean' déjà existant.
Bucket 'tmdb-data-bucket' déjà existant.
Dataset 'tmdb-elt-project-460312.tmdb_raw' déjà existant.
Dataset 'tmdb-elt-project-460312.tmdb_clean' déjà existant.
1) Extraction depuis TMDB API...
Fichier CSV généré
2) Upload vers GCS...
OK! Upload GCS terminé
3) Load brut dans BigQuery...
Données chargées dans BigQuery (brutes)
4) Transformation SQL...
 Pipeline ELT terminé avec succès.
Orchestration terminée avec succès


In [None]:
updated_code = '''
import os
import requests
import pandas as pd
from google.cloud import bigquery, storage

def orchestrate_pipeline():
    bucket_name = "tmdb-data-bucket"
    bq_project = "tmdb-elt-project-460312"
    dataset_raw = "tmdb_raw"
    dataset_clean = "tmdb_clean"
    table_raw = f"{bq_project}.{dataset_raw}.movies"
    table_clean = f"{bq_project}.{dataset_clean}.movies"
    gcs_path = f"gs://{bucket_name}/movies_raw.csv"

    client = bigquery.Client()
    storage_client = storage.Client()

    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        storage_client.create_bucket(bucket, location="EU")

    for dataset in [dataset_raw, dataset_clean]:
        dataset_id = f"{bq_project}.{dataset}"
        try:
            client.get_dataset(dataset_id)
        except:
            ds = bigquery.Dataset(dataset_id)
            ds.location = "EU"
            client.create_dataset(ds)

    all_movies = []
    for page in range(1, 6):
        url = f"https://api.themoviedb.org/3/movie/popular?api_key={os.environ.get('TMDB_API_KEY')}&language=en-US&page={page}"
        res = requests.get(url)
        if res.status_code == 200:
            all_movies.extend(res.json()["results"])
    df = pd.DataFrame(all_movies)
    df.to_csv("/tmp/movies_raw.csv", index=False)

    os.system(f"gsutil cp /tmp/movies_raw.csv {gcs_path}")

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True
    )
    client.load_table_from_uri(gcs_path, table_raw, job_config=job_config).result()

    query = f\"\"\"
    CREATE OR REPLACE TABLE `{table_clean}` AS
    SELECT
      id,
      title,
      release_date,
      vote_average,
      vote_count,
      popularity,
      genre_ids,
      original_language,
      CONCAT('https://image.tmdb.org/t/p/w780', backdrop_path) AS backdrop_path,
      CONCAT('https://image.tmdb.org/t/p/w500', poster_path) AS poster_path,
      overview
    FROM `{table_raw}`
    WHERE release_date IS NOT NULL
    \"\"\"
    client.query(query).result()
    return "Pipeline terminé."

def main(request):
    return orchestrate_pipeline()
'''

with open("/tmp/function_src/main.py", "w") as f:
    f.write(updated_code)


In [12]:
# 🔁 Recharger les films depuis l'API TMDB (5 pages)
all_movies = []
for page in range(1, 6):
    url = f"https://api.themoviedb.org/3/movie/popular?api_key={os.environ.get('TMDB_API_KEY')}&language=en-US&page={page}"
    res = requests.get(url)
    if res.status_code == 200:
        all_movies.extend(res.json()["results"])

df = pd.DataFrame(all_movies)


In [13]:
import requests
import pandas as pd
from google.cloud import bigquery
import os

client = bigquery.Client()

# 1️⃣ Charger la liste officielle des genres depuis l'API TMDB
genre_url = f"https://api.themoviedb.org/3/genre/movie/list?api_key={os.environ.get('TMDB_API_KEY')}&language=en-US"
res = requests.get(genre_url)
genres_list = res.json().get("genres", [])

df_genres = pd.DataFrame(genres_list)
df_genres.columns = ["genre_id", "genre_name"]

# 💾 Sauvegarde + upload dans GCS
df_genres.to_csv("/tmp/dim_genres.csv", index=False)
os.system("gsutil cp /tmp/dim_genres.csv gs://tmdb-data-bucket/dim_genres.csv")

# 🛢️ Chargement dans BigQuery
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True
)

client.load_table_from_uri(
    "gs://tmdb-data-bucket/dim_genres.csv",
    "tmdb-elt-project-460312.tmdb_clean.dim_genres",
    job_config=job_config
).result()
print("✅ Table dim_genres chargée")

# 2️⃣ Créer la table de relation film / genre depuis le DataFrame déjà en mémoire
movie_genres = df[['id', 'genre_ids']].explode('genre_ids').dropna()
movie_genres.columns = ['movie_id', 'genre_id']
movie_genres['genre_id'] = movie_genres['genre_id'].astype(int)

# 💾 Sauvegarde + upload dans GCS
movie_genres.to_csv("/tmp/movie_genres.csv", index=False)
os.system("gsutil cp /tmp/movie_genres.csv gs://tmdb-data-bucket/movie_genres.csv")

# 🛢️ Chargement dans BigQuery
client.load_table_from_uri(
    "gs://tmdb-data-bucket/movie_genres.csv",
    "tmdb-elt-project-460312.tmdb_clean.movie_genres",
    job_config=job_config
).result()
print("✅ Table movie_genres chargée")


✅ Table dim_genres chargée
✅ Table movie_genres chargée


In [None]:
!gcloud functions deploy orchestrate_pipeline \
  --entry-point main \
  --runtime python310 \
  --trigger-http \
  --allow-unauthenticated \
  --source=/tmp/function_src \
  --region=europe-west1 \
  --project=tmdb-elt-project-460312 \
  --set-env-vars TMDB_API_KEY=3cfa35042af236cd2d3d3ad121a6e865
