# Books to Movie adaptation
----

## Liens utiles 

- Pour avoir les infos sur les films (API TMDB) : https://developer.themoviedb.org/reference/movie-details                        


# 1 - Récupération des films + Infos sur TMDB (environ 7000 films)

In [156]:
import requests 
import pandas as pd  
import time  
from tqdm import tqdm  
import numpy as np

**Memento**:
- genres (id/name)
- imdb_id
- original_title (just in case for the wiki)
- release_date
- runtime
- title
- {"id": 818, "name": "based on novel"}
- https://m.imdb.com/fr/search/title/?keywords=based-on-novel&explore=keywords (35K mais way too much non?)
- A rajouter les nominations si possible https://www.omdbapi.com/ ou a scrapper si besoin

In [30]:
## ======= 1 - Récupération des adaptations de livres + Données disponibles avec TMBD ========== ##

def get_movie_dataset():
    
    
    all_movies = []

    # 1 - Récupération de tous les films basés sur des livres
    movie_ids = []
    url = "https://api.themoviedb.org/3/discover/movie"
    params = {
        'api_key': API_KEY,
        'with_keywords': '818',
        'page': 1
    }
    response = requests.get(url, params=params)
    data = response.json()
    total_pages = data.get('total_pages', 500)  
    total_results = data.get('total_results', 0)
    
    
    for page in tqdm(range(1, total_pages + 1), desc="Page"):
        url = "https://api.themoviedb.org/3/discover/movie"
        params = {
            'api_key': API_KEY,
            'with_keywords': '818',  
            'page': page
        }
        response = requests.get(url, params=params)
        data = response.json()
        
        for movie in data['results']:
            movie_ids.append(movie['id'])
        
        time.sleep(0.25)
    
    # 2 - Pour chaque film, récupérations des infos
    for movie_id in tqdm(movie_ids, desc="Movies"):
        
        
        url = f"https://api.themoviedb.org/3/movie/{movie_id}"
        params = {
            'api_key': API_KEY,
            'append_to_response': 'credits,keywords' # keywords pour l'instant, à retirer si pas trop utile finalement
        }
        response = requests.get(url, params=params)
        details = response.json()

        
        # Crew -> df séparé?
        director = None
        if 'credits' in details and 'crew' in details['credits']:
            for person in details['credits']['crew']:
                if person['job'] == 'Director':
                    director = person['name']
                    break
        actors = [None, None, None]
        if 'credits' in details and 'cast' in details['credits']:
            for i in range(min(3, len(details['credits']['cast']))):
                actors[i] = details['credits']['cast'][i]['name']
        
        
        genres = ', '.join([g['name'] for g in details.get('genres', [])])
        
        keywords_list = []
        if 'keywords' in details and 'keywords' in details['keywords']:
            for kw in details['keywords']['keywords']:
                keywords_list.append(kw['name'])
        keywords = ', '.join(keywords_list)
        
        release_date = details.get('release_date', '')
        year = release_date[:4] if release_date else None
        
        
        movie_data = {
            'tmdb_id': details.get('id'),
            'imdb_id': details.get('imdb_id'),
            'title': details.get('title'),
            'original_title': details.get('original_title'),
            'release_date': release_date,
            'year': year,
            'runtime': details.get('runtime'),
            'budget': details.get('budget', None),
            'revenue': details.get('revenue', None),
            'vote_average': details.get('vote_average', None),
            'vote_count': details.get('vote_count', None),
            'popularity': details.get('popularity', None),
            'director': director,
            'actor_1': actors[0],
            'actor_2': actors[1],
            'actor_3': actors[2],
            'genres': genres,
            'keywords': keywords, 
            'overview': details.get('overview', ''), #résumé du plot mais peut etre pas utile 
            'original_language': details.get('original_language'),
        }
        
        all_movies.append(movie_data)
        
        time.sleep(0.3)
    
    movies_dataset = pd.DataFrame(all_movies).fillna('')
    
    return movies_dataset

In [32]:
movies_df = get_movie_dataset()

Page: 100%|██████████████████████████████████████████████████████████████████████████| 379/379 [01:41<00:00,  3.72it/s]
Movies: 100%|██████████████████████████████████████████████████████████████████████| 7564/7564 [51:33<00:00,  2.45it/s]


In [46]:
# Peut-être comparé les meilleures adaptation selon les pays
print(movies_df[movies_df['original_language'] == 'fr'])


      tmdb_id     imdb_id                                              title  \
16      15383   tt0064040                                    Army of Shadows   
151   1084736  tt26446278                          The Count of Monte Cristo   
194     65496   tt1570970                                   Student Services   
208       266   tt0057345                                           Contempt   
259      4561   tt0073115                                     The Story of O   
...       ...         ...                                                ...   
7457   862095                                             The Hands of Orlac   
7460   858532   tt0897388  Silence de Septembre (Chapitre 2) ou Quelques ...   
7470  1279542                  Sherlock Holmes et l'Aventure du Diamant Bleu   
7527  1440658  tt35933265                       When You Listen to This Song   
7539   989955  tt18378400                           Les raisins de la misère   

                                       

In [None]:
---
##### PB ->  il y a des films avec des auteurs étranges... Peut-être les filtrer
*/!\ Attention prévoir mini une heure pour chaque version !*

In [70]:
#========== VERSION 2 - Uniquement avec les novels explicitement ecrit (environ 6000) =========== #

def get_movie_novel():
    
    
    all_movies = []
    movie_ids = []
    url = "https://api.themoviedb.org/3/discover/movie"
    params = {
        'api_key': API_KEY,
        'with_keywords': '818',
        'page': 1
    }
    response = requests.get(url, params=params)
    data = response.json()
    total_pages = data.get('total_pages', 500)
    total_results = data.get('total_results', 0)
    
    
    for page in tqdm(range(1, total_pages + 1), desc="Page"):
        url = "https://api.themoviedb.org/3/discover/movie"
        params = {
            'api_key': API_KEY,
            'with_keywords': '818',  
            'page': page
        }
        response = requests.get(url, params=params)
        data = response.json()
        
        for movie in data['results']:
            movie_id = movie['id']
            url_details = f"https://api.themoviedb.org/3/movie/{movie_id}"
            params_details = {
                'api_key': API_KEY,
                'append_to_response': 'credits'
            }
            response_details = requests.get(url_details, params=params_details)
            details = response_details.json()
            
            novel_author = None
            if 'credits' in details and 'crew' in details['credits']:
                for person in details['credits']['crew']:
                    if person.get('job') == 'Novel':
                        novel_author = person.get('name')
                        break
            if novel_author:
                movie_ids.append(movie_id)
        
        time.sleep(0.25)
    
    for movie_id in tqdm(movie_ids, desc="Movies"):
        
        
        url = f"https://api.themoviedb.org/3/movie/{movie_id}"
        params = {
            'api_key': API_KEY,
            'append_to_response': 'credits,keywords'
        }
        response = requests.get(url, params=params)
        details = response.json()

        novel_author = None
        if 'credits' in details and 'crew' in details['credits']:
            for person in details['credits']['crew']:
                if person.get('job') == 'Novel':
                    novel_author = person.get('name')
                    break

        director = None
        if 'credits' in details and 'crew' in details['credits']:
            for person in details['credits']['crew']:
                if person['job'] == 'Director':
                    director = person['name']
                    break
        actors = [None, None, None]
        if 'credits' in details and 'cast' in details['credits']:
            for i in range(min(3, len(details['credits']['cast']))):
                actors[i] = details['credits']['cast'][i]['name']
        
        
        genres = ', '.join([g['name'] for g in details.get('genres', [])])
        
        keywords_list = []
        if 'keywords' in details and 'keywords' in details['keywords']:
            for kw in details['keywords']['keywords']:
                keywords_list.append(kw['name'])
        keywords = ', '.join(keywords_list)
        
        release_date = details.get('release_date', '')
        year = release_date[:4] if release_date else None
        
        
        movie_data = {
            'tmdb_id': details.get('id'),
            'imdb_id': details.get('imdb_id'),
            'title': details.get('title'),
            'original_title': details.get('original_title'),
            'release_date': release_date,
            'year': year,
            'runtime': details.get('runtime'),
            'budget': details.get('budget', None),
            'revenue': details.get('revenue', None),
            'vote_average': details.get('vote_average', None),
            'vote_count': details.get('vote_count', None),
            'popularity': details.get('popularity', None),
            'director': director,
            'author': novel_author,
            'actor_1': actors[0],
            'actor_2': actors[1],
            'actor_3': actors[2],
            'genres': genres,
            'keywords': keywords, 
            'overview': details.get('overview', ''),
            'original_language': details.get('original_language'),
        }
        
        all_movies.append(movie_data)
        
        time.sleep(0.3)
    
    movies_dataset = pd.DataFrame(all_movies).fillna('')
    
    return movies_dataset

In [68]:
movies_novel_dataset =  get_movie_novel()

Page: 100%|██████████████████████████████████████████████████████████████████████████| 379/379 [16:38<00:00,  2.63s/it]
Movies: 100%|██████████████████████████████████████████████████████████████████████| 5676/5676 [35:10<00:00,  2.69it/s]


In [None]:
# 2 - Relier avec les oeuvres originales (ISBN si possible)

In [76]:
print(movies_novel_dataset)

      tmdb_id     imdb_id                         title  \
0     1156594  tt33311244                     Our Fault   
1      604079  tt10374610                 The Long Walk   
2     1272166  tt32063098      Ballad of a Small Player   
3     1327862  tt33088452                Regretting You   
4     1062722   tt1312221                  Frankenstein   
...       ...         ...                           ...   
5671  1532630  tt37967616            All in Monte Carlo   
5672  1507429                         What You Do to Me   
5673  1572308  tt30139424          Trampoty pana Humbla   
5674   768226   tt1235191                    Zpovědnice   
5675   470948   tt0132161  The Victims of the East Wind   

                    original_title release_date  year  runtime     budget  \
0                    Culpa nuestra   2025-10-15  2025      112          0   
1                    The Long Walk   2025-09-10  2025      108   20000000   
2         Ballad of a Small Player   2025-10-15  2025      1

In [82]:
## ====== Exemple: auteurs adaptés ===== ###

auteurs = movies_novel_dataset['author'].value_counts()
print(auteurs[auteurs > 2])


author
Charles Dickens    54
Agatha Christie    52
Stephen King       44
Georges Simenon    36
Jules Verne        31
                   ..
Milo Urban          3
T. Svatopluk        3
Rao Xueman          3
Ryo Asai            3
Thomas Hughes       3
Name: count, Length: 384, dtype: int64


In [122]:
# ====== Exemples : livres de Stephen King ======= #

hawking_movies = movies_novel_dataset[movies_novel_dataset['author'].str.contains("Stephen King")]
print(hawking_movies[['title', 'year', 'director', 'vote_average']])

                              title  year              director  vote_average
1                     The Long Walk  2025      Francis Lawrence         7.000
35                   It Chapter Two  2019       Andy Muschietti         6.800
37                               It  2017       Andy Muschietti         7.237
40         The Shawshank Redemption  1994        Frank Darabont         8.700
66                      The Shining  1980       Stanley Kubrick         8.200
92                    The Dead Zone  1983      David Cronenberg         7.012
96                   The Green Mile  1999        Frank Darabont         8.502
97                  The Running Man  2025          Edgar Wright         0.000
144                          Carrie  1976        Brian De Palma         7.300
151                    Pet Sematary  1989          Mary Lambert         6.635
234                          Misery  1990            Rob Reiner         7.745
248                          Carrie  2013       Kimberly Peirce 

In [108]:
## ==== SAVE LES DF ==== #

movies_df.to_csv("movies_dataset.csv")
movies_novel_dataset.to_csv("movies_novel_dataset.csv")

# 2 - Lier les films aux livres

In [161]:

def google_books_data(df):
    df = df.copy()  

  
    for col in ['isbn', 'publishedDate', 'pageCount', 'averageRating', 'ratingsCount', 'categories', 'language']:
        df[col] = None

    for i, row in tqdm(df.iterrows(), total=len(df), desc="Book"):
        query = f"{row['title']} {row['author']}"
        url = f"https://www.googleapis.com/books/v1/volumes?q={query}"
        response = requests.get(url)
        data = response.json()

        if 'items' in data and len(data['items']) > 0:
            volume = data['items'][0]['volumeInfo']

            

In [162]:
hawking_movies_updated = google_books_data(hawking_movies)

Book: 100%|████████████████████████████████████████████████████████████████████████████| 44/44 [00:40<00:00,  1.08it/s]


In [150]:
print(hawking_movies_updated)
# trop de livres à None niveaux ratings... 

      tmdb_id     imdb_id                          title  \
1      604079  tt10374610                  The Long Walk   
35     474350   tt7349950                 It Chapter Two   
37     346364   tt1396484                             It   
40        278   tt0111161       The Shawshank Redemption   
66        694   tt0081505                    The Shining   
92      11336   tt0085407                  The Dead Zone   
96        497   tt0120689                 The Green Mile   
97     798645  tt14107334                The Running Man   
144      7340   tt0074285                         Carrie   
151      8913   tt0098084                   Pet Sematary   
234      1700   tt0100157                         Misery   
248    133805   tt1939659                         Carrie   
260      5876   tt0884328                       The Mist   
295    157433   tt0837563                   Pet Sematary   
296      6171   tt0285531                   Dreamcatcher   
304    501170   tt5606664               