# Get latest movie data using moviedb api
this guide will give you a starting point on how to fetch data from moviedb, process that data and save as Pandas DataFrame

<br>

## Steps :
1. follow this [guide](https://developers.themoviedb.org/3) and get your own MovieDB api.
2. in cell 2, configure with the api key you got and language and number of movies for this language
3. run all the cells, you will have dataframes saved as csv.

<br>

## Notes :
- currently there is some biased towards popular movies but for learning purposes it should be ok.
- read the MoviDB [docs](https://developers.themoviedb.org/3) and fetch extra datas you need.

In [11]:
import requests
import pandas as pd
import sys
import threading, logging

## language_count object :
1. find ISO-639-1  [codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for required languages
2. int the key is the laguage and value is the number of movies of the same laguage to be downloaded

``` 
language_count = {
  ...,
  'language' :number_of_movies,
  ...
}
```

In [20]:
# uncomment next line and add your tmdb api key
import os
api_key = os.getenv("api_key")
language_count = {
    'en': 10000,
    # 'hi':2000,
    # 'bn':1000,
}

In [3]:
def get_movies(lang, freq):
  url = 'https://api.themoviedb.org/3/movie/popular?api_key={api_key}&with_original_language={lang}'.format(api_key=api_key,lang=lang)
  # print(url)
  movies = []
  page = 1
  progress = 0
  while movies.__len__()<freq:
    try:
        res = requests.get(url+"&page="+str(page))
    except:
        raise ('not connected to internet or movidb issue')

    if res.status_code != 200:
        print ('error')
        return []

    res = res.json()
    
    if 'errors' in res.keys():
      print('api error !!!')
      return movies

    movies = movies + res['results']

    if progress != round(len(movies)/freq*100):
      progress = round(len(movies)/freq*100)
      if progress%5==0:
        print( progress, end="%, ")
        
    page = page + 1
    # break
    # print(res)
  return movies

In [4]:
all_movies = []

for key in language_count:
  # print(key,language_count[key])
  print("Downloading ", key, end=" : ")
  movies = get_movies(key,language_count[key])
  all_movies = all_movies + movies
  print('Total movies found : ', movies.__len__())
  # break

Downloading  en : 5%, 10%, 15%, 20%, 25%, 30%, 35%, 40%, 45%, 50%, 55%, 60%, 65%, 70%, 75%, 80%, 85%, 90%, 95%, 100%, Total movies found :  10000


In [6]:
df = pd.DataFrame(all_movies, columns=['genre_ids', 'id', 'original_language',
       'overview', 'popularity', 'release_date', 'title', 'vote_average', 'vote_count'])
# df.to_csv('movies_dataset.csv', index=False)
df.to_pickle('../input/movies_dataset.pk',)
df

Unnamed: 0,genre_ids,id,original_language,overview,popularity,release_date,title,vote_average,vote_count
0,"[18, 36]",872585,en,The story of J. Robert Oppenheimer's role in t...,2718.643,2023-07-19,Oppenheimer,8.192,4663
1,"[27, 9648]",507089,en,"Recently fired and desperate for work, a troub...",1859.189,2023-10-25,Five Nights at Freddy's,7.933,2352
2,"[878, 28, 53]",670292,en,Amid a future war between the human race and t...,2234.144,2023-09-27,The Creator,7.229,883
3,"[28, 12, 53]",299054,en,Armed with every weapon they can get their han...,1294.274,2023-09-15,Expend4bles,6.417,762
4,"[28, 53]",575264,en,Ethan Hunt and his IMF team embark on their mo...,1144.924,2023-07-08,Mission: Impossible - Dead Reckoning Part One,7.591,2470
...,...,...,...,...,...,...,...,...,...
9995,"[12, 28, 27]",43149,en,Beleaguered adventurer Carl Denham returns to ...,11.018,1933-12-22,The Son of Kong,5.476,105
9996,"[35, 10749]",39037,en,Ambitious young Manhattanite and urban conserv...,12.469,2009-04-25,The Good Guy,5.626,143
9997,"[10749, 35, 18]",84306,en,"Newly single, 35, and uninspired by his job, J...",13.770,2012-09-14,Liberal Arts,6.456,492
9998,"[9648, 53, 18, 27]",335791,en,A surrogate mother harbors a deadly secret des...,13.523,2016-09-09,When the Bough Breaks,5.844,368


In [14]:
ids = df['id'].tolist()

In [16]:
def get_credits(ids):
  total_len = len(ids)
  progress = 0
  done_items = 0
  credits = []

  def get_credit(id):
    url = 'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}'.format(api_key=api_key,movie_id=id)
    try:
        res = requests.get(url)
    except:
        raise ('not connected to internet or movidb issue')

    if res.status_code != 200:
        print ('error')
        return []

    res = res.json()
    
    if 'errors' in res.keys():
      print('api error !!!')
      return credits
    # print(res)
    credits.append(res)

  threads = list()
  for id in ids:
    x = threading.Thread(target=get_credit,args=(id,))
    threads.append(x)
    x.start()
  for index, thread in enumerate(threads):
      # logging.info("Main    : before joining thread %d.", index)
      thread.join()    

  return credits

In [None]:
movie_credits = get_credits(ids)

new_movie_credits = {'cast':[],'crew':[]}
for movie_credit in movie_credits:

  new_movie_credits['cast'].append( {'id' : movie_credit['id'], 'cast' :[]} )
  for credit in movie_credit['cast']:
    new_movie_credits['cast'][ -1 ][ 'cast' ].append({
        'cast_id': credit['id'],
        'name': credit['name'],
        'character': credit['character'],
    })
    # break

  new_movie_credits['crew'].append( {'id' : movie_credit['id'], 'crew' :[]} )
  for crew in movie_credit['crew']:
    new_movie_credits['crew'][ -1 ][ 'crew' ].append({
        'crew_id': crew['id'],
        'name': crew['name'],
        'department': crew['department'],
    })
  # break

cast_df = pd.DataFrame(new_movie_credits['cast'])
crew_df = pd.DataFrame(new_movie_credits['crew'])

cast_df.to_pickle('../input/cast_dataset.pk',)
crew_df.to_pickle('../input/crew_dataset.pk',)


errorerror

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
errorerror

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
errorerror
error

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
errorerror
error

error
error
error
error
error
error
error
error
error
error
erro

KeyboardInterrupt: 

errorerror
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
erro

error
error
error
error
error
error
error
error
error
error
error
error


In [9]:
cast_df

Unnamed: 0,id,cast
0,522402,"[{'cast_id': 31, 'name': 'Tom Hanks', 'charact..."
1,512195,"[{'cast_id': 18918, 'name': 'Dwayne Johnson', ..."
2,585245,"[{'cast_id': 1696017, 'name': 'Darby Camp', 'c..."
3,634649,"[{'cast_id': 1136406, 'name': 'Tom Holland', '..."
4,508943,"[{'cast_id': 1277188, 'name': 'Jacob Tremblay'..."
...,...,...
9995,658463,"[{'cast_id': 150408, 'name': 'Prosenjit Chatte..."
9996,659549,"[{'cast_id': 128709, 'name': 'Sabyasachi Chakr..."
9997,658466,"[{'cast_id': 998621, 'name': 'Anju Ghosh', 'ch..."
9998,659553,"[{'cast_id': 2491678, 'name': 'Chiranjeet', 'c..."


In [10]:
crew_df

Unnamed: 0,id,crew
0,522402,"[{'crew_id': 24, 'name': 'Robert Zemeckis', 'd..."
1,512195,"[{'crew_id': 9543, 'name': 'Beau Flynn', 'depa..."
2,585245,"[{'crew_id': 8220, 'name': 'Naomi Shohan', 'de..."
3,634649,"[{'crew_id': 2519, 'name': 'Sanja Milkovic Hay..."
4,508943,"[{'crew_id': 1259, 'name': 'Ennio Morricone', ..."
...,...,...
9995,658463,"[{'crew_id': 1143635, 'name': 'Shrikant Mohta'..."
9996,659549,"[{'crew_id': 2492553, 'name': 'Sanat Dutta', '..."
9997,658466,"[{'crew_id': 2342446, 'name': 'Swapan Saha', '..."
9998,659553,"[{'crew_id': 2342446, 'name': 'Swapan Saha', '..."


In [11]:
def get_keywords(ids):
  total_len = len(ids)
  progress = 0
  keywords = []

  def get_keyword(id):
    url = 'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}'.format(api_key=api_key,movie_id=id)
    try:
        res = requests.get(url)
    except:
        raise ('not connected to internet or movidb issue')

    if res.status_code != 200:
        print ('error')
        return []

    res = res.json()
    
    if 'errors' in res.keys():
      print('api error !!!')
      return keywords
    # print(res)
    keywords.append(res)

    # for id in ids:
    #   get_cred

  threads = list()
  for id in ids:
    x = threading.Thread(target=get_keyword,args=(id,))
    threads.append(x)
    x.start()
  for index, thread in enumerate(threads):
    logging.info("Main    : before joining thread %d.", index)
    thread.join()

  return keywords


movie_keywords = get_keywords(ids)
    

In [12]:
len(movie_keywords)

10000

In [13]:
keywords_df = pd.DataFrame(movie_keywords)
keywords_df.to_csv('keywords_dataframe.csv')
keywords_df

Unnamed: 0,id,keywords
0,1930,"[{'id': 697, 'name': 'loss of loved one'}, {'i..."
1,585245,"[{'id': 818, 'name': 'based on novel or book'}..."
2,550988,"[{'id': 282, 'name': 'video game'}, {'id': 141..."
3,580489,"[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n..."
4,634649,"[{'id': 1701, 'name': 'hero'}, {'id': 5451, 'n..."
...,...,...
9995,658463,[]
9996,660079,[]
9997,659552,[]
9998,659136,[]


## Approximate time taken (in Google Colab)
- get_movies()   ~6mins
- get_credits()  ~3mins
- get_keywords() ~3mins
- **Total**      ~12mins