# Get latest movie data using moviedb api
this guide will give you a starting point on how to fetch data from moviedb, process that data and save as Pandas DataFrame

<br>

## Steps :
1. follow this [guide](https://developers.themoviedb.org/3) and get your own MovieDB api.
2. in cell 2, configure with the api key you got and language and number of movies for this language
3. run all the cells, you will have dataframes saved as csv.

<br>

## Notes :
- currently there is some biased towards popular movies but for learning purposes it should be ok.
- read the MoviDB [docs](https://developers.themoviedb.org/3) and fetch extra datas you need.

In [8]:
import requests
import pandas as pd
import sys
import threading, logging

## language_count object :
1. find ISO-639-1  [codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for required languages
2. int the key is the laguage and value is the number of movies of the same laguage to be downloaded

``` 
language_count = {
  ...,
  'language' :number_of_movies,
  ...
}
```

In [9]:
api_key = '<moviedb_apikey>'
language_count = {
    'en':7000,
    'hi':2000,
    'bn':1000,
}

In [10]:
def get_movies(lang, freq):
  url = 'https://api.themoviedb.org/3/movie/popular?api_key={api_key}&with_original_language={lang}'.format(api_key=api_key,lang=lang)
  # print(url)
  movies = []
  page = 1
  progress = 0
  while movies.__len__()<freq:
    try:
        res = requests.get(url+"&page="+str(page))
    except:
        raise ('not connected to internet or movidb issue')

    if res.status_code != 200:
        print ('error')
        return []

    res = res.json()
    
    if 'errors' in res.keys():
      print('api error !!!')
      return movies

    movies = movies + res['results']

    if progress != round(len(movies)/freq*100):
      progress = round(len(movies)/freq*100)
      if progress%5==0:
        print( progress, end="%, ")
        
    page = page + 1
    # break
    # print(res)
  return movies

In [11]:
all_movies = []

for key in language_count:
  # print(key,language_count[key])
  print("Downloading ", key, end=" : ")
  movies = get_movies(key,language_count[key])
  all_movies = all_movies + movies
  print('Total movies found : ', movies.__len__())
  # break

Downloading  en : 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, Total movies found :  7000
Downloading  hi : 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, Total movies found :  2000
Downloading  bn : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, Total movies found :  1000


In [12]:
df = pd.DataFrame(all_movies, columns=['genre_ids', 'id', 'original_language',
       'overview', 'popularity', 'release_date', 'title', 'vote_average', 'vote_count'])
df.to_csv('movies_dataset.csv', index=False)
df.to_pickle('movies_dataset.pk',)
df

Unnamed: 0,genre_ids,id,original_language,overview,popularity,release_date,title,vote_average,vote_count
0,"[878, 28, 12]",580489,en,After finding a host body in investigative rep...,6972.003,2021-09-30,Venom: Let There Be Carnage,7.2,4500
1,"[28, 35, 80, 53]",512195,en,An Interpol-issued Red Notice is a global aler...,4496.760,2021-11-04,Red Notice,6.8,2016
2,"[28, 12, 878, 14]",634649,en,Peter Parker is unmasked and no longer able to...,4062.446,2021-12-15,Spider-Man: No Way Home,8.4,84
3,"[16, 35, 10751]",585245,en,As Emily struggles to fit in at home and at sc...,2509.512,2021-11-10,Clifford the Big Red Dog,7.6,568
4,"[28, 12, 14]",566525,en,Shang-Chi must confront the past he thought he...,2453.799,2021-09-01,Shang-Chi and the Legend of the Ten Rings,7.8,4244
...,...,...,...,...,...,...,...,...,...
9995,[],658466,bn,,0.600,1992-01-01,Bedeneer Prem,0.0,0
9996,[18],658463,bn,,0.600,1996-01-01,Bhai Amar Bhai,0.0,0
9997,[],658443,bn,,0.600,1977-01-01,Barbadhu,0.0,0
9998,[10749],658383,bn,,0.600,2003-01-01,Andha Prem,0.0,0


In [14]:
ids = df['id'].tolist()

In [15]:
def get_credits(ids):
  total_len = len(ids)
  progress = 0
  credits = []

  def get_credit(id):
    url = 'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}'.format(api_key=api_key,movie_id=id)
    try:
        res = requests.get(url)
    except:
        raise ('not connected to internet or movidb issue')

    if res.status_code != 200:
        print ('error')
        return []

    res = res.json()
    
    if 'errors' in res.keys():
      print('api error !!!')
      return credits
    # print(res)
    credits.append(res)

  threads = list()
  for id in ids:
    x = threading.Thread(target=get_credit,args=(id,))
    x.start()
  for index, thread in enumerate(threads):
      # logging.info("Main    : before joining thread %d.", index)
      thread.join()    
      progress = progress + 1
      if round(len(progress)/total_len*100)%5 == 0:
        print( progress, end=", ")

  return credits

movie_credits = get_credits(ids)

new_movie_credits = {'cast':[],'crew':[]}
for movie_credit in movie_credits:
  
  new_movie_credits['cast'].append( {'id' : movie_credit['id'], 'cast' :[]} )
  for credit in movie_credit['cast']:
    new_movie_credits['cast'][ -1 ][ 'cast' ].append({
        'cast_id': credit['id'],
        'name': credit['name'],
        'character': credit['character'],
    })
    # break
  
  new_movie_credits['crew'].append( {'id' : movie_credit['id'], 'crew' :[]} )
  for crew in movie_credit['crew']:
    new_movie_credits['crew'][ -1 ][ 'crew' ].append({
        'crew_id': crew['id'],
        'name': crew['name'],
        'department': crew['department'],
    })
  # break

cast_df = pd.DataFrame(new_movie_credits['cast'])
crew_df = pd.DataFrame(new_movie_credits['crew'])

cast_df.to_csv('cast_dataset.csv', index=False)
crew_df.to_csv('crew_dataset.csv', index=False)
    

In [16]:
cast_df

Unnamed: 0,id,cast
0,580489,"[{'cast_id': 2524, 'name': 'Tom Hardy', 'chara..."
1,617653,"[{'cast_id': 1892, 'name': 'Matt Damon', 'char..."
2,566525,"[{'cast_id': 1489211, 'name': 'Simu Liu', 'cha..."
3,568124,"[{'cast_id': 968367, 'name': 'Stephanie Beatri..."
4,774741,"[{'cast_id': 2239159, 'name': 'Brady Noon', 'c..."
...,...,...
9995,658496,"[{'cast_id': 2491761, 'name': 'Bimal Chakrabor..."
9996,658443,"[{'cast_id': 114853, 'name': 'Samit Bhanja', '..."
9997,658235,"[{'cast_id': 585404, 'name': 'Mukesh Rishi', '..."
9998,658383,"[{'cast_id': 150408, 'name': 'Prosenjit Chatte..."


In [17]:
crew_df

Unnamed: 0,id,crew
0,580489,"[{'crew_id': 149, 'name': 'Robert Richardson',..."
1,617653,"[{'crew_id': 120, 'name': 'Dariusz Wolski', 'd..."
2,566525,"[{'crew_id': 7232, 'name': 'Sarah Halley Finn'..."
3,568124,"[{'crew_id': 8159, 'name': 'Shannon Mills', 'd..."
4,774741,"[{'crew_id': 32608, 'name': 'Brad Simpson', 'd..."
...,...,...
9995,658496,"[{'crew_id': 1658227, 'name': 'Sekhar Das', 'd..."
9996,658443,"[{'crew_id': 2491583, 'name': 'Bijoy Chatterje..."
9997,658235,"[{'crew_id': 2491093, 'name': 'Rabindra Pradha..."
9998,658383,"[{'crew_id': 2488664, 'name': 'Narayan Chatter..."


In [20]:
def get_keywords(ids):
  total_len = len(ids)
  progress = 0
  keywords = []

  def get_keyword(id):
    url = 'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}'.format(api_key=api_key,movie_id=id)
    try:
        res = requests.get(url)
    except:
        raise ('not connected to internet or movidb issue')

    if res.status_code != 200:
        print ('error')
        return []

    res = res.json()
    
    if 'errors' in res.keys():
      print('api error !!!')
      return keywords
    # print(res)
    keywords.append(res)

    # for id in ids:
    #   get_cred

  threads = list()
  for id in ids:
    x = threading.Thread(target=get_keyword,args=(id,))
    x.start()
  for index, thread in enumerate(threads):
    logging.info("Main    : before joining thread %d.", index)
    thread.join()
  
    progress = progress + 1
    if round(len(progress)/total_len*100)%5 == 0:
      print( progress, end=", ")

  return keywords


movie_keywords = get_keywords(ids)
    

In [22]:
keywords_df = pd.DataFrame(movie_keywords)
keywords_df.to_csv('keywords_dataframe.csv')
keywords_df

Unnamed: 0,id,keywords
0,617653,"[{'id': 818, 'name': 'based on novel or book'}..."
1,580489,"[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n..."
2,568124,"[{'id': 4344, 'name': 'musical'}]"
3,762433,"[{'id': 588, 'name': 'rome, italy'}, {'id': 94..."
4,566525,"[{'id': 779, 'name': 'martial arts'}, {'id': 9..."
...,...,...
9995,658496,[]
9996,658383,[]
9997,658616,[]
9998,658443,[]
