-
Notifications
You must be signed in to change notification settings - Fork 3
/
movies_api_credits.py
78 lines (65 loc) · 4.24 KB
/
movies_api_credits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#NB: cast.csv and crew.csv are read in on every execution, including the very first one, so both
#files must already exist in their final column format. If starting for the first time, run for
#just the first title and write the result to CSV before doing the full run.
#Before running: add the API key to API_key and fill in the file paths in the pd.read_csv and to_csv calls
import requests
import json
import numpy as np
import pandas as pd
#TMDB API key (fill in before running)
API_key = ''

#Load the output of earlier runs
df = pd.read_csv(r'\lb_tmdb.csv')
cast_full = pd.read_csv(r'\cast.csv')
crew_full = pd.read_csv(r'\crew.csv')

#For Update Run: change the date below to that of the last script execution
logged_since_last_run = df['Date'] > '2023-11-17'
df = df[logged_since_last_run]

#Ids of the Letterboxd logged titles (i.e. the titles for the API to search for),
#minus every id that cast_full already holds
logged_ids = df['id'].astype(str).tolist()
known_ids = cast_full['id'].astype(str).tolist()
title = np.setdiff1d(logged_ids, known_ids)
print(title)
#Settings for the credits pull
REQUEST_TIMEOUT = 30  #seconds to wait on the API before giving up on a title
MAX_CAST = 20  #number of credited cast members kept per title
PROFILE_PREFIX = r'https://image.tmdb.org/t/p/original'
DEFAULT_PROFILE = r'https://i0.wp.com/s.ltrbxd.com/static/img/avatar1000.a71b6e9c.png?ssl=1'
CAST_DROP_COLUMNS = ['adult','known_for_department','cast_id','character','credit_id','original_name']
CREW_DROP_COLUMNS = ['adult','known_for_department','credit_id','original_name']
#Departments and roles being tracked
CREW_DEPARTMENTS = ["Writing","Directing","Costume & Make-Up","Editing","Camera"]
CREW_JOBS = ["Director of Photography","Director","Costume Design","Editor","Book","Novel","Screenplay","Writer","Lyricist","Script Consultant"]


def _prepare_people(people, movie_id, drop_columns):
    """Turn a list of credit records into a dataframe ready for appending.

    people: list of dicts taken from the 'cast' or 'crew' key of the credits payload.
    movie_id: value written to the 'id' column of every row.
    drop_columns: extraneous column names to discard; names not present are ignored.

    Returns a dataframe with 'gender' recoded to a string and 'profile_path'
    expanded to a full image URL, or to the default avatar when it is missing.
    """
    frame = pd.DataFrame.from_dict(people)
    #Tag every row with the movie id so rows can be matched against the logged titles
    frame['id'] = movie_id
    frame = frame.drop(columns=drop_columns, errors='ignore')
    #Recode Gender to String
    #NOTE(review): every code other than 1 becomes "Male". TMDB appears to also use
    #0 (not set) and 3 (non-binary) -- confirm whether those should get their own label.
    frame['gender'] = np.where(frame['gender']==1,"Female","Male")
    #If profile image isn't blank add in necessary prefix, else revert to default
    frame['profile_path'] = np.where(
        frame['profile_path'].notna(),
        PROFILE_PREFIX + frame['profile_path'].astype(str),
        DEFAULT_PROFILE,
    )
    return frame


def _format_cast(payload):
    """Return the first MAX_CAST credited cast members of a credits payload."""
    cast = _prepare_people(payload['cast'], payload['id'], CAST_DROP_COLUMNS)
    #TMDB numbers 'order' from 0, so a strict comparison keeps exactly MAX_CAST people
    #(the previous `<= 20` kept 21)
    return cast[cast['order'] < MAX_CAST]


def _format_crew(payload):
    """Return the crew of a credits payload limited to the tracked departments and roles."""
    crew = _prepare_people(payload['crew'], payload['id'], CREW_DROP_COLUMNS)
    tracked = crew['department'].isin(CREW_DEPARTMENTS) & crew['job'].isin(CREW_JOBS)
    return crew[tracked]


#Iterative approach to pulling all the credits
#New rows are collected here and appended once after the loop, rather than
#re-copying cast_full/crew_full on every title
new_cast = []
new_crew = []
for i, movie_id in enumerate(title):
    query = 'https://api.themoviedb.org/3/movie/{}/credits'.format(movie_id)
    try:
        response = requests.get(query, params={'api_key': API_key}, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        #Skip this title instead of aborting, so credits already pulled are not lost.
        #Only the exception type is printed: the full message can contain the request
        #URL, which includes the API key.
        print("Request failed for " + str(movie_id) + ": " + type(err).__name__)
        continue
    if response.status_code != 200:
        print("Non-200 response")
        continue
    try:
        json_format = response.json()
    except ValueError:
        print("Invalid JSON response")
        continue
    #Only tries to format correctly if both cast and crew are in JSON payload
    if not (json_format.get('cast') and json_format.get('crew')):
        print("Missing cast or crew")
        continue
    cast = _format_cast(json_format)
    crew = _format_crew(json_format)
    #Empty frames add no rows, so they are left out of the final concat
    if not cast.empty:
        new_cast.append(cast)
    if not crew.empty:
        new_crew.append(crew)
    #Print iterator to make sure it doesn't get stuck
    print(i)

#Append to existing df
if new_cast:
    cast_full = pd.concat([cast_full] + new_cast, ignore_index=True)
if new_crew:
    crew_full = pd.concat([crew_full] + new_crew, ignore_index=True)
#Write both accumulated dataframes back to CSV, keeping the header row and leaving out the index
for frame, path in ((cast_full, r'\cast.csv'), (crew_full, r'\crew.csv')):
    frame.to_csv(path, header=True, index=False)