In [94]:
!pip install requests-ratelimiter



In [95]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [96]:
from requests import Session
from requests_ratelimiter import LimiterAdapter
from tqdm import tqdm
import pandas as pd

In [97]:
import os
from getpass import getpass

# The TMDB API read-access token is a credential and must never be stored in
# the notebook itself. It is read from the TMDB_BEARER_TOKEN environment
# variable; when that is unset or empty the user is prompted once and the
# input is not echoed.
# NOTE: the token that used to be hard-coded in this cell has been exposed in
# the notebook and should be revoked and replaced.
_tmdb_token = os.environ.get("TMDB_BEARER_TOKEN") or getpass("TMDB API read access token: ")

# Headers sent with every TMDB request: JSON responses + bearer authentication.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {_tmdb_token}",
}

In [107]:
# Load only the two identifier columns and keep both as strings so the ids are
# used verbatim (no numeric conversion) when building request URLs.
links = pd.read_csv(
    "links.csv",
    usecols=["movieId", "tmdbId"],
    dtype={"movieId": str, "tmdbId": str},
)

# Parallel lists: position i in both lists refers to the same row of links.csv.
movie_ids = links["movieId"].to_list()
tmdb_ids = links["tmdbId"].to_list()

In [108]:
# Sanity check: the two id lists are iterated in lockstep further down, so
# they must have the same number of entries.
assert len(movie_ids) == len(tmdb_ids)

In [109]:
# One shared session for all TMDB lookups.
session = Session()

# Client-side throttle applied to every request routed through this adapter.
# NOTE(review): confirm that 100 requests/second is within TMDB's current limits.
adapter = LimiterAdapter(per_second=100)

# Mount the limiter on the API root instead of ".../3/movie/". requests selects
# the adapter by URL prefix, so the narrower prefix left the
# "https://api.themoviedb.org/3/tv/..." fallback requests made by the fetch
# loop outside the rate limiter. The root prefix covers both endpoints.
session.mount("https://api.themoviedb.org/3/", adapter)

In [110]:
# movieId -> HTTP status code for ids whose lookups did not succeed;
# filled in by the fetch loop.
error_rows = dict()

In [113]:
# Column layout of the output table, in the order the columns are exported.
_movie_columns = [
    "movieId",
    "budget",
    "original_language",
    "popularity",
    "revenue",
    "runtime",
    "vote_average",
    "vote_count",
    "overview",
    "tags",
]

# Start from an empty frame that only carries the schema above.
movies_df = pd.DataFrame(columns=_movie_columns)

In [114]:
# Fetch TMDB metadata for every (movieId, tmdbId) pair.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0, and growing a frame row by
# row copies the whole frame on every iteration. Building the frame from
# scratch also makes this cell safe to re-run without duplicating rows.
movie_records = []

# `total` is passed explicitly because zip() has no length, so tqdm could not
# otherwise draw a progress bar or estimate the remaining time.
for movie_id, tmdb_id in tqdm(zip(movie_ids, tmdb_ids), total=len(movie_ids)):
    movie_response = session.get(
        f"https://api.themoviedb.org/3/movie/{tmdb_id}",
        headers=headers,
    )

    if movie_response.status_code == 200:
        details = movie_response.json()

        # Keywords are requested only after the movie lookup succeeded, which
        # avoids a wasted request for ids that do not resolve to a movie.
        keywords_response = session.get(
            f"https://api.themoviedb.org/3/movie/{tmdb_id}/keywords",
            headers=headers,
        )
        if keywords_response.status_code == 200:
            tags = [keyword["name"] for keyword in keywords_response.json()["keywords"]]
        else:
            # Placeholder used when the keyword lookup fails.
            tags = [""]

        movie_records.append(
            {
                "movieId": movie_id,
                "budget": details["budget"],
                "original_language": details["original_language"],
                "popularity": details["popularity"],
                "revenue": details["revenue"],
                "runtime": details["runtime"],
                "vote_average": details["vote_average"],
                "vote_count": details["vote_count"],
                "overview": details["overview"],
                "tags": tags,
            }
        )
        continue

    # The id did not resolve as a movie: retry it against the TV endpoint.
    # NOTE(review): this reuses the movie tmdbId as a TV id — confirm that the
    # two id spaces are meant to be interchangeable here.
    tv_response = session.get(
        f"https://api.themoviedb.org/3/tv/{tmdb_id}",
        headers=headers,
    )

    if tv_response.status_code != 200:
        # Neither endpoint knows the id: remember the status of the last attempt.
        error_rows[movie_id] = tv_response.status_code
        continue

    tv_details = tv_response.json()
    movie_records.append(
        {
            "movieId": movie_id,
            # Movie-only fields are filled with 0 for TV entries.
            "budget": 0,
            "original_language": tv_details["original_language"],
            "popularity": tv_details["popularity"],
            "revenue": 0,
            "runtime": 0,
            "vote_average": tv_details["vote_average"],
            "vote_count": tv_details["vote_count"],
            "overview": tv_details["overview"],
            "tags": [""],
        }
    )

# Reuse the column layout of the (empty) schema frame so the column order is
# preserved even when no record was fetched.
movies_df = pd.DataFrame(movie_records, columns=list(movies_df.columns))


9742it [21:13,  7.65it/s]


In [115]:
# Preview the first five fetched rows.
movies_df.head(n=5)

Unnamed: 0,movieId,budget,original_language,popularity,revenue,runtime,vote_average,vote_count,overview,tags
0,1,30000000,en,102.775,394400000,81,7.97,17264,"Led by Woody, Andy's toys live happily in his ...","[martial arts, jealousy, friendship, bullying,..."
1,2,65000000,en,15.21,262821940,104,7.238,9887,When siblings Judy and Peter discover an encha...,"[giant insect, board game, jungle, disappearan..."
2,3,25000000,en,12.835,71500000,101,6.494,350,A family wedding reignites the ancient feud be...,"[fishing, halloween, sequel, old man, best fri..."
3,4,16000000,en,14.973,81452156,127,6.183,142,"Cheated on, mistreated and stepped on, the wom...","[based on novel or book, interracial relations..."
4,5,0,en,17.414,76594107,106,6.235,663,Just when George Banks has recovered from his ...,"[parent child relationship, baby, midlife cris..."


In [116]:
# A row counts as incomplete when at least one of its cells equals 0 or is null.
zero_mask = (movies_df == 0).any(axis=1)
null_mask = movies_df.isnull().any(axis=1)

# movieIds (as strings) of all incomplete rows.
missing_rows = movies_df.loc[zero_mask | null_mask, "movieId"].astype(str).tolist()

print("Number of rows that have empty/missing feature: ", len(missing_rows), sep="")
print("Number of error rows: ", len(error_rows), sep="")

Number of rows that have empty/missing feature: 4483
Number of error rows: 55


In [117]:
# Show the movieId -> HTTP status mapping collected by the fetch loop for ids
# where both the movie lookup and the TV fallback returned a non-200 status.
error_rows

{'791': 404,
 '1107': 404,
 '2851': 404,
 '4051': 404,
 '7669': 404,
 '7762': 404,
 '7841': 404,
 '26453': 404,
 '26587': 404,
 '32600': 404,
 '40697': 404,
 '52281': 404,
 '53883': 404,
 '55207': 404,
 '57772': 404,
 '61406': 404,
 '77177': 404,
 '79299': 404,
 '84847': 404,
 '86237': 404,
 '90647': 404,
 '92475': 404,
 '93988': 404,
 '95738': 404,
 '96518': 404,
 '96520': 404,
 '99532': 404,
 '99764': 404,
 '106642': 404,
 '115969': 404,
 '121035': 404,
 '126430': 404,
 '127390': 404,
 '131724': 404,
 '139130': 404,
 '140481': 404,
 '148675': 404,
 '150548': 404,
 '151763': 404,
 '152284': 404,
 '159817': 404,
 '163809': 404,
 '167570': 404,
 '170705': 404,
 '171011': 404,
 '171495': 404,
 '171749': 404,
 '173535': 404,
 '174053': 404,
 '174403': 404,
 '175693': 404,
 '176329': 404,
 '179135': 404,
 '184257': 404,
 '185135': 404}

In [118]:
# Write the collected table to an Excel workbook, leaving out the row index.
movies_df.to_excel(excel_writer="movies.xlsx", index=False)

In [119]:
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm before running
# the notebook in another environment.
from google.colab import files
# Hand the exported workbook written by the previous cell to the Colab
# download helper.
files.download('movies.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>