In [2]:
import os
import json
from tqdm import tqdm
import requests
from preprocessing.rss_parser import RSSParser

### Parse the RSS feed and build a dictionary mapping each episode to its raw-text topics

In [3]:
# Build the parser object and invoke it right away; the value returned by the
# call is what exposes `episodes_dict` and `get_topics` below.
parser = RSSParser("data/misc/episodes.rss")()

# One entry per episode: key from `episodes_dict` -> topics extracted for it.
episode_topics = {
    episode_key: parser.get_topics(episode)
    for episode_key, episode in parser.episodes_dict.items()
}



#### Exclude hand-picked topics that are not movies or TV shows

In [4]:
# Hand-picked keywords marking topics that are not movies or TV shows.
# They are matched as substrings of the topic text by the title filter below.
_non_title_keywords = (
    "borítókép",
    "sorsolás",
    "villámkérdés",
    "felvezető",
    "zárthelyi",
    "filmév",
    "filmbarátok",
    "oscar",
    "évösszegzés",
)
exclude_topics = set(_non_title_keywords)

In [5]:
# Collect every distinct topic string across all episodes, dropping any topic
# that contains one of the `exclude_topics` keywords as a substring.
# Built as a single set comprehension instead of calling `.add` inside a
# throwaway comprehension, which allocated a useless set of `None` per episode.
title_set = {
    entry["topic"]
    for topics in episode_topics.values()
    for entry in topics
    if not any(keyword in entry["topic"] for keyword in exclude_topics)
}

### Search for each title using the TMDB API and get its IMDb link

In [6]:
# Read the TMDB credential from the environment so it is never stored in the
# notebook itself. Raises KeyError if TMDB_TOKEN is not set. The value is sent
# as the `api_key` query parameter by the lookup loop below.
tmdb_api_key = os.environ["TMDB_TOKEN"]

In [7]:
# Seconds to wait for TMDB before giving up on a single request; without a
# timeout one stalled connection would hang the whole (long-running) loop.
TMDB_TIMEOUT_S = 10
TMDB_API_ROOT = "https://api.themoviedb.org/3"


def _tmdb_get_json(session, url, params):
    """GET ``url`` from TMDB and decode the JSON body.

    Args:
        session: ``requests.Session`` used to send the request.
        url: Full endpoint URL.
        params: Query parameters; ``requests`` URL-encodes them, so titles
            containing ``&``, ``#``, ``+`` etc. are transmitted intact.

    Returns:
        ``(payload, None)`` on success, or ``(None, error_text)`` on failure.
    """
    try:
        response = session.get(url, params=params, timeout=TMDB_TIMEOUT_S)
        if response.status_code != 200:
            return None, f"{response.status_code} - {response.text}"
        return response.json(), None
    except (requests.RequestException, ValueError) as exc:
        # Record only the exception type: the exception message can embed the
        # request URL, which carries the API key, and these errors are later
        # written to disk.
        return None, f"request failed - {type(exc).__name__}"


# title -> IMDb URL for every title that could be resolved.
movie_url_dict = {}
# title -> human-readable reason why no IMDb URL could be determined.
not_found_dict = {}

# A single session reuses the HTTPS connection across all requests.
with requests.Session() as session:
    for title in tqdm(title_set):
        search, error = _tmdb_get_json(
            session,
            f"{TMDB_API_ROOT}/search/movie",
            {"api_key": tmdb_api_key, "query": title},
        )
        if error is not None:
            not_found_dict[title] = f"Tmdb search error: {error}"
            continue

        results = search.get("results") or []
        if not results:
            not_found_dict[title] = "Tmdb search error: No results"
            continue

        # Take the most popular hit; `max` returns the first maximum, which is
        # the same element a stable descending sort would place first.
        best_match = max(results, key=lambda hit: hit.get("popularity") or 0)

        movie, error = _tmdb_get_json(
            session,
            f"{TMDB_API_ROOT}/movie/{best_match['id']}",
            {"api_key": tmdb_api_key},
        )
        if error is not None:
            not_found_dict[title] = f"Tmdb movie lookup error: {error}"
            continue

        imdb_id = movie.get("imdb_id")
        # Treat an empty string like a missing id; otherwise a dangling
        # "https://www.imdb.com/title/" link would be stored.
        if not imdb_id:
            not_found_dict[title] = "Tmdb movie lookup error: No imdb id"
            continue

        movie_url_dict[title] = f"https://www.imdb.com/title/{imdb_id}"

100%|██████████| 1652/1652 [43:36<00:00,  1.58s/it] 


In [15]:
# Spot-check: show the first ten (title, IMDb URL) pairs.
[*movie_url_dict.items()][:10]

[('az elátkozott leeds united', 'https://www.imdb.com/title/tt1226271'),
 ('a vihar kapujában', 'https://www.imdb.com/title/tt0042876'),
 ('éjféli etetés', 'https://www.imdb.com/title/tt0805570'),
 ('cam', 'https://www.imdb.com/title/tt13496400'),
 ('palmer', 'https://www.imdb.com/title/tt6857376'),
 ('the green inferno', 'https://www.imdb.com/title/tt2403021'),
 ('csúcshatás', 'https://www.imdb.com/title/tt1219289'),
 ('mulan', 'https://www.imdb.com/title/tt0120762'),
 ('straight outta compton', 'https://www.imdb.com/title/tt1398426'),
 ('asterix a varázsital titka', 'https://www.imdb.com/title/tt8001346')]

In [11]:
# Persist the raw per-episode topics. `ensure_ascii=False` writes the Hungarian
# characters verbatim, so the file encoding is pinned to UTF-8 instead of
# relying on the platform default (which can fail or mis-encode on Windows).
with open("data/misc/episode_topics.json", "w", encoding="utf-8") as file:
    json.dump(episode_topics, file, indent=4, ensure_ascii=False)

In [12]:
# Persist the titles that could not be resolved, with the reason for each.
# `ensure_ascii=False` writes non-ASCII titles verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("data/misc/not_found.json", "w", encoding="utf-8") as file:
    json.dump(not_found_dict, file, indent=4, ensure_ascii=False)