In [1]:
import pandas as pd

# Load the title table and keep only the rows whose titleType is "movie".
# The literal strings "\N" and "nan" are read as missing values.
basics = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    low_memory=False,
    na_values=["\\N","nan"],
).loc[lambda titles: titles["titleType"] == "movie"]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0.0,1905.0,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907.0,,90,Drama
...,...,...,...,...,...,...,...,...,...
9569501,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0.0,2015.0,,57,Documentary
9569528,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0.0,2007.0,,100,Documentary
9569540,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0.0,2013.0,,,Comedy
9569550,tt9916730,movie,6 Gunn,6 Gunn,0.0,2017.0,,116,


In [2]:
# Load the per-title ratings table (tab-separated); the literal strings
# "\N" and "nan" are read as missing values.
ratings = pd.read_csv(
    'title.ratings.tsv',
    sep='\t',
    low_memory=False,
    na_values=["\\N","nan"],
)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1947
1,tt0000002,5.8,263
2,tt0000003,6.5,1773
3,tt0000004,5.6,179
4,tt0000005,6.2,2580
...,...,...,...
1273679,tt9916730,8.1,9
1273680,tt9916766,7.0,21
1273681,tt9916778,7.2,36
1273682,tt9916840,8.8,6


In [3]:
# Join movies with their ratings on the shared `tconst` key (default inner
# join) and order the result from most-voted to least-voted.
df = basics.merge(ratings, on='tconst').sort_values(by='numVotes', ascending=False)

# The 1000 rows with the highest vote counts.
top = df.head(n=1000)
top

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
60889,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0.0,1994.0,,142,Drama,9.3,2692648
134739,tt0468569,movie,The Dark Knight,The Dark Knight,0.0,2008.0,,152,"Action,Crime,Drama",9.0,2666465
173373,tt1375666,movie,Inception,Inception,0.0,2010.0,,148,"Action,Adventure,Sci-Fi",8.8,2365330
70740,tt0137523,movie,Fight Club,Fight Club,0.0,1999.0,,139,Drama,8.8,2138148
60157,tt0109830,movie,Forrest Gump,Forrest Gump,0.0,1994.0,,142,"Drama,Romance",8.8,2090961
...,...,...,...,...,...,...,...,...,...,...,...
144361,tt0959337,movie,Revolutionary Road,Revolutionary Road,0.0,2008.0,,119,"Drama,Romance",7.3,215949
179583,tt1464335,movie,Uncharted,Uncharted,0.0,2022.0,,116,"Action,Adventure",6.3,215596
69782,tt0134119,movie,The Talented Mr. Ripley,The Talented Mr. Ripley,0.0,1999.0,,139,"Crime,Drama,Thriller",7.4,215460
216972,tt2404435,movie,The Magnificent Seven,The Magnificent Seven,0.0,2016.0,,132,"Action,Adventure,Western",6.8,214763


In [4]:
import json
from collections import defaultdict
from SPARQLWrapper import SPARQLWrapper, TURTLE, JSONLD, JSON
from tqdm.notebook import tqdm


def _escape_sparql_string(text):
    """Return ``text`` escaped for use inside a double-quoted SPARQL string literal.

    Backslashes, double quotes and newline / carriage-return / tab characters
    are replaced by their backslash escapes, so a title containing them cannot
    terminate the literal early or otherwise change the query.
    """
    # Backslashes must be handled first so the escapes added below are not doubled.
    escaped = str(text).replace("\\", "\\\\")
    for raw, replacement in (('"', '\\"'), ("\n", "\\n"), ("\r", "\\r"), ("\t", "\\t")):
        escaped = escaped.replace(raw, replacement)
    return escaped


sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# The return format is the same for every request, so configure it once.
sparql.setReturnFormat(JSON)

# Maps a film's primaryTitle to the English category labels returned for it.
film_categories = {}

# Results are keyed by title, so each distinct title needs only one query.
for film_title in tqdm(top['primaryTitle'].drop_duplicates()):
    safe_title = _escape_sparql_string(film_title)

    # Select the English labels of every dcterms:subject attached to a
    # dbo:Film resource whose dbp:name equals the title.
    QUERY = f'''
    SELECT DISTINCT ?cat
    WHERE {{
      ?a <http://dbpedia.org/property/name> ?name .
      ?a <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Film> .
      ?a <http://purl.org/dc/terms/subject> ?x .
      ?x <http://www.w3.org/2000/01/rdf-schema#label> ?b .
      FILTER(langMatches(lang(?b), "en"))
      FILTER(str(?name) = "{safe_title}")
      BIND (str(?b) as ?cat)
    }}
    '''

    sparql.setQuery(QUERY)
    ret = sparql.query().convert()
    # De-duplicate via a set, then sort so the category order is the same on
    # every run (set iteration order is not).
    film_categories[film_title] = sorted(
        {binding['cat']['value'] for binding in ret['results']['bindings']}
    )


  0%|          | 0/1000 [00:00<?, ?it/s]

In [5]:
from collections import Counter

# Tally, for every category label, how many films carry it.
cats_count = Counter(
    label
    for labels in film_categories.values()
    for label in labels
)

# For each film keep only the categories that occur for more than one film,
# and keep the film itself only if at least five such categories remain.
film_categories_trimmed = {}
for title, cats in film_categories.items():
    shared = [cat for cat in cats if cats_count[cat] > 1]
    if len(shared) >= 5:
        film_categories_trimmed[title] = shared

In [6]:
# Write the trimmed title -> categories mapping to films.json, indented by
# two spaces.
with open('films.json', 'w') as out_file:
    out_file.write(json.dumps(film_categories_trimmed, indent=2))