In [1]:
import pandas as pd

# Read the title table and keep only the rows whose titleType is "movie".
# Cells containing "\N" or "nan" are parsed as missing values.
basics = (
    pd.read_csv(
        'title.basics.tsv',
        sep='\t',
        na_values=["\\N", "nan"],
        low_memory=False,
    )
    .loc[lambda titles: titles["titleType"] == "movie"]
)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0.0,1905.0,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907.0,,90,Drama
...,...,...,...,...,...,...,...,...,...
9569501,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0.0,2015.0,,57,Documentary
9569528,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0.0,2007.0,,100,Documentary
9569540,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0.0,2013.0,,,Comedy
9569550,tt9916730,movie,6 Gunn,6 Gunn,0.0,2017.0,,116,


In [None]:
# Read the ratings table (tab-separated); cells containing "\N" or "nan"
# are parsed as missing values. Later cells join it on `tconst` and rank
# by `numVotes`.
ratings = pd.read_csv(
    'title.ratings.tsv',
    sep='\t',
    na_values=["\\N", "nan"],
    low_memory=False,
)
ratings

In [None]:
# Attach ratings to the movie rows (default inner join on `tconst`) and
# order the result from most-voted to least-voted.
df = basics.merge(ratings, on='tconst').sort_values(by='numVotes', ascending=False)

# The 500 most-voted movies are the working set for the remaining cells.
top = df.iloc[:500]
top

In [None]:
import json
from collections import defaultdict
from SPARQLWrapper import SPARQLWrapper, TURTLE, JSONLD, JSON

def _escape_sparql_string(value):
    """Escape ``value`` for use inside a double-quoted SPARQL string literal.

    Backslashes, double quotes, line feeds and carriage returns may not
    appear unescaped between double quotes in SPARQL, so a title containing
    any of them would otherwise yield a malformed (or altered) query.
    Non-string values are converted with ``str()`` first, matching what the
    f-string interpolation does with them.
    """
    return (
        str(value)
        .replace('\\', '\\\\')  # must run first so the escapes added below are not doubled
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )


sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# Every request wants JSON back, so the return format is configured once.
sparql.setReturnFormat(JSON)

# Maps each film title to the English labels of the categories found for it.
film_categories = {}

# A repeated title would send the identical query again and overwrite the
# same dictionary key, so each distinct title is looked up only once.
for film_title in top['primaryTitle'].drop_duplicates():

    escaped_title = _escape_sparql_string(film_title)

    # Select the English labels of all dct:subject categories of resources
    # typed dbo:Film whose dbp:name equals the title exactly.
    QUERY = f'''
    SELECT DISTINCT ?cat
    WHERE {{
      ?a <http://dbpedia.org/property/name> ?name .
      ?a <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Film> .
      ?a <http://purl.org/dc/terms/subject> ?x .
      ?x <http://www.w3.org/2000/01/rdf-schema#label> ?b .
      FILTER(langMatches(lang(?b), "en"))
      FILTER(str(?name) = "{escaped_title}")
      BIND (str(?b) as ?cat)
    }}
    '''
    # One dot per request as a lightweight progress indicator.
    print('.', end='')

    sparql.setQuery(QUERY)
    ret = sparql.query().convert()
    # De-duplicate via a set, then sort so the stored list (and the JSON
    # written from it) has the same order on every run.
    film_categories[film_title] = sorted(
        {binding['cat']['value'] for binding in ret['results']['bindings']}
    )

# Terminate the line of progress dots.
print()

In [None]:
from collections import Counter

# Tally how often each category label occurs across all per-film lists.
cats_count = Counter(
    label
    for film_labels in film_categories.values()
    for label in film_labels
)

# Keep, for every film that has at least one category, only the categories
# that occur more than once overall.
# NOTE(review): a film whose categories are all trimmed away is still kept,
# with an empty list, whereas a film with no categories at all is dropped —
# confirm this asymmetry is intended.
film_categories_trimmed = {
    film: [label for label in film_labels if cats_count[label] > 1]
    for film, film_labels in film_categories.items()
    if film_labels
}

In [None]:
# Persist the trimmed title -> categories mapping as JSON for later use.
serialized = json.dumps(film_categories_trimmed)
with open('films.json', mode='w') as out_file:
    out_file.write(serialized)