In [1]:
import csv

import pandas as pd

# Load the title table. "\N" and "nan" are treated as missing values.
#
# quoting=csv.QUOTE_NONE: pandas otherwise treats a double quote as a CSV quote
# character, so a title with an unbalanced '"' can swallow the following tabs
# and newlines and shift or merge rows. This assumes 'title.basics.tsv' is the
# raw, unquoted IMDb export -- TODO confirm for this local copy.
basics = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    low_memory=False,
    na_values=["\\N", "nan"],
    quoting=csv.QUOTE_NONE,
)

# Keep only the rows whose titleType is exactly "movie".
basics = basics[basics["titleType"] == "movie"]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0.0,1905.0,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907.0,,90,Drama
...,...,...,...,...,...,...,...,...,...
9569501,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0.0,2015.0,,57,Documentary
9569528,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0.0,2007.0,,100,Documentary
9569540,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0.0,2013.0,,,Comedy
9569550,tt9916730,movie,6 Gunn,6 Gunn,0.0,2017.0,,116,


In [2]:
# Load the ratings table (tconst, averageRating, numVotes); "\N" and "nan"
# are read as missing values.
ratings = pd.read_csv(
    'title.ratings.tsv',
    sep='\t',
    na_values=["\\N", "nan"],
    low_memory=False,
)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1947
1,tt0000002,5.8,263
2,tt0000003,6.5,1773
3,tt0000004,5.6,179
4,tt0000005,6.2,2580
...,...,...,...
1273679,tt9916730,8.1,9
1273680,tt9916766,7.0,21
1273681,tt9916778,7.2,36
1273682,tt9916840,8.8,6


In [3]:
# Shared options for the two TSV reads below. "\N" and "nan" are treated as
# missing values.
#
# quoting=csv.QUOTE_NONE keeps double quotes inside free-text fields (such as
# primaryName) as literal characters instead of letting pandas interpret them
# as CSV quoting, which can strip them or merge rows. This assumes both files
# are raw, unquoted IMDb exports -- TODO confirm for these local copies.
_imdb_read_options = dict(
    sep='\t',
    low_memory=False,
    na_values=["\\N", "nan"],
    quoting=csv.QUOTE_NONE,
)

# Principal credits per title, and the people those credits refer to.
cast = pd.read_csv('title.principals.tsv', **_imdb_read_options)
names = pd.read_csv('name.basics.tsv', **_imdb_read_options)

In [4]:
# Movies joined to their ratings, most-voted first.
df = basics.merge(ratings, on='tconst').sort_values('numVotes', ascending=False)

# The 1000 most-voted movies whose primary and original titles are identical.
same_title = df['primaryTitle'] == df['originalTitle']
most_voted = df.loc[same_title].head(1000)

# Acting credits only, resolved to people. Both merges use the default inner
# join, so a movie with no matching actor/actress row drops out here.
acting_credits = cast.loc[cast['category'].isin(['actor', 'actress'])]
credited_people = most_voted.merge(acting_credits, on='tconst').merge(names, on='nconst')

# One row per movie with its performers joined by "|", then the title columns
# from `basics` attached again (the rating columns are not carried over).
performers_by_film = credited_people.groupby(['tconst']).agg({
    'primaryName': '|'.join
})
top = performers_by_film.merge(basics, on='tconst')

top

Unnamed: 0,tconst,primaryName,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0027977,Al Ernest Garcia|Charles Chaplin|Paulette Godd...,movie,Modern Times,Modern Times,0.0,1936.0,,87,"Comedy,Drama,Romance"
1,tt0031381,Thomas Mitchell|Clark Gable|Vivien Leigh|Barba...,movie,Gone with the Wind,Gone with the Wind,0.0,1939.0,,238,"Drama,Romance,War"
2,tt0032138,Judy Garland|Frank Morgan|Ray Bolger|Bert Lahr,movie,The Wizard of Oz,The Wizard of Oz,0.0,1939.0,,102,"Adventure,Family,Fantasy"
3,tt0032553,Charles Chaplin|Paulette Goddard|Grace Hayle|J...,movie,The Great Dictator,The Great Dictator,0.0,1940.0,,125,"Comedy,Drama,War"
4,tt0033467,Orson Welles|Joseph Cotten|Dorothy Comingore|A...,movie,Citizen Kane,Citizen Kane,0.0,1941.0,,119,"Drama,Mystery"
...,...,...,...,...,...,...,...,...,...,...
995,tt9032400,Angelina Jolie|Gemma Chan|Richard Madden|Salma...,movie,Eternals,Eternals,0.0,2021.0,,156,"Action,Adventure,Fantasy"
996,tt9243946,Aaron Paul|Jonathan Banks|Matt Jones|Charles B...,movie,El Camino: A Breaking Bad Movie,El Camino: A Breaking Bad Movie,0.0,2019.0,,122,"Action,Crime,Drama"
997,tt9376612,Ben Kingsley|Simu Liu|Awkwafina|Tony Chiu-Wai ...,movie,Shang-Chi and the Legend of the Ten Rings,Shang-Chi and the Legend of the Ten Rings,0.0,2021.0,,132,"Action,Adventure,Fantasy"
998,tt9419884,Benedict Cumberbatch|Chiwetel Ejiofor|Benedict...,movie,Doctor Strange in the Multiverse of Madness,Doctor Strange in the Multiverse of Madness,0.0,2022.0,,126,"Action,Adventure,Fantasy"


In [5]:
from rdflib import Graph
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL client bound to the "wikidata" repository served on localhost:7200.
WIKIDATA_ENDPOINT = "http://localhost:7200/repositories/wikidata"
sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)

def gen():
    """Query the module-level ``sparql`` endpoint for film metadata.

    Runs one SELECT that, for every entity typed as a film with an IMDb ID,
    groups the labels of its directors, composers, genres, subjects, periods
    and locations into "|"-separated strings.

    Yields:
        dict: one flat record per result row, mapping each bound variable name
        (``title``, ``tconst``, ``directors``, ...) to the ``value`` of its
        binding.

    NOTE(review): ``rdfs:label`` is not language-filtered, so if the repository
    holds labels in several languages a film yields one row per label -- confirm
    against the loaded data.
    """
    # The period clause uses wdt:P2408 ("set in period"), the counterpart of
    # wdt:P840 ("narrative location") used for ?location.
    QUERY = '''
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT
    ?title ?tconst
    (GROUP_CONCAT(DISTINCT ?director; SEPARATOR="|") as ?directors)
    (GROUP_CONCAT(DISTINCT ?composer; SEPARATOR="|") as ?composers)
    (GROUP_CONCAT(DISTINCT ?genre; SEPARATOR="|") as ?genres)
    (GROUP_CONCAT(DISTINCT ?subject; SEPARATOR="|") as ?subjects)
    (GROUP_CONCAT(DISTINCT ?period; SEPARATOR="|") as ?periods)
    (GROUP_CONCAT(DISTINCT ?location; SEPARATOR="|") as ?locations)

    WHERE {
        ?film wdt:P31 wd:Q11424 .
        ?film rdfs:label ?title .
        ?film wdt:P345 ?tconst .
        OPTIONAL { ?film wdt:P57 ?d . ?d rdfs:label ?director . }
        OPTIONAL { ?film wdt:P86 ?c . ?c rdfs:label ?composer . }
        OPTIONAL { ?film wdt:P136 ?g . ?g rdfs:label ?genre . }
        OPTIONAL { ?film wdt:P921 ?s . ?s rdfs:label ?subject . }
        OPTIONAL { ?film wdt:P180 ?s2 . ?s2 rdfs:label ?subject . }
        OPTIONAL { ?film wdt:P2408 ?p . ?p rdfs:label ?period . }
        OPTIONAL { ?film wdt:P840 ?l . ?l rdfs:label ?location . }
    }
    GROUP BY ?title ?tconst
    '''

    sparql.setQuery(QUERY)
    sparql.setReturnFormat(JSON)
    response = sparql.queryAndConvert()

    # Each binding maps a variable name to a dict describing the bound term;
    # only its 'value' entry is kept.
    for binding in response['results']['bindings']:
        yield {variable: term['value'] for variable, term in binding.items()}


# Build one DataFrame row per record yielded by gen().
wikidata_records = gen()
df2 = pd.DataFrame.from_records(wikidata_records)
df2

Unnamed: 0,title,tconst,directors,composers,genres,subjects,periods,locations
0,The Intouchables,tt1675434,Éric Toledano|Olivier Nakache,Ludovico Einaudi,film based on literature|biographical film|com...,,,Paris
1,12 Angry Men,tt0050083,Sidney Lumet,Kenyon Hopkins,drama film|Huis clos|trial film,jury trial|distinction|capital punishment,,Manhattan
2,Nalai Manithan,tt4168940,Velu Prabhakaran,,romance film|horror film|action film,,,Chennai
3,Labyrinth,tt0091369,Jim Henson,Trevor Jones,fantasy film|coming-of-age fiction|adventure f...,,,
4,Pirates II: Stagnetti's Revenge,tt1266097,Joone,,pirate film|pornographic film,sea piracy,,Jamaica
...,...,...,...,...,...,...,...,...
21370,Raavan Leela,tt9538242,,,drama film,,,
21371,Iris Chang: The Rape of Nanking,tt1218513,,,,,,
21372,The Challenge,tt14812496,Klim Shipenko,,space drama,,,
21373,Kampung Latah Kena Kuarantin,tt15392890,Azizi Adnan,,comedy film,latah,,


In [6]:
# Inner-join the IMDb selection with the SPARQL results on the IMDb ID.
# Both frames carry a `genres` column, so pandas suffixes them: genres_x comes
# from `top`, genres_y from `df2`.
df3 = pd.merge(top, df2, on='tconst')
df3

Unnamed: 0,tconst,primaryName,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,title,directors,composers,genres_y,subjects,periods,locations
0,tt0032553,Charles Chaplin|Paulette Goddard|Grace Hayle|J...,movie,The Great Dictator,The Great Dictator,0.0,1940.0,,125,"Comedy,Drama,War",The Great Dictator,Charlie Chaplin,Charlie Chaplin|Meredith Willson,drama film|comedy drama|war film|anti-war film...,antisemitism|fascism,,Osterlich|Bacteria|Tomainia
1,tt0050083,Henry Fonda|Lee J. Cobb|Martin Balsam|John Fie...,movie,12 Angry Men,12 Angry Men,0.0,1957.0,,96,"Crime,Drama",12 Angry Men,Sidney Lumet,Kenyon Hopkins,drama film|Huis clos|trial film,jury trial|distinction|capital punishment,,Manhattan
2,tt0052618,Jack Hawkins|Charlton Heston|Stephen Boyd|Haya...,movie,Ben-Hur,Ben-Hur,0.0,1959.0,,212,"Adventure,Drama",Ben-Hur,William Wyler,Miklós Rózsa,drama film|action film|film based on a novel|C...,,,Israel
3,tt0053125,Cary Grant|Eva Marie Saint|James Mason|Jessie ...,movie,North by Northwest,North by Northwest,0.0,1959.0,,136,"Action,Adventure,Mystery",North by Northwest,Alfred Hitchcock,Bernard Herrmann,crime film|thriller film|mystery film|spy film,espionage|kidnapping|mistaken identity,,New York City|Indiana|Chicago|Manhattan|Long I...
4,tt0057115,Richard Attenborough|James Garner|Steve McQuee...,movie,The Great Escape,The Great Escape,0.0,1963.0,,172,"Adventure,Drama,History",The Great Escape,John Sturges,Elmer Bernstein,drama film|war film|film based on books|prison...,World War II|aviation|escape,,Germany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,tt6264654,Ryan Reynolds|Taika Waititi|Jodie Comer|Lil Re...,movie,Free Guy,Free Guy,0.0,2021.0,,115,"Action,Adventure,Comedy",Free Guy,Shawn Levy,Christophe Beck,comedy film|action film|fantasy film|science f...,,,United States of America
115,tt6857112,Lupita Nyong'o|Winston Duke|Elisabeth Moss|Tim...,movie,Us,Us,0.0,2019.0,,116,"Horror,Mystery,Thriller",Us,Jordan Peele,Michael Abels,horror film|action film|science fiction film|t...,self-discovery|psychological repression|coming...,,California
116,tt7286456,Joaquin Phoenix|Robert De Niro|Zazie Beetz|Fra...,movie,Joker,Joker,0.0,2019.0,,122,"Crime,Drama,Thriller",Joker,Todd Phillips,Hildur Guðnadóttir,drama film|crime film|thriller film|psychologi...,Joker,,Gotham City
117,tt7349950,Jessica Chastain|Bill Hader|James McAvoy|Isaia...,movie,It Chapter Two,It Chapter Two,0.0,2019.0,,169,"Drama,Fantasy,Horror",It: Chapter Two,Andrés Muschietti,Benjamin Wallfisch,horror film|film based on a novel|LGBT-related...,,,Maine


In [8]:
import json 

def _split_values(cell, sep='|'):
    """Return the distinct non-empty tokens of a delimited cell as a set.

    ``str.split`` turns an empty string into ``['']``, so empty tokens are
    dropped here. A non-string cell (such as NaN for a missing value) gives an
    empty set instead of raising ``AttributeError``.
    """
    if not isinstance(cell, str):
        return set()
    return {token for token in cell.split(sep) if token}


def _sorted_or_none(values):
    """Sort values case-insensitively; return None when there are none."""
    return sorted(values, key=str.casefold) or None


# Maps "<title> (<year>)" to that film's metadata record.
# NOTE(review): two rows sharing title and year overwrite each other here.
film_data = {}

for row in df3.itertuples():
    title = row.title
    year = int(row.startYear)
    display = f'{title} ({year})'

    # genres_y labels lose their " film" suffix (e.g. "drama film" -> "drama")
    # so they can merge with the lower-cased, comma-separated genres_x values.
    genres_from_labels = {
        g.replace(' film', '').lower() for g in _split_values(row.genres_y)
    }
    genres_from_imdb = {g.lower() for g in _split_values(row.genres_x, sep=',')}

    film = {
        'title': title,
        'imdb': row.tconst,
        'decade': f'{year // 10 * 10}s',
        'actors': sorted(_split_values(row.primaryName)),
        # Every optional field below is None (JSON null) when no value exists,
        # never a list holding an empty string.
        'directors': _sorted_or_none(_split_values(row.directors)),
        'composers': _sorted_or_none(_split_values(row.composers)),
        'settings': _sorted_or_none(
            _split_values(row.periods) | _split_values(row.locations)
        ),
        'genres': _sorted_or_none((genres_from_labels | genres_from_imdb) - {''}),
        'subjects': _sorted_or_none(_split_values(row.subjects)),
    }

    # Every film is stored, including those with no optional metadata.
    film_data[display] = film

with open('films.json', 'w') as f:
    json.dump(film_data, f, indent=2)