In [1]:
import pandas as pd

# Read the title table (cells containing "\N" or "nan" become missing values)
# and keep only the rows whose titleType is "movie".
basics = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    low_memory=False,
    na_values=["\\N", "nan"],
).loc[lambda titles: titles["titleType"] == "movie"]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0.0,1905.0,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907.0,,90,Drama
...,...,...,...,...,...,...,...,...,...
9629481,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0.0,2015.0,,57,Documentary
9629508,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0.0,2007.0,,100,Documentary
9629520,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0.0,2013.0,,,Comedy
9629530,tt9916730,movie,6 Gunn,6 Gunn,0.0,2017.0,,116,


In [2]:
# Read the ratings table; cells containing "\N" or "nan" become missing values.
ratings = pd.read_csv(
    'title.ratings.tsv',
    sep='\t',
    low_memory=False,
    na_values=["\\N", "nan"],
)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1952
1,tt0000002,5.8,264
2,tt0000003,6.5,1787
3,tt0000004,5.6,179
4,tt0000005,6.2,2589
...,...,...,...
1281760,tt9916730,8.3,10
1281761,tt9916766,7.0,21
1281762,tt9916778,7.2,36
1281763,tt9916840,8.8,6


In [3]:
# Parsing options shared by both reads: tab-separated input, with "\N" and
# "nan" cells treated as missing values.
tsv_options = dict(sep='\t', low_memory=False, na_values=["\\N", "nan"])

cast = pd.read_csv('title.principals.tsv', **tsv_options)
names = pd.read_csv('name.basics.tsv', **tsv_options)

In [4]:
# Movies joined with their ratings, ordered from most to fewest votes.
df = basics.merge(ratings, on='tconst').sort_values(by='numVotes', ascending=False)

# The 1000 most-voted movies whose primary and original titles are identical.
same_title = df['primaryTitle'] == df['originalTitle']
most_voted = df.loc[same_title].head(1000)

# Principal rows for actors/actresses only, joined to the films and then to
# the name table so each credit carries a primaryName.
performers = cast.loc[cast['category'].isin(['actor', 'actress'])]
credits = most_voted.merge(performers, on='tconst').merge(names, on='nconst')

# One row per film with its performer names joined by "|", then re-attach the
# film's columns from `basics`.
performer_names = credits.groupby(['tconst']).agg({'primaryName': '|'.join})
top = performer_names.merge(basics, on='tconst')

top

Unnamed: 0,tconst,primaryName,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0027977,Al Ernest Garcia|Charles Chaplin|Paulette Godd...,movie,Modern Times,Modern Times,0.0,1936.0,,87,"Comedy,Drama,Romance"
1,tt0031381,Thomas Mitchell|Clark Gable|Vivien Leigh|Barba...,movie,Gone with the Wind,Gone with the Wind,0.0,1939.0,,238,"Drama,Romance,War"
2,tt0032138,Judy Garland|Frank Morgan|Ray Bolger|Bert Lahr,movie,The Wizard of Oz,The Wizard of Oz,0.0,1939.0,,102,"Adventure,Family,Fantasy"
3,tt0032553,Charles Chaplin|Paulette Goddard|Grace Hayle|J...,movie,The Great Dictator,The Great Dictator,0.0,1940.0,,125,"Comedy,Drama,War"
4,tt0033467,Orson Welles|Joseph Cotten|Dorothy Comingore|A...,movie,Citizen Kane,Citizen Kane,0.0,1941.0,,119,"Drama,Mystery"
...,...,...,...,...,...,...,...,...,...,...
995,tt9243946,Aaron Paul|Jonathan Banks|Matt Jones|Charles B...,movie,El Camino: A Breaking Bad Movie,El Camino: A Breaking Bad Movie,0.0,2019.0,,122,"Action,Crime,Drama"
996,tt9376612,Ben Kingsley|Simu Liu|Awkwafina|Tony Chiu-Wai ...,movie,Shang-Chi and the Legend of the Ten Rings,Shang-Chi and the Legend of the Ten Rings,0.0,2021.0,,132,"Action,Adventure,Fantasy"
997,tt9419884,Benedict Cumberbatch|Chiwetel Ejiofor|Benedict...,movie,Doctor Strange in the Multiverse of Madness,Doctor Strange in the Multiverse of Madness,0.0,2022.0,,126,"Action,Adventure,Fantasy"
998,tt9764362,Ralph Fiennes|Nicholas Hoult|Anya Taylor-Joy|H...,movie,The Menu,The Menu,0.0,2022.0,,107,"Horror,Thriller"


In [5]:
from rdflib import Graph
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint queried by this notebook (a repository served on localhost).
WIKIDATA_ENDPOINT = "http://localhost:7200/repositories/wikidata"
sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)

def gen():
    """Run the film query against the module-level ``sparql`` client and
    yield one plain dict per result row.

    Each yielded dict maps a selected variable name (``title``, ``tconst``,
    ``directors``, ``composers``, ``genres``, ``subjects``, ``periods``,
    ``locations``) to the string value of its binding. Multi-valued fields
    are joined with ``|`` by ``GROUP_CONCAT``. Only variables present in a
    result row appear in that row's dict.

    This is a generator, so the query is sent only when the first item is
    requested.
    """
    # ``subjects`` merges two properties (P921 and P180). They are matched
    # with a single OPTIONAL using a property-path alternative. Two separate
    # OPTIONALs that both bind ?subject do not merge: once the first one has
    # bound ?subject, the second can only match labels equal to that value,
    # so P180 labels would be dropped for every film that has a P921 value.
    query = '''
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT
    ?title ?tconst
    (GROUP_CONCAT(DISTINCT ?director; SEPARATOR="|") as ?directors)
    (GROUP_CONCAT(DISTINCT ?composer; SEPARATOR="|") as ?composers)
    (GROUP_CONCAT(DISTINCT ?genre; SEPARATOR="|") as ?genres)
    (GROUP_CONCAT(DISTINCT ?subject; SEPARATOR="|") as ?subjects)
    (GROUP_CONCAT(DISTINCT ?period; SEPARATOR="|") as ?periods)
    (GROUP_CONCAT(DISTINCT ?location; SEPARATOR="|") as ?locations)

    WHERE {
        ?film wdt:P31 wd:Q11424 .
        ?film rdfs:label ?title .
        ?film wdt:P345 ?tconst .
        OPTIONAL { ?film wdt:P57 ?d . ?d rdfs:label ?director . }
        OPTIONAL { ?film wdt:P86 ?c . ?c rdfs:label ?composer . }
        OPTIONAL { ?film wdt:P136 ?g . ?g rdfs:label ?genre . }
        OPTIONAL { ?film wdt:P921|wdt:P180 ?s . ?s rdfs:label ?subject . }
        OPTIONAL { ?film wdt:P2401 ?p . ?p rdfs:label ?period . }
        OPTIONAL { ?film wdt:P840 ?l . ?l rdfs:label ?location . }
    }
    GROUP BY ?title ?tconst
    '''

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    ret = sparql.queryAndConvert()

    # Flatten each binding ({"var": {"value": ...}}) to {"var": value}.
    for row in ret['results']['bindings']:
        yield {k: v['value'] for k, v in row.items()}


# Materialise the rows yielded by gen() into a DataFrame (one row per yielded dict).
wikidata_rows = gen()
df2 = pd.DataFrame.from_records(wikidata_rows)
df2

Unnamed: 0,title,tconst,directors,composers,genres,subjects,periods,locations
0,The Intouchables,tt1675434,Éric Toledano|Olivier Nakache,Ludovico Einaudi,film based on literature|biographical film|com...,,,Paris
1,Copernicus,tt0070278,Czesław Petelski|Ewa Petelska,Jerzy Maksymiuk,historical film,,,
2,Ivan's Childhood,tt0056111,Andrei Tarkovsky,Vyacheslav Ovchinnikov,drama film|war film,World War II,,Soviet Union
3,Andrei Rublev,tt0060107,Andrei Tarkovsky,Vyacheslav Ovchinnikov,drama film|art film|medieval film,,,Moscow
4,The Sacrifice,tt0091670,Andrei Tarkovsky,Johann Sebastian Bach,drama film,World War III|religiosity|sacrifice,,Sweden
...,...,...,...,...,...,...,...,...
105129,The Truth About Husbands,tt0011794,Kenneth Webb,,drama film|silent film,,,
105130,The Postman,tt0299582,Kamel El-Telmissany,,,,,
105131,Princess Leila,tt0273730,Niazi Mostafa,,,,,
105132,Served Like a Girl,tt6175710,,Michael A. Levine,documentary film,,,


In [6]:
# Join the film/performer table with the SPARQL results on the shared tconst id.
# Columns present in both frames (here `genres`) get the default _x / _y suffixes.
df3 = pd.merge(top, df2, on='tconst')
df3

Unnamed: 0,tconst,primaryName,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,title,directors,composers,genres_y,subjects,periods,locations
0,tt0031381,Thomas Mitchell|Clark Gable|Vivien Leigh|Barba...,movie,Gone with the Wind,Gone with the Wind,0.0,1939.0,,238,"Drama,Romance,War",Gone with the Wind,Sam Wood|George Cukor|Victor Fleming,Max Steiner,drama film|romance film|film based on a novel|...,slavery in the United States|revenge,,London|Atlanta
1,tt0032138,Judy Garland|Frank Morgan|Ray Bolger|Bert Lahr,movie,The Wizard of Oz,The Wizard of Oz,0.0,1939.0,,102,"Adventure,Family,Fantasy",The Wizard of Oz,King Vidor|Victor Fleming,Harold Arlen,drama film|fantasy film|film based on a novel|...,,,Kansas|Land of Oz|Emerald City|Munchkin Country
2,tt0032553,Charles Chaplin|Paulette Goddard|Grace Hayle|J...,movie,The Great Dictator,The Great Dictator,0.0,1940.0,,125,"Comedy,Drama,War",The Great Dictator,Charlie Chaplin,Charlie Chaplin|Meredith Willson,drama film|comedy drama|war film|anti-war film...,antisemitism|fascism,,Osterlich|Bacteria|Tomainia
3,tt0043014,William Holden|Gloria Swanson|Erich von Strohe...,movie,Sunset Blvd.,Sunset Blvd.,0.0,1950.0,,110,"Drama,Film-Noir",Sunset Boulevard,Billy Wilder,Franz Waxman,drama film|flashback film|film noir,film industry|loneliness|delusion|ageing|cinem...,,Los Angeles
4,tt0045152,Gene Kelly|Donald O'Connor|Debbie Reynolds|Jea...,movie,Singin' in the Rain,Singin' in the Rain,0.0,1952.0,,103,"Comedy,Musical,Romance",Singin' in the Rain,Gene Kelly|Stanley Donen,Nacio Herb Brown,romantic comedy|musical film,,,Los Angeles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,tt8936646,Chris Hemsworth|Bryon Lerum|Ryder Lerum|Rudhra...,movie,Extraction,Extraction,0.0,2020.0,,116,"Action,Thriller",Extraction,Sam Hargrave,Henry Jackman,action film,,,India|Bangladesh|Mumbai|Dhaka|Kimberley
553,tt9032400,Angelina Jolie|Gemma Chan|Richard Madden|Salma...,movie,Eternals,Eternals,0.0,2021.0,,156,"Action,Adventure,Fantasy",Eternals,Chloé Zhao,Ramin Djawadi,drama film|action film|speculative/fantastic f...,,,Australia|London|Iraq|Amazon|Chicago|Tenochtit...
554,tt9114286,Lupita Nyong'o|Danai Gurira|Winston Duke|Letit...,movie,Black Panther: Wakanda Forever,Black Panther: Wakanda Forever,0.0,2022.0,,161,"Action,Adventure,Drama",Black Panther: Wakanda Forever,Ryan Coogler,Ludwig Göransson,action film,,,Culiacán
555,tt9243946,Aaron Paul|Jonathan Banks|Matt Jones|Charles B...,movie,El Camino: A Breaking Bad Movie,El Camino: A Breaking Bad Movie,0.0,2019.0,,122,"Action,Crime,Drama",El Camino: A Breaking Bad Movie,Vince Gilligan,Dave Porter,drama film|crime film,,,New Mexico


In [7]:
import json 

def _split_values(raw, sep='|'):
    """Split a delimiter-joined cell into the set of its non-empty parts.

    A non-string cell (e.g. NaN for a missing value) yields an empty set, so
    callers never need to special-case missing data.
    """
    if not isinstance(raw, str):
        return set()
    return {part for part in raw.split(sep) if part}


def _sorted_or_none(values):
    """Return ``values`` sorted case-insensitively, or None when empty."""
    return sorted(values, key=str.casefold) or None


# Maps "Title (year)" -> film record. If two rows produce the same display
# key, the later row replaces the earlier one.
film_data = {}

for row in df3.itertuples():
    title = row.title
    year = int(row.startYear)
    display = f'{title} ({year})'

    # Genres come from two columns: `genres_y` is "|"-separated and has a
    # " film" suffix stripped; `genres_x` is ","-separated. Both are
    # lower-cased so the two vocabularies merge into one set.
    genres_from_y = {g.replace(' film', '').lower() for g in _split_values(row.genres_y)}
    genres_from_x = {g.lower() for g in _split_values(row.genres_x, sep=',')}

    # Every multi-valued field goes through the same split/filter/sort path so
    # an empty cell consistently becomes None rather than [''].
    # The original `if len(film) > 2` guard was always true (the record always
    # has nine keys), so every row is stored unconditionally.
    film_data[display] = {
        'title': title,
        'imdb': row.tconst,
        'decade': f'{year // 10 * 10}s',
        # Actors keep the plain (case-sensitive) sort order.
        'actors': sorted(_split_values(row.primaryName)),
        'directors': _sorted_or_none(_split_values(row.directors)),
        'composers': _sorted_or_none(_split_values(row.composers)),
        'settings': _sorted_or_none(
            _split_values(row.periods) | _split_values(row.locations)
        ),
        'genres': _sorted_or_none(genres_from_y | genres_from_x),
        'subjects': _sorted_or_none(_split_values(row.subjects)),
    }

with open('films.json', 'w') as f:
    json.dump(film_data, f, indent=2)