In [1]:
import pandas as pd

basics = pd.read_csv('title.basics.tsv', sep='\t', low_memory=False, na_values=["\\N","nan"])
basics = basics[basics["titleType"] == "movie"]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0.0,1905.0,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907.0,,90,Drama
...,...,...,...,...,...,...,...,...,...
9569501,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0.0,2015.0,,57,Documentary
9569528,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0.0,2007.0,,100,Documentary
9569540,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0.0,2013.0,,,Comedy
9569550,tt9916730,movie,6 Gunn,6 Gunn,0.0,2017.0,,116,


In [2]:
ratings = pd.read_csv('title.ratings.tsv', sep='\t', low_memory=False, na_values=["\\N","nan"])
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1947
1,tt0000002,5.8,263
2,tt0000003,6.5,1773
3,tt0000004,5.6,179
4,tt0000005,6.2,2580
...,...,...,...
1273679,tt9916730,8.1,9
1273680,tt9916766,7.0,21
1273681,tt9916778,7.2,36
1273682,tt9916840,8.8,6


In [3]:
cast = pd.read_csv('title.principals.tsv', sep='\t', low_memory=False, na_values=["\\N","nan"])
names = pd.read_csv('name.basics.tsv', sep='\t', low_memory=False, na_values=["\\N","nan"])

In [4]:
df = pd.merge(basics, ratings, on='tconst')
df = df.sort_values(by='numVotes', ascending=False)

top = df.loc[df['primaryTitle'] == df['originalTitle']].head(1000).merge(
    cast.loc[cast['category'].isin(['actor','actress'])], on='tconst'
).merge(names, on='nconst').groupby(['tconst']).agg({
    'primaryName': '|'.join
}).merge(basics, on='tconst')

top

Unnamed: 0,tconst,primaryName,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0027977,Al Ernest Garcia|Charles Chaplin|Paulette Godd...,movie,Modern Times,Modern Times,0.0,1936.0,,87,"Comedy,Drama,Romance"
1,tt0031381,Thomas Mitchell|Clark Gable|Vivien Leigh|Barba...,movie,Gone with the Wind,Gone with the Wind,0.0,1939.0,,238,"Drama,Romance,War"
2,tt0032138,Judy Garland|Frank Morgan|Ray Bolger|Bert Lahr,movie,The Wizard of Oz,The Wizard of Oz,0.0,1939.0,,102,"Adventure,Family,Fantasy"
3,tt0032553,Charles Chaplin|Paulette Goddard|Grace Hayle|J...,movie,The Great Dictator,The Great Dictator,0.0,1940.0,,125,"Comedy,Drama,War"
4,tt0033467,Orson Welles|Joseph Cotten|Dorothy Comingore|A...,movie,Citizen Kane,Citizen Kane,0.0,1941.0,,119,"Drama,Mystery"
...,...,...,...,...,...,...,...,...,...,...
995,tt9032400,Angelina Jolie|Gemma Chan|Richard Madden|Salma...,movie,Eternals,Eternals,0.0,2021.0,,156,"Action,Adventure,Fantasy"
996,tt9243946,Aaron Paul|Jonathan Banks|Matt Jones|Charles B...,movie,El Camino: A Breaking Bad Movie,El Camino: A Breaking Bad Movie,0.0,2019.0,,122,"Action,Crime,Drama"
997,tt9376612,Ben Kingsley|Simu Liu|Awkwafina|Tony Chiu-Wai ...,movie,Shang-Chi and the Legend of the Ten Rings,Shang-Chi and the Legend of the Ten Rings,0.0,2021.0,,132,"Action,Adventure,Fantasy"
998,tt9419884,Benedict Cumberbatch|Chiwetel Ejiofor|Benedict...,movie,Doctor Strange in the Multiverse of Madness,Doctor Strange in the Multiverse of Madness,0.0,2022.0,,126,"Action,Adventure,Fantasy"


In [5]:
from rdflib import Graph
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://localhost:7200/repositories/wikidata")

def gen():
    QUERY = '''
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT
    ?title ?tconst
    (GROUP_CONCAT(DISTINCT ?director; SEPARATOR="|") as ?directors)
    (GROUP_CONCAT(DISTINCT ?genre; SEPARATOR="|") as ?genres)
    (GROUP_CONCAT(DISTINCT ?subject; SEPARATOR="|") as ?subjects)
    (GROUP_CONCAT(DISTINCT ?actor; SEPARATOR="|") as ?actors)

    WHERE {
        ?film wdt:P31 wd:Q11424 .
        ?film rdfs:label ?title .
        ?film wdt:P345 ?tconst .
        OPTIONAL { ?film wdt:P57 ?d . ?d rdfs:label ?director . }
        OPTIONAL { ?film wdt:P136 ?g . ?g rdfs:label ?genre . }
        OPTIONAL { ?film wdt:P921 ?s . ?s rdfs:label ?subject . }
        OPTIONAL { ?film wdt:P161 ?g . ?g rdfs:label ?actor . }
    }
    GROUP BY ?title ?tconst
    '''

    sparql.setQuery(QUERY)
    sparql.setReturnFormat(JSON)
    ret = sparql.queryAndConvert()

    for row in ret['results']['bindings']:
        yield {k: v['value'] for k, v in row.items()}


df2 = pd.DataFrame.from_records(gen())
df2

Unnamed: 0,title,tconst,directors,genres,subjects,actors
0,The Intouchables,tt1675434,Éric Toledano|Olivier Nakache,film based on literature|biographical film|com...,,
1,12 Angry Men,tt0050083,Sidney Lumet,drama film|Huis clos|trial film,jury trial|distinction|capital punishment,Jiří Voskovec|Robert Webber|Jack Warden|John F...
2,Nalai Manithan,tt4168940,Velu Prabhakaran,romance film|horror film|action film,,
3,Labyrinth,tt0091369,Jim Henson,fantasy film|coming-of-age fiction|adventure f...,,
4,Pirates II: Stagnetti's Revenge,tt1266097,Joone,pirate film|pornographic film,sea piracy,
...,...,...,...,...,...,...
17791,Sawtooth,tt0362118,,drama film|thriller film,,
17792,Take Two,tt0096215,Peter Rowe,,,
17793,Richest Man in Town,tt0034099,Charles Barton,drama film,,
17794,Emoticon ;),tt2147275,Livia De Paolis,,,


In [6]:
df3 = top.merge(df2, on='tconst')
df3

Unnamed: 0,tconst,primaryName,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,title,directors,genres_y,subjects,actors
0,tt0032553,Charles Chaplin|Paulette Goddard|Grace Hayle|J...,movie,The Great Dictator,The Great Dictator,0.0,1940.0,,125,"Comedy,Drama,War",The Great Dictator,Charlie Chaplin,drama film|comedy drama|war film|anti-war film...,antisemitism|fascism,Leo White|Paulette Goddard|Charlie Chaplin|Sig...
1,tt0050083,Henry Fonda|Lee J. Cobb|Martin Balsam|John Fie...,movie,12 Angry Men,12 Angry Men,0.0,1957.0,,96,"Crime,Drama",12 Angry Men,Sidney Lumet,drama film|Huis clos|trial film,jury trial|distinction|capital punishment,Jiří Voskovec|Robert Webber|Jack Warden|John F...
2,tt0052618,Jack Hawkins|Charlton Heston|Stephen Boyd|Haya...,movie,Ben-Hur,Ben-Hur,0.0,1959.0,,212,"Adventure,Drama",Ben-Hur,William Wyler,drama film|action film|film based on a novel|C...,,
3,tt0053125,Cary Grant|Eva Marie Saint|James Mason|Jessie ...,movie,North by Northwest,North by Northwest,0.0,1959.0,,136,"Action,Adventure,Mystery",North by Northwest,Alfred Hitchcock,crime film|thriller film|mystery film|spy film,espionage|kidnapping|mistaken identity,Edward Binns|Alfred Hitchcock|Lawrence Dobkin|...
4,tt0057115,Richard Attenborough|James Garner|Steve McQuee...,movie,The Great Escape,The Great Escape,0.0,1963.0,,172,"Adventure,Drama,History",The Great Escape,John Sturges,drama film|war film|film based on books|prison...,World War II|aviation|escape,Charles Bronson|Nigel Stock|Robert Freitag|Han...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,tt5013056,Tom Hardy|Fionn Whitehead|Barry Keoghan|Mark R...,movie,Dunkirk,Dunkirk,0.0,2017.0,,106,"Action,Drama,History",Dunkirk,Christopher Nolan,drama film|war film,Dunkirk evacuation,
106,tt5027774,Woody Harrelson|Frances McDormand|Sam Rockwell...,movie,"Three Billboards Outside Ebbing, Missouri","Three Billboards Outside Ebbing, Missouri",0.0,2017.0,,115,"Comedy,Crime,Drama","Three Billboards Outside Ebbing, Missouri",Martin McDonagh,drama film|crime film|mystery film|black comed...,law enforcement|violence against women,Sam Rockwell|Frances McDormand|Woody Harrelson...
107,tt5362988,Jeremy Renner|Elizabeth Olsen|Graham Greene|Ke...,movie,Wind River,Wind River,0.0,2017.0,,107,"Crime,Drama,Mystery",Wind River,Taylor Sheridan,crime film|Western film|thriller film|mystery ...,,
108,tt5442430,Ryan Reynolds|Jake Gyllenhaal|Rebecca Ferguson...,movie,Life,Life,0.0,2017.0,,104,"Horror,Sci-Fi,Thriller",Life,Daniel Espinosa,drama film|horror film|action film|science fic...,,


In [7]:
import json 

film_data = {}

for row in df3.itertuples():
    title = row.title
    year = int(row.startYear)
    display = f'{title} ({year})'
    film = {
        'title': title,
        'imdb': row.tconst,
        'decade': str(int(year / 10) * 10) + 's',
        'actors': sorted(set(
            row.primaryName.split('|')
        )),
    }

    film['directors'] = sorted(set(row.directors.split('|')), key=str.casefold)
    film['genres'] = sorted(
        set(g.replace(' film', '').lower() for g in row.genres_y.split('|'))
        |
        set(g.lower() for g in row.genres_x.split(',')),
        key=str.casefold
    )
    film['subjects'] = sorted(set(row.subjects.split('|')), key=str.casefold)

    if len(film) > 2:
        film_data[display] = film

with open('films.json', 'w') as f:
    json.dump(film_data, f, indent=2)