In [1]:
import pandas as pd

# Read the title table; the strings "\N" and "nan" are parsed as missing values.
basics = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    na_values=["\\N", "nan"],
    low_memory=False,
)
# Keep only the rows whose titleType is exactly "movie".
basics = basics.loc[basics["titleType"].eq("movie")]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0.0,1905.0,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907.0,,90,Drama
...,...,...,...,...,...,...,...,...,...
9569501,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0.0,2015.0,,57,Documentary
9569528,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0.0,2007.0,,100,Documentary
9569540,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0.0,2013.0,,,Comedy
9569550,tt9916730,movie,6 Gunn,6 Gunn,0.0,2017.0,,116,


In [2]:
# Read the ratings table (one averageRating / numVotes row per tconst);
# the strings "\N" and "nan" are parsed as missing values.
ratings = pd.read_csv(
    'title.ratings.tsv',
    sep='\t',
    na_values=["\\N", "nan"],
    low_memory=False,
)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1947
1,tt0000002,5.8,263
2,tt0000003,6.5,1773
3,tt0000004,5.6,179
4,tt0000005,6.2,2580
...,...,...,...
1273679,tt9916730,8.1,9
1273680,tt9916766,7.0,21
1273681,tt9916778,7.2,36
1273682,tt9916840,8.8,6


In [3]:
# Load the principals (credits) table and the person-name table with the same
# parsing options: tab-separated, "\N" and "nan" treated as missing values.
cast, names = (
    pd.read_csv(tsv_path, sep='\t', low_memory=False, na_values=["\\N", "nan"])
    for tsv_path in ('title.principals.tsv', 'name.basics.tsv')
)

In [4]:
# Pair every movie with its ratings row (default inner join on tconst) and
# order the result by vote count, most-voted first.
df = basics.merge(ratings, on='tconst').sort_values('numVotes', ascending=False)

# For the 2000 most-voted movies: attach actor/actress credits, resolve each
# credit to a person name, collapse the names into one '|'-separated string per
# title, then re-attach the movie columns from `basics`.
top = (
    df.head(2000)
    .merge(cast[cast['category'].isin(['actor', 'actress'])], on='tconst')
    .merge(names, on='nconst')
    .groupby(['tconst'])
    .agg({'primaryName': '|'.join})
    .merge(basics, on='tconst')
)

top

Unnamed: 0,tconst,primaryName,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0012349,Charles Chaplin|Henry Bergman|B.F. Blinn|Edna ...,movie,The Kid,The Kid,0.0,1921.0,,68,"Comedy,Drama,Family"
1,tt0015864,Charles Chaplin|Henry Bergman|Frank Aderias|Ma...,movie,The Gold Rush,The Gold Rush,0.0,1925.0,,95,"Adventure,Comedy,Drama"
2,tt0017136,Brigitte Helm|Alfred Abel|Gustav Fröhlich|Rudo...,movie,Metropolis,Metropolis,0.0,1927.0,,153,"Drama,Sci-Fi"
3,tt0021749,Charles Chaplin|Virginia Cherrill|Florence Lee...,movie,City Lights,City Lights,0.0,1931.0,,87,"Comedy,Drama,Romance"
4,tt0022100,Peter Lorre|Ellen Widmann|Inge Landgut|Otto We...,movie,M,M - Eine Stadt sucht einen Mörder,0.0,1931.0,,117,"Crime,Mystery,Thriller"
...,...,...,...,...,...,...,...,...,...,...
1993,tt9639470,Anya Taylor-Joy|Thomasin McKenzie|Matt Smith|D...,movie,Last Night in Soho,Last Night in Soho,0.0,2021.0,,116,"Drama,Horror,Mystery"
1994,tt9764362,Ralph Fiennes|Nicholas Hoult|Anya Taylor-Joy|H...,movie,The Menu,The Menu,0.0,2022.0,,107,"Comedy,Horror,Thriller"
1995,tt9770150,Frances McDormand|David Strathairn|Linda May|G...,movie,Nomadland,Nomadland,0.0,2020.0,,107,Drama
1996,tt9777666,Chris Pratt|J.K. Simmons|Yvonne Strahovski|Bet...,movie,The Tomorrow War,The Tomorrow War,0.0,2021.0,,138,"Action,Adventure,Drama"


In [5]:
from rdflib import Graph

# Create an empty RDF graph and fill it from the N-Triples dump.
g = Graph()
g.parse(source='films2.nt', format='nt')

# Show how many triples were loaded.
len(g)

1061553

In [6]:
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm.notebook import tqdm

def _film_query(imdb_id):
    """Return the SPARQL text that looks up one film in the local graph by IMDb id.

    The query selects the film's label plus '|'-joined labels of its directors,
    genres, main subjects and cast members, grouped by title.
    """
    # Each OPTIONAL uses its own intermediate variable. Sharing one variable
    # between the genre and cast clauses would force the cast member to be the
    # same node as the genre, so the cast clause could never match once a genre
    # had been bound.
    # rdfs: is declared explicitly so the query does not depend on the graph's
    # default namespace bindings.
    return f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT DISTINCT
    ?title
    (GROUP_CONCAT(?director; SEPARATOR="|") as ?directors)
    (GROUP_CONCAT(?genre; SEPARATOR="|") as ?genres)
    (GROUP_CONCAT(?subject; SEPARATOR="|") as ?subjects)
    (GROUP_CONCAT(?actor; SEPARATOR="|") as ?actors)

    WHERE {{
        ?film rdfs:label ?title .
        ?film wdt:P345 "{imdb_id}" .
        OPTIONAL{{ ?film wdt:P57 ?d . ?d rdfs:label ?director . }}
        OPTIONAL{{ ?film wdt:P136 ?g . ?g rdfs:label ?genre . }}
        OPTIONAL{{ ?film wdt:P921 ?s . ?s rdfs:label ?subject . }}
        OPTIONAL{{ ?film wdt:P161 ?a . ?a rdfs:label ?actor . }}
    }}
    GROUP BY ?title
    """


def _unique_sorted(joined):
    """Split a '|'-joined string and return its distinct parts, sorted case-insensitively."""
    return sorted(set(joined.split('|')), key=str.casefold)


def _merged_genres(graph_genres, imdb_genres):
    """Combine genre labels from the graph with the IMDb genre list.

    graph_genres: '|'-joined labels from the query; ' film' is stripped from
        each label and the result is lower-cased.
    imdb_genres: comma-separated IMDb genres, or a non-string value (e.g. NaN
        for a missing genres field), which is ignored instead of raising.
    Returns the distinct labels sorted case-insensitively.
    """
    labels = {label.replace(' film', '').lower() for label in graph_genres.split('|')}
    if isinstance(imdb_genres, str):
        labels |= {label.lower() for label in imdb_genres.split(',')}
    return sorted(labels, key=str.casefold)


film_data = {}

# NOTE(review): this endpoint object is not used anywhere in this cell; every
# query below runs against the local rdflib graph `g`. Kept in case a later
# cell relies on it -- confirm and remove if not.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Walk the rows once instead of re-scanning `top` with a boolean mask for every
# field of every film.
film_rows = top[['tconst', 'startYear', 'primaryName', 'genres']].itertuples(index=False, name=None)

for imdb_id, start_year, cast_names, imdb_genres in tqdm(film_rows, total=len(top)):
    # The display key and the decade both need a year; skip films without one
    # rather than crashing on int(NaN).
    if pd.isna(start_year):
        continue
    year = int(start_year)
    decade = f'{year // 10 * 10}s'

    for row in g.query(_film_query(imdb_id)):
        film = {
            'title': row.title,
            'imdb': imdb_id,
            'decade': decade,
            'actors': sorted(set(cast_names.split('|'))),
        }

        # An empty GROUP_CONCAT result is an empty string, so these checks drop
        # facets for which the graph has no labels.
        if row.directors:
            film['directors'] = _unique_sorted(row.directors)
        if row.genres:
            film['genres'] = _merged_genres(row.genres, imdb_genres)
        if row.subjects:
            film['subjects'] = _unique_sorted(row.subjects)

        # The original guarded this with `len(film) > 2`, which was always true
        # because the four keys above are always present; every result row is stored.
        # The key is built inline rather than assigned to a variable named
        # `display`, which would overwrite IPython's display() helper.
        film_data[f'{row.title} ({year})'] = film

with open('films.json', 'w') as f:
    json.dump(film_data, f, indent=2)

  0%|          | 0/1998 [00:00<?, ?it/s]