# Find well-rated movies

The idea is simple: find well-rated movies that have received enough votes and that were released recently (after 2020).

I am also looking for movies dubbed in Portuguese, so I use the BR region to get the translated titles.

In [1]:
import csv
import gzip
import shutil
from pathlib import Path

import pandas as pd
import requests

# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [2]:
# File names of the IMDb dataset dumps, one entry per file.
links = [
    "name.basics.tsv.gz",
    "title.akas.tsv.gz",
    "title.basics.tsv.gz",
    "title.crew.tsv.gz",
    "title.episode.tsv.gz",
    "title.principals.tsv.gz",
    "title.ratings.tsv.gz",
]

# Download URL for every dump, in the same order as `links`.
to_download = ["https://datasets.imdbws.com/" + name for name in links]

# Local directory (under the user's home) that holds the dumps.
dest_dir = Path.home().joinpath("data", "imdb")

In [3]:
# Download every dump that is not already present in dest_dir.
for fname, url in zip(links, to_download):
    dest = dest_dir / fname
    if dest.exists():
        continue

    dest.parent.mkdir(exist_ok=True, parents=True)

    # Download under a temporary name and rename at the end, so an interrupted
    # transfer never leaves a truncated file that the exists() check above
    # would accept as complete on the next run.
    partial = dest.with_name(dest.name + ".part")

    # stream=True avoids holding a whole dump in memory; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    with requests.get(url, allow_redirects=True, stream=True, timeout=60) as response:
        if not response.ok:
            raise ValueError(f"bad status code: {response.status_code}")

        with partial.open(mode='wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    partial.replace(dest)

In [4]:
# Decompress every downloaded dump that has not been unpacked yet.
for filename in dest_dir.glob("*.tsv.gz"):
    # Path.stem drops only the last suffix: "x.tsv.gz" -> "x.tsv".
    dest_file = dest_dir / filename.stem
    if dest_file.exists():
        continue

    # Decompress under a temporary name and rename at the end, so an
    # interrupted run never leaves a partial .tsv that the exists() check
    # above would skip next time.
    partial = dest_file.with_name(dest_file.name + ".part")

    with gzip.open(filename, 'rb') as f_in, partial.open(mode='wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    partial.replace(dest_file)

In [5]:
# Load the three tables used below from the decompressed dumps.
#
# The files are plain tab-separated text without CSV-style quoting, so quote
# handling is disabled: with the default quoting, a stray '"' inside a title
# is treated as a quote delimiter and can merge or drop rows.
imdb_tsv_options = dict(sep='\t', quoting=csv.QUOTE_NONE)

ratings = pd.read_csv(dest_dir / "title.ratings.tsv", **imdb_tsv_options)
details = pd.read_csv(dest_dir / "title.akas.tsv",
                      usecols=['titleId', 'title', 'region', 'language'],
                      **imdb_tsv_options)
# startYear is pinned to str so chunked type inference cannot produce a column
# that mixes ints and strings. NOTE(review): the dumps appear to use the
# literal text "\N" for missing values, which stays a string here - confirm
# against the IMDb dataset documentation.
basics = pd.read_csv(dest_dir / "title.basics.tsv",
                     usecols=['tconst', 'startYear', 'genres', 'primaryTitle', 'titleType'],
                     dtype={'startYear': str},
                     **imdb_tsv_options)

In [6]:
# Attach ratings and basic title info to every alternative-title row,
# matching the row's titleId against each table's tconst.
ratings_by_id = ratings.set_index('tconst')
basics_by_id = basics.set_index('tconst')

with_ratings = details.join(ratings_by_id, on='titleId')
joined = with_ratings.join(basics_by_id, on='titleId')

In [7]:
# Selection thresholds for what counts as a "good" movie.
MIN_RATING = 7.2
MIN_VOTES = 100_000
RELEASED_AFTER = 2020
REGION = 'BR'

# Compare years numerically. A string comparison against '2020' is
# lexicographic, so non-numeric placeholders such as "\N" sort after '2020'
# and would pass the filter; errors='coerce' turns them into NaN, which
# compares False.
start_year = pd.to_numeric(joined['startYear'], errors='coerce')

is_good = (
    (joined['averageRating'] >= MIN_RATING)
    & (joined['numVotes'] > MIN_VOTES)
    & (start_year > RELEASED_AFTER)
    & (joined['titleType'] == 'movie')
    & (joined['region'] == REGION)
)

# A title can have several rows for the same region; keep the first one.
good = joined[is_good].drop_duplicates(subset=['titleId'])

In [9]:
# Show the columns of interest, highest-rated titles first.
display_columns = ['title', 'primaryTitle', 'startYear', 'genres', 'averageRating', 'numVotes']
good.loc[:, display_columns].sort_values(by='averageRating', ascending=False)

Unnamed: 0,title,primaryTitle,startYear,genres,averageRating,numVotes
35252617,Homem-Aranha: Através do Aranhaverso,Spider-Man: Across the Spider-Verse,2023,"Action,Adventure,Animation",9.0,133894.0
14392279,Jai Bhim,Jai Bhim,2021,"Crime,Drama,Mystery",8.8,207272.0
4043990,Shershaah,Shershaah,2021,"Action,Biography,Drama",8.3,125033.0
17527748,Top Gun: Maverick,Top Gun: Maverick,2022,"Action,Drama",8.3,597429.0
5353379,Homem-Aranha: Sem Volta para Casa,Spider-Man: No Way Home,2021,"Action,Adventure,Fantasy",8.2,803938.0
31017052,Guardiões da Galáxia Vol. 3,Guardians of the Galaxy Vol. 3,2023,"Action,Adventure,Comedy",8.2,173167.0
4204746,No Ritmo do Coração,CODA,2021,"Comedy,Drama,Music",8.0,147113.0
6987287,Duna: Primeira Parte,Dune: Part One,2021,"Action,Adventure,Drama",8.0,683653.0
8607183,Liga da Justiça de Zack Snyder,Zack Snyder's Justice League,2021,"Action,Adventure,Fantasy",8.0,414731.0
27032528,Gato de Botas 2: O Último Pedido,Puss in Boots: The Last Wish,2022,"Adventure,Animation,Comedy",7.9,134901.0
