# Find well-rated movies

The idea is simple: find well-rated movies that have received enough votes and have already been released (the filter below keeps only titles from after 2015).

I am also looking for movies dubbed in Portuguese, so I use the BR region to pick up the translated titles.

In [1]:
import csv
import gzip
import shutil
from pathlib import Path

import pandas as pd
import requests

# Let result tables show up to 500 rows before pandas truncates them.
pd.options.display.max_rows = 500

In [2]:
# File names of the IMDb dataset dumps to fetch.
links = [
    "name.basics.tsv.gz",
    "title.akas.tsv.gz",
    "title.basics.tsv.gz",
    "title.crew.tsv.gz",
    "title.episode.tsv.gz",
    "title.principals.tsv.gz",
    "title.ratings.tsv.gz",
]

# Full download URL for each dump, in the same order as `links`.
to_download = ["https://datasets.imdbws.com/" + dump_name for dump_name in links]

# Local directory where the dumps are stored.
dest_dir = Path.home().joinpath("data", "imdb")

In [3]:
# Download every dump that is not already on disk.
#
# The dumps can be large, so each response is streamed to disk in chunks
# instead of being buffered in memory. Data is written to a temporary
# "<name>.part" file and renamed only once the download has finished, so an
# interrupted run never leaves a truncated file that the `exists()` check
# below would mistake for a complete download.
dest_dir.mkdir(exist_ok=True, parents=True)

for fname, url in zip(links, to_download):
    dest = dest_dir / fname
    if dest.exists():
        continue

    partial = dest.with_name(dest.name + ".part")

    # timeout=(connect, read) keeps a stalled connection from hanging the cell.
    with requests.get(url, allow_redirects=True, stream=True, timeout=(10, 60)) as response:
        if not response.ok:
            raise ValueError(f"bad status code: {response.status_code}")

        with partial.open(mode='wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    # Only a fully written file gets the final name.
    partial.replace(dest)

In [4]:
# Decompress every downloaded "*.tsv.gz" dump next to its archive.
#
# Output is written to a temporary "<name>.part" file and renamed once the copy
# has finished, so an interrupted run cannot leave a truncated .tsv that the
# `exists()` check would treat as already decompressed.
for filename in dest_dir.glob("*.tsv.gz"):
    # `.stem` drops only the final suffix: "title.akas.tsv.gz" -> "title.akas.tsv"
    dest_file = dest_dir / filename.stem
    if dest_file.exists():
        continue

    partial = dest_file.with_name(dest_file.name + ".part")
    with gzip.open(filename, 'rb') as f_in, partial.open(mode='wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    partial.replace(dest_file)

In [5]:
# Load the three tables used by the analysis.
#
# The IMDb dumps are plain tab-separated text with no quoting convention, so a
# literal '"' inside a title must be treated as an ordinary character.
# With pandas' default quote handling, a field that starts with '"' is read as
# a quoted field and can swallow the following tabs and newlines, merging rows.
ratings = pd.read_csv(
    dest_dir / "title.ratings.tsv",
    sep='\t',
    quoting=csv.QUOTE_NONE,
)
details = pd.read_csv(
    dest_dir / "title.akas.tsv",
    sep='\t',
    quoting=csv.QUOTE_NONE,
    usecols=['titleId', 'title', 'region', 'language'],
)
basics = pd.read_csv(
    dest_dir / "title.basics.tsv",
    sep='\t',
    quoting=csv.QUOTE_NONE,
    usecols=['tconst', 'startYear', 'genres', 'primaryTitle', 'titleType'],
    # Pin startYear to strings so the column has one consistent dtype instead
    # of depending on chunk-wise type inference; missing years stay as the
    # literal marker '\N' found in the dump.
    dtype={'startYear': str},
)

In [6]:
# Attach ratings and basic title info to every localized-title row.
# Both lookups are keyed by the title id (`tconst`), matched against `titleId`.
ratings_by_id = ratings.set_index('tconst')
basics_by_id = basics.set_index('tconst')

joined = (
    details
    .join(ratings_by_id, on='titleId')
    .join(basics_by_id, on='titleId')
)

In [10]:
# Keep well-rated, widely voted movies released after 2015 that have a BR title.
#
# startYear is compared numerically: as text the comparison is lexicographic,
# and the missing-value marker '\N' sorts above '2015', so titles with an
# unknown year would pass the filter. Unparseable years become NaN, which
# compares False and is dropped.
start_year = pd.to_numeric(joined['startYear'], errors='coerce')

is_good = (
    (joined['averageRating'] >= 7.2)
    & (joined['numVotes'] > 100_000)
    & (start_year > 2015)
    & (joined['titleType'] == 'movie')
    & (joined['region'] == 'BR')
)

# A title can have several BR rows (alternative titles); keep the first one.
good = joined[is_good].drop_duplicates(subset=['titleId'])

In [11]:
# Show the selected movies, best rated first.
columns_to_show = ['title', 'primaryTitle', 'startYear', 'genres', 'averageRating', 'numVotes']
good[columns_to_show].sort_values(by='averageRating', ascending=False)

Unnamed: 0,title,primaryTitle,startYear,genres,averageRating,numVotes
35252617,Homem-Aranha: Através do Aranhaverso,Spider-Man: Across the Spider-Verse,2023,"Action,Adventure,Animation",9.0,133894.0
14392279,Jai Bhim,Jai Bhim,2021,"Crime,Drama,Mystery",8.8,207272.0
30976205,Parasita,Parasite,2019,"Drama,Thriller",8.5,859663.0
22629390,Viva: A Vida é uma Festa,Coco,2017,"Adventure,Animation,Comedy",8.4,537634.0
27362811,Vingadores: Ultimato,Avengers: Endgame,2019,"Action,Adventure,Drama",8.4,1185758.0
33027581,Pobre Coração,Dil Bechara,2020,"Comedy,Drama,Romance",8.4,132761.0
31783342,Coringa,Joker,2019,"Crime,Drama,Thriller",8.4,1360942.0
28051948,Homem-Aranha: No Aranhaverso,Spider-Man: Into the Spider-Verse,2018,"Action,Adventure,Animation",8.4,590919.0
27362712,Vingadores: Guerra Infinita,Avengers: Infinity War,2018,"Action,Adventure,Sci-Fi",8.4,1128749.0
29003398,Your Name.,Your Name.,2016,"Animation,Drama,Fantasy",8.4,288675.0
