In [1]:
from pathlib import Path
import polars as pl

# Print every row of a DataFrame (a negative value lifts the row limit).
pl.Config.set_tbl_rows(-1)

# Directory under the user's home that holds the IMDb TSV files read below.
dest_dir = Path.home().joinpath("data", "imdb")

# Keyword arguments shared by every pl.scan_csv call in this script.
csv_options = dict(
    separator='\t',  # fields are tab-separated
    encoding='utf8',
    ignore_errors=True,  # keep reading when a value fails to parse
    infer_schema_length=10000,  # rows sampled for dtype inference; adjust as needed
    quote_char=None,  # disable quoting to handle unescaped quotes
)

# Lazily scan the three TSV files; nothing is read until the query is collected.
ratings = pl.scan_csv(dest_dir / "title.ratings.tsv", **csv_options)

# Alternative-title rows, restricted to the columns listed here.
akas_columns = ['titleId', 'title', 'region', 'language']
details = pl.scan_csv(dest_dir / "title.akas.tsv", **csv_options).select(akas_columns)

# Per-title attributes, restricted to the columns listed here.
basics_columns = ['tconst', 'startYear', 'genres', 'primaryTitle', 'titleType']
basics = pl.scan_csv(dest_dir / "title.basics.tsv", **csv_options).select(basics_columns)

# Lazy inner joins: details -> ratings -> basics. Both joins match the left
# frame's 'titleId' against the right frame's 'tconst' and keep only rows
# present on both sides.
join_keys = dict(left_on='titleId', right_on='tconst', how='inner')
joined = details.join(ratings, **join_keys).join(basics, **join_keys)

# Parse startYear as an integer. The column is cast to string first so the
# expression works whether schema inference typed it as text or as a number;
# values that cannot be parsed become null and therefore fail the filter.
start_year = (
    pl.col('startYear')
    .cast(pl.Utf8)
    .str.strip_chars()
    .cast(pl.Int64, strict=False)
)

# Apply filters using lazy evaluation, then keep exactly one row per title.
good = (
    joined.filter(
        (pl.col('averageRating') >= 7.2) &
        (pl.col('numVotes') > 100_000) &
        (start_year > 2015) &
        (pl.col('titleType') == 'movie') &
        (pl.col('region') == 'BR') &
        # Plain substring match; no regex interpretation is needed here.
        (pl.col('genres').str.contains('Horror', literal=True))
    )
    # unique() defaults to keep='any', which gives no guarantee about which
    # of several rows sharing a titleId survives. Sorting first and keeping
    # the first row in order makes the retained 'title' reproducible.
    .sort(['titleId', 'title'])
    .unique(subset=['titleId'], keep='first', maintain_order=True)
)


# Columns shown in the final table, in display order.
report_columns = [
    'title',
    'primaryTitle',
    'startYear',
    'genres',
    'averageRating',
    'numVotes',
]

# Highest rating first. Sorting on averageRating alone leaves rows with equal
# ratings in arbitrary order, so numVotes (descending) and primaryTitle
# (ascending) break ties and keep the printed table reproducible.
result = good.select(report_columns).sort(
    ['averageRating', 'numVotes', 'primaryTitle'],
    descending=[True, True, False],
)

# Execute the lazy query and print the resulting DataFrame.
final_df = result.collect()
print(final_df)

shape: (13, 6)
┌───────────────────┬───────────────────┬───────────┬───────────────────┬───────────────┬──────────┐
│ title             ┆ primaryTitle      ┆ startYear ┆ genres            ┆ averageRating ┆ numVotes │
│ ---               ┆ ---               ┆ ---       ┆ ---               ┆ ---           ┆ ---      │
│ str               ┆ str               ┆ str       ┆ str               ┆ f64           ┆ i64      │
╞═══════════════════╪═══════════════════╪═══════════╪═══════════════════╪═══════════════╪══════════╡
│ Corra!            ┆ Get Out           ┆ 2017      ┆ Horror,Mystery,Th ┆ 7.8           ┆ 732105   │
│                   ┆                   ┆           ┆ riller            ┆               ┆          │
│ Invasão Zumbi     ┆ Train to Busan    ┆ 2016      ┆ Action,Horror,Thr ┆ 7.6           ┆ 270688   │
│                   ┆                   ┆           ┆ iller             ┆               ┆          │
│ Um Lugar          ┆ A Quiet Place     ┆ 2018      ┆ Drama,Horror,Sci- ┆ 7.