In [1]:
from pathlib import Path
import polars as pl
from tabulate import tabulate
from IPython.display import display, HTML

def show(df: pl.DataFrame):
    return display(HTML(tabulate(df.to_dicts(), headers="keys", tablefmt="html")))

pl.Config.set_tbl_rows(-1)

dest_dir = Path.home() / "data" / "imdb"

csv_options = {
    'separator': '\t',
    'encoding': 'utf8',
    'ignore_errors': True,
    'infer_schema_length': 10000,  # Adjust as needed
    'quote_char': None  # Disable quoting to handle unescaped quotes
}

ratings = pl.scan_csv(
    dest_dir / "title.ratings.tsv",
    **csv_options
)

details = pl.scan_csv(
    dest_dir / "title.akas.tsv",
    **csv_options
).select(['titleId', 'title', 'region', 'language'])

basics = pl.scan_csv(
    dest_dir / "title.basics.tsv",
    **csv_options
).select(['tconst', 'startYear', 'genres', 'primaryTitle', 'titleType'])

# Perform lazy joins
joined = details.join(
    ratings,
    left_on='titleId',
    right_on='tconst',
    how='inner'  # Only keep matching rows
).join(
    basics,
    left_on='titleId',
    right_on='tconst',
    how='inner'
)

# Apply filters using lazy evaluation
good = joined.filter(
    (pl.col('averageRating') >= 7.2) &
    (pl.col('numVotes') > 50_000) &
    (pl.col('startYear').str.strip_chars().cast(pl.Int64, strict=False) > 2010) &
    (pl.col("titleType").is_in(["movie", "tvMovie"])) &
    (pl.col('region') == 'RU') &
    (pl.col('genres').str.contains('Comedy'))
).unique(subset=['titleId'])


result = good.select([
    'title',
    'primaryTitle',
    'startYear',
    'genres',
    'averageRating',
    'numVotes'
]).sort('averageRating', descending=True)

final_df = result.collect()
show(final_df)

title,primaryTitle,startYear,genres,averageRating,numVotes
1+1,The Intouchables,2011,"Comedy,Drama",8.5,950505
Джанго освобожденный,Django Unchained,2012,"Comedy,Drama,Western",8.5,1737319
Бедное сердце,Dil Bechara,2020,"Comedy,Drama,Romance",8.3,135545
Налегке,Chhichhore,2019,"Comedy,Drama,Romance",8.3,65859
Банды Вассейпура,Gangs of Wasseypur,2012,"Action,Comedy,Crime",8.2,105876
Жизнь не может быть скучной,Zindagi Na Milegi Dobara,2011,"Comedy,Drama,Musical",8.2,88164
Волк с Уолл-стрит,The Wolf of Wall Street,2013,"Biography,Comedy,Crime",8.2,1630119
Зеленая книга,Green Book,2018,"Biography,Comedy,Drama",8.2,601395
Клаус,Klaus,2019,"Adventure,Animation,Comedy",8.2,196227
Брат Баджранги,Bajrangi Bhaijaan,2015,"Action,Adventure,Comedy",8.1,100860
