In [1]:
import polars as pl
from pathlib import Path

In [7]:
# Directory that receives every derived parquet file written by this notebook.
processed_folder = Path(".") / "data" / "processed"
# Later cells write straight into this folder; create it up front so a fresh
# checkout (where it may not exist yet) does not fail at write time.
# This is a no-op when the folder is already there.
processed_folder.mkdir(parents=True, exist_ok=True)

In [8]:
# Load the raw movie metadata. The very large infer_schema_length lets polars
# inspect up to ten million rows before settling on each column's dtype.
meta_df = pl.read_csv(
    "data/movies_metadata.csv",
    infer_schema_length=10_000_000,
)

In [9]:
# Normalise the loosely typed metadata columns.
# Where strict=False is passed, values that fail to parse become null instead
# of raising.
# NOTE(review): this cell rebinds `meta_df`, so re-running it without first
# re-running the read_csv cell will presumably fail (the `.str` expressions
# expect the original text columns) — confirm before re-executing.
meta_df = (
    meta_df.with_columns(
        # Compare the text column against "True" to obtain a boolean column.
        adult=pl.col("adult") == "True",
        # str.to_integer is the current name of the deprecated str.parse_int;
        # base 10, unparsable text becomes null.
        budget=pl.col("budget").str.to_integer(base=10, strict=False),
        id=pl.col("id").str.to_integer(base=10, strict=False),
        # Remove the first "tt" occurrence and parse the remaining digits.
        # Parsing stays strict here (as in the original), so a non-numeric
        # imdb_id raises rather than silently turning into null.
        # The explicit Int64 cast pins the dtype of the key that is later
        # joined against links.csv — presumably imdbId is read as Int64 there;
        # verify.
        imdb_id=pl.col("imdb_id").str.replace("tt", "").str.to_integer(base=10).cast(pl.Int64),
        popularity=pl.col("popularity").cast(pl.Float64, strict=False),
        release_date=pl.col("release_date").str.to_date(format="%Y-%m-%d", strict=False),
        status=pl.col("status").cast(pl.Categorical),
    )
)

In [10]:
# Columns carried over from the cleaned metadata into the movie table,
# in the order they should appear.
MOVIE_COLUMNS = [
    "id",
    "imdb_id",
    "original_title",
    "popularity",
    "release_date",
    "revenue",
    "runtime",
    "vote_average",
    "vote_count",
]

# Keep only the movie-level columns listed above.
movie_df = meta_df.select(MOVIE_COLUMNS)

In [13]:
# Persist the trimmed movie table so later cells can reload it from disk.
movie_parquet_path = processed_folder / "movie.parquet"
movie_df.write_parquet(movie_parquet_path)

In [14]:
# Raw CSV inputs: per-user ratings and the movieId -> external id lookup.
review_df = pl.read_csv("data/ratings.csv")
links_df = pl.read_csv("data/links.csv")

# Reload the movie table from the parquet file written by the previous cell.
movie_df = pl.read_parquet(processed_folder / "movie.parquet")

In [15]:
# Destination of the joined ratings/movie table.
processed_parquet_path = processed_folder / "processed.parquet"

# Step 1: attach the external ids from links.csv to each rating via movieId.
# Inner join: ratings without a matching links row are dropped.
reviews_with_links = review_df.join(links_df, how="inner", on="movieId")

# Step 2: attach the movie columns by matching imdbId (links) to imdb_id
# (movies), again keeping only matching rows, then write the result to disk.
# NOTE(review): if imdb_id is not unique in movie_df, each affected rating is
# repeated once per matching movie row — confirm key uniqueness upstream.
reviews_with_links.join(
    movie_df,
    how="inner",
    left_on="imdbId",
    right_on="imdb_id",
).write_parquet(processed_parquet_path)