In [None]:
from pathlib import Path

import polars as pl
import polars.selectors as cs

## Read CSV

In [None]:
# Location of the dataset: a CSV expected in the current working directory.
csv_file_name = "imdb_top_1000.csv"
movie_metadata_path = Path.cwd() / csv_file_name

# `Released_Year` is read as a string on purpose: at least one value in that column
# cannot be inferred as an integer, so the column is cleaned and cast later.
dtype_overrides = {"Released_Year": pl.String}

# mm_raw = raw movie metadata dataframe.
mm_raw = pl.read_csv(source=movie_metadata_path, dtypes=dtype_overrides)

### Adjust Max Allowed String Length in Output

In [None]:
# The movie title is treated as the priority for viewing string output in the dataframe.
# The display limit set by `set_fmt_str_lengths` is a number of characters, so the
# longest title is measured in characters rather than bytes; a byte count would
# overstate the width needed for any title containing multi-byte (non-ASCII) characters.
max_title_len = mm_raw["Series_Title"].str.len_chars().max()
pl.Config.set_fmt_str_lengths(max_title_len)

### Preview Raw Dataset

In [None]:
# Show the first five rows of the untouched dataset (5 is the default for `head`).
mm_raw.head(n=5)

### Drop the `Poster_Link` and `Overview` Columns

In [None]:
# Columns that are not carried forward into the working dataframe.
unused_cols = ("Poster_Link", "Overview")

# mm = working movie metadata dataframe: every raw column except the unused ones.
mm = mm_raw.select(pl.col("*").exclude(*unused_cols))

### Number of Columns Per Data Type

In [None]:
# One entry per column holding that column's data type; counting the entries gives the
# number of columns per data type, most frequent first.
column_dtypes = pl.Series(mm.dtypes)
column_dtypes.value_counts(sort=True)

### Column Names Per Data Type
#### Integer Columns

In [None]:
# Keep only the columns matched by the integer selector, then list their names.
integer_cols = mm.select(cs.integer())
integer_cols.columns

#### String Columns

In [None]:
# Keep only the columns matched by the string selector, then list their names.
string_cols = mm.select(cs.string())
string_cols.columns

#### Float Columns

In [None]:
# Keep only the columns matched by the float selector, then list their names.
float_cols = mm.select(cs.float())
float_cols.columns

## Transformations
### Column Reordering

In [None]:
# A more logical ordering, assembled from related groups of columns: what the movie is,
# who made it, how it is classified, and how it performed.
title_cols = ["Series_Title", "Released_Year", "Genre"]
people_cols = ["Director", "Star1", "Star2", "Star3", "Star4"]
classification_cols = ["Certificate", "Runtime"]
performance_cols = ["Gross", "Meta_score", "IMDB_Rating", "No_of_Votes"]

col_order = [
    *title_cols,
    *people_cols,
    *classification_cols,
    *performance_cols,
]

# Selecting from the raw dataframe keeps only the listed columns, in the listed order.
mm = mm_raw.select(col_order)
mm.head()

### Convert `Released_Year` to Integer Column

In [None]:
# There is one value in the `Released_Year` column that is preventing the column from
# being cast as an integer column. The movie is "Apollo 13"; the `Released_Year` value
# is "PG" and should be changed to "1995". Then the column can be cast to an integer
# column. `pl.lit()` is needed because polars would otherwise try to look for a column
# named "1995".
#
# The correction is restricted to the "Apollo 13" row on purpose: matching on the "PG"
# value alone would silently assign 1995 to any other movie carrying the same bad value.
# Any such row is instead left untouched so that the integer cast fails loudly on it.
is_apollo_13_bad_year = (pl.col("Series_Title") == "Apollo 13") & (
    pl.col("Released_Year") == "PG"
)

mm = mm.with_columns(
    pl.when(is_apollo_13_bad_year)
    .then(pl.lit("1995"))
    .otherwise(pl.col("Released_Year"))
    .cast(pl.Int16)
    .alias("Released_Year")
)
mm.head()

In [None]:
# Sanity check: pull the "Apollo 13" row(s) and display the `Released_Year` value, which
# held "PG" before the correction.
apollo_13_rows = mm.filter(pl.col("Series_Title") == "Apollo 13")
apollo_13_rows["Released_Year"]

### Convert `Genre` to a List Column

In [None]:
# Genres are stored as one comma-and-space separated string per movie; splitting on
# that separator turns each value into a list of individual genres.
genre_as_list = pl.col("Genre").str.split(", ")
mm = mm.with_columns(genre_as_list)
mm["Genre"].head()

### Convert `Runtime` to an Integer Column

In [None]:
# Remove the literal " min" suffix and cast what remains to an integer.
# `strip_suffix` removes exactly that suffix, whereas `strip_chars_end` treats its
# argument as a set of characters (" ", "m", "i", "n") and strips any trailing run of
# them, which only happens to give the same result when the rest of the value is digits.
mm = mm.with_columns(
    pl.col("Runtime").str.strip_suffix(" min").cast(pl.Int16)
)
mm["Runtime"].head()

### Convert `Gross` to an Integer Column

In [None]:
# Strip the comma separators from the gross figures (literal match, not a regex) so the
# remaining text can be cast to an integer.
gross_without_commas = pl.col("Gross").str.replace_all(
    pattern=",", value="", literal=True
)
mm = mm.with_columns(gross_without_commas.cast(pl.Int32))
mm["Gross"].head()