In [None]:
from pathlib import Path

import polars as pl
import polars.selectors as cs
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

## Extract
1. Read the CSV into a dataframe.
2. Remove columns that will not be used in the analysis: `Poster_Link`, `Overview`, and `Certificate`.
3. Convert all column names to lowercase for uniformity.
4. Adjust the max allowed length of strings in outputs to easily see the full titles of movies.

In [None]:
# The dataset is expected in the current working directory.
movie_metadata_path = Path.cwd() / "imdb_top_1000.csv"

# 1. Read CSV into dataframe. mm_raw = raw movie metadata dataframe. `Released_Year` is
# read as a string because at least one value in that column cannot be parsed as an
# integer. This is investigated further below.
mm_raw = pl.read_csv(source=movie_metadata_path, schema_overrides={"Released_Year": pl.String})

# 2. Drop the `Poster_Link`, `Overview`, and `Certificate` columns.
mm = mm_raw.select(pl.col("*").exclude("Poster_Link", "Overview", "Certificate"))

# 3. Convert column names to lowercase.
mm = mm.rename(str.lower)

# 4. Adjust max allowed string length in output. The movie title is treated as the
# priority for viewing string output in the dataframe. The display limit is a number of
# characters, so the longest title is measured in characters rather than bytes
# (a byte count overshoots for titles containing non-ASCII characters).
max_title_len = mm["series_title"].str.len_chars().max()
pl.Config.set_fmt_str_lengths(max_title_len)

print(f"full dataframe shape: {mm.shape}")
mm.head()

## Explore
### Column Names Per Data Type

In [None]:
# Tally how many columns there are of each data type, most frequent type first.
dtype_series = pl.Series(mm.dtypes)
dtype_series.value_counts(sort=True)

#### Integer Columns

In [None]:
# Names of the columns currently stored with an integer dtype.
integer_cols = mm.select(cs.integer())
integer_cols.columns

#### String Columns

In [None]:
# Names of the columns currently stored with a string dtype.
string_cols = mm.select(cs.string())
string_cols.columns

`released_year`, `runtime`, and `gross` should all be integer columns.

#### Float Columns

In [None]:
# Names of the columns currently stored with a float dtype.
float_cols = mm.select(cs.float())
float_cols.columns

### Find the `released_year` Values That Prevent Casting to Integer Column

In [None]:
# A non-strict cast suppresses the error for values that cannot be converted and leaves
# null in their place, so filtering on null surfaces the problem rows.
year_as_int = pl.col("released_year").cast(pl.Int64, strict=False)
mm.filter(year_as_int.is_null())

The only problem row preventing casting is the movie `"Apollo 13"`. The release year for this movie is `1995` and will be updated in the **Column Transformations** section below.

### Missing/Null Values

In [None]:
# Count the null values in every column.
null_counts = mm.null_count()
null_counts

Null values are only found in the `gross` and `meta_score` columns. This is not a significant issue for the analysis; therefore, no action will be taken for null values.

### Duplicates
#### `series_title`

In [None]:
# Show every row whose title appears more than once in the dataframe.
title_is_repeated = pl.col("series_title").is_duplicated()
mm.filter(title_is_repeated)

There are two movies that share the same title but are clearly different movies. No action is needed.

#### Combination of `series_title` and `released_year`

In [None]:
# Bundle title and release year into a single struct value so that duplication is
# checked on the pair rather than on each column separately.
title_year_key = pl.struct("series_title", "released_year")
mm.filter(title_year_key.is_duplicated())

There are no duplicates when `series_title` and `released_year` are used in combination as a unique identifier for each row. Although it is not the goal of this project, if a database were constructed, `series_title` together with `released_year` could be used as a composite primary key.

## Transformations
### Column Reordering

In [None]:
# Arrange the columns in a more logical order: identifying details first, then the
# cast, then the numeric measures.
col_order = (
    ["series_title", "released_year", "genre", "director"]
    + [f"star{star_num}" for star_num in range(1, 5)]
    + ["runtime", "gross", "meta_score", "imdb_rating", "no_of_votes"]
)

mm = mm.select(col_order)
mm.head()

### Column Transformations
1. `released_year`
    - There is one value that prevents the column from being cast to an integer type. The movie is `"Apollo 13"`; its `released_year` value is `"PG"` and should be changed to `"1995"`. `pl.lit()` is needed because polars would otherwise look for a column named `"1995"`. Once the value is corrected, the column can be cast to an integer type.
1. `genre`
    - The values need to be split on ", ". The split operation will cast the column as a list type.
1. `runtime`
    - The characters " min" need to be removed. Then cast to an integer type.
1. `gross`
    - All of the "," characters need to be removed. Then cast to an integer type.

In [None]:
# Clean up the string columns that hold numeric or list-like data and give them
# appropriate dtypes. All four expressions are evaluated against the same input frame.
mm = mm.with_columns(
    # `released_year`: replace the single bad value ("PG") with the literal string
    # "1995", then cast to an integer column. `pl.lit` keeps polars from interpreting
    # "1995" as a column name.
    pl.when(pl.col("released_year") == "PG")
    .then(pl.lit("1995"))
    .otherwise(pl.col("released_year"))
    .cast(pl.Int16)
    .alias("released_year"),
    # `genre`: split the comma-separated string into a list column.
    pl.col("genre").str.split(", "),
    # `runtime`: remove the literal " min" suffix, then cast. `strip_suffix` removes
    # exactly that suffix, whereas `strip_chars_end` treats its argument as a set of
    # individual characters to strip.
    pl.col("runtime").str.strip_suffix(" min").cast(pl.Int16),
    # `gross`: remove the thousands separators, then cast. Int64 is used so that
    # sums over many rows have headroom beyond the Int32 maximum (~2.1 billion).
    pl.col("gross").str.replace_all(pattern=",", value="", literal=True).cast(pl.Int64),
)
mm.head()

### Distributions of Numeric Columns

In [None]:
# Select only the numeric columns.
numeric_data = mm.select(cs.numeric())

# Size the subplot grid from the number of numeric columns so that no column is
# silently left out if the set of numeric columns changes. Three plots per row; the
# row count is the ceiling of (column count / 3), with at least one row.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numeric_data.columns) // n_grid_cols))

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=numeric_data.columns,
    horizontal_spacing=0.1,
    vertical_spacing=0.09,
)

# Create a "flat" list of 1-based (row, col) subplot references, in row-major order, in
# order to assign each column's box plot in the for loop below.
subplot_refs = [
    (row, col)
    for row in range(1, n_grid_rows + 1)
    for col in range(1, n_grid_cols + 1)
]

# Iterate through the columns and create a box plot for each.
for (row, col), col_name in zip(subplot_refs, numeric_data.columns):
    # Build the box plot with plotly express, then move its single trace into the grid.
    box_plot = px.box(numeric_data, y=col_name)
    fig.add_trace(box_plot.data[0], row=row, col=col)

# Update layout
fig.update_layout(
    height=400 * n_grid_rows,  # 400 px per row of subplots (800 for the 2-row grid)
    width=800,  # Adjust the width as needed
    title={
        'text': "Numeric Column Distributions",
        'y': 0.965,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    showlegend=False,
    margin=dict(t=85, l=30, r=30, b=30)
)

# Show the plot
fig.show()

### Grouping by `imdb_rating`

In [None]:
# Bin each rating with `cut`. With `left_closed=True` the intervals are
# [-inf, 8), [8, 9), and [9, inf), which matches the labels below.
rating_group = (
    pl.col("imdb_rating")
    .cut(
        breaks=[8.0, 9.0],
        labels=["rating < 8", "8 <= rating < 9", "rating >= 9"],
        left_closed=True,
    )
    .alias("imdb_rating_group")
)

# Average rating per bin. `maintain_order=True` keeps the groups in order of first
# appearance so the output is the same on every run; without it the row order of a
# group_by result is not guaranteed.
mm.group_by(rating_group, maintain_order=True).agg(
    pl.col("imdb_rating").mean().round(2).alias("avg_rating_per_group")
)

In [None]:
# Summary statistics for every column of the transformed dataframe.
summary_stats = mm.describe()
summary_stats

In [None]:
# Preview the first five rows (the default for `head`) of the transformed dataframe.
mm.head(n=5)

## Analysis

In [None]:
# Flatten the per-movie genre lists into one value per row, then keep the distinct genres.
distinct_genres = pl.col("genre").list.explode().unique()
mm.select(distinct_genres)

There are 21 unique genres. In pandas, the genres would typically need to be multi-hot encoded across 21 new columns. Instead, the functionality of the polars `List` column type will be used for analysis operations involving the genres.