# <span style="color:firebrick">SUMMER MOVIES</span>
<span style="color:crimson">**2024 TidyTuesday Week 30** </span>

# Setup

## Libraries

In [1]:
import polars as pl 
import plotly.express as px

## Data

_**`read_csv` options to try when type inference fails:**_
 - increase `infer_schema_length` (e.g. `infer_schema_length=10000`),
 - specify the correct dtype with the `dtypes` argument,
 - set `ignore_errors` to `True`,
 - add `NA` to the `null_values` list.

In [11]:
# Source CSV for this analysis.
url_data = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-07-30/summer_movies.csv'

# Dtypes are keyed by column name rather than listed positionally, so a
# reordered or extended CSV cannot silently shift a type onto the wrong column.
# NOTE(review): newer polars releases rename `dtypes` to `schema_overrides`;
# confirm against the installed version before switching.
dfr = pl.read_csv(
    url_data,
    infer_schema_length=10000,
    dtypes={
        'tconst': pl.String,
        'title_type': pl.String,
        'primary_title': pl.String,
        'original_title': pl.String,
        'year': pl.Int64,
        'runtime_minutes': pl.Int64,
        'genres': pl.String,
        'simple_title': pl.String,
        'average_rating': pl.Float64,
        'num_votes': pl.Int64,
    },
    # Read the literal string NA as a null instead of as text.
    null_values='NA',
)

# EDA

## Overview

In [21]:
# List the column names of the raw frame.
dfr.columns

['tconst',
 'title_type',
 'primary_title',
 'original_title',
 'year',
 'runtime_minutes',
 'genres',
 'simple_title',
 'average_rating',
 'num_votes']

In [12]:
# Check that each column was read with the dtype requested in read_csv.
dfr.dtypes

[String, String, String, String, Int64, Int64, String, String, Float64, Int64]

In [13]:
# (rows, columns) of the raw frame.
dfr.shape

(905, 10)

In [14]:
# Preview the first three rows.
dfr.head(3)

tconst,title_type,primary_title,original_title,year,runtime_minutes,genres,simple_title,average_rating,num_votes
str,str,str,str,i64,i64,str,str,f64,i64
"""tt0011462""","""movie""","""Midsummer Madn…","""Midsummer Madn…",1920,60,"""Drama""","""midsummer madn…",7.4,19
"""tt0026714""","""movie""","""A Midsummer Ni…","""A Midsummer Ni…",1935,133,"""Comedy,Fantasy…","""a midsummer ni…",6.8,3931
"""tt0033864""","""movie""","""The Teachers o…","""Magistrarna på…",1941,86,"""Comedy""","""the teachers o…",5.5,78


In [19]:
# Per-column summary statistics; the null_count row shows which columns have missing values.
dfr.describe()

describe,tconst,title_type,primary_title,original_title,year,runtime_minutes,genres,simple_title,average_rating,num_votes
str,str,str,str,str,f64,f64,str,str,f64,f64
"""count""","""905""","""905""","""905""","""905""",904.0,849.0,"""887""","""905""",905.0,905.0
"""null_count""","""0""","""0""","""0""","""0""",1.0,56.0,"""18""","""0""",0.0,0.0
"""mean""",,,,,2001.353982,91.043581,,,6.273702,2296.285083
"""std""",,,,,19.448166,23.854447,,,1.245414,20630.393488
"""min""","""tt0011462""","""movie""","""#MST3KMindless…","""#MST3KMindless…",1920.0,5.0,"""Action""","""1030 pm summer…",2.3,10.0
"""25%""",,,,,1990.0,82.0,,,5.6,34.0
"""50%""",,,,,2008.0,91.0,,,6.4,119.0
"""75%""",,,,,2016.0,101.0,,,7.1,517.0
"""max""","""tt9842874""","""video""","""a-ha: MTV Unpl…","""Été 85""",2024.0,237.0,"""Western""","""zyuden sentai …",9.8,564894.0


In [60]:
# Compact preview: one line per column with its dtype and first values.
dfr.glimpse()

Rows: 905
Columns: 10
$ tconst          <str> 'tt0011462', 'tt0026714', 'tt0033864', 'tt0037325', 'tt0038406', 'tt0038738', 'tt0039354', 'tt0040848', 'tt0041507', 'tt0042930'
$ title_type      <str> 'movie', 'movie', 'movie', 'movie', 'movie', 'tvMovie', 'movie', 'movie', 'movie', 'movie'
$ primary_title   <str> 'Midsummer Madness', "A Midsummer Night's Dream", 'The Teachers on Summer Vacation', 'Summer Storm', 'Centennial Summer', "A Midsummer Night's Dream", 'One Swallow Does Not Make a Summer', 'Summer Holiday', 'In the Good Old Summertime', 'Bountiful Summer'
$ original_title  <str> 'Midsummer Madness', "A Midsummer Night's Dream", 'Magistrarna på sommarlov', 'Summer Storm', 'Centennial Summer', "A Midsummer Night's Dream", 'En fluga gör ingen sommar', 'Summer Holiday', 'In the Good Old Summertime', 'Shchedroe leto'
$ year            <i64> 1920, 1935, 1941, 1944, 1946, 1946, 1947, 1948, 1949, 1951
$ runtime_minutes <i64> 60, 133, 86, 106, 102, 150, 88, 93, 102, 87
$ genres         

## Data Wrangling

### Drop rows with missing values

In [27]:
# Keep only the rows with no null in any column; the raw frame `dfr` is left
# untouched and the result is bound to a new name.
dfc = dfr.drop_nulls()
# (rows, columns) remaining after the drop.
dfc.shape

(837, 10)

### Select a subset of columns

In [28]:
# Columns available to choose from in the selection below.
dfc.columns

['tconst',
 'title_type',
 'primary_title',
 'original_title',
 'year',
 'runtime_minutes',
 'genres',
 'simple_title',
 'average_rating',
 'num_votes']

In [33]:
# Project the six analysis columns in a single select, shortening three of the
# names on the way (runtime_minutes -> runtime, average_rating -> rating,
# num_votes -> votes). Column order is the order of the arguments.
dfs = dfc.select(
    "title_type",
    "year",
    pl.col("runtime_minutes").alias("runtime"),
    "genres",
    pl.col("average_rating").alias("rating"),
    pl.col("num_votes").alias("votes"),
)
dfs.head(2)

title_type,year,runtime,genres,rating,votes
str,i64,i64,str,f64,i64
"""movie""",1920,60,"""Drama""",7.4,19
"""movie""",1935,133,"""Comedy,Fantasy…",6.8,3931


### Split the genres column (one genre per row)

In [45]:
# Turn the comma-separated genre string into a list; the expression keeps the
# column name "genres", so with_columns replaces the column in place.
genre_lists = pl.col("genres").str.split(",")

# One output row per (movie, genre) pair.
dfg = dfs.with_columns(genre_lists).explode("genres")

### Add a decade column

In [54]:
# First year of the decade each movie belongs to (e.g. 1935 -> 1930).
decade_start = (pl.col("year") // 10) * 10
dfd = dfg.with_columns(decade_start.alias("decade"))
dfd.head(3)

title_type,year,runtime,genres,rating,votes,decade
str,i64,i64,str,f64,i64,i64
"""movie""",1920,60,"""Drama""",7.4,19,1920
"""movie""",1935,133,"""Comedy""",6.8,3931,1930
"""movie""",1935,133,"""Fantasy""",6.8,3931,1930


## Data analysis and viz

### By genres

In [58]:
# Distinct genre values. The trailing semicolon suppresses the cell output.
# NOTE(review): with the output suppressed this cell has no visible effect;
# drop the semicolon to display the list, or delete the cell.
dfd.select(pl.col("genres").unique());

In [59]:
# Number of distinct genres (the result column keeps the name "genres").
dfd.select(pl.col("genres").n_unique())

genres
u32
24


In [65]:
# Number of (movie, genre) rows per genre, most frequent first.
# group_by does not guarantee group order and sort does not keep tied rows in
# a fixed order, so the genre name is added as a secondary key. Genres with
# equal counts then always appear in the same order when the notebook is re-run.
# The variable name is kept as-is because the plotting cell refers to it.
by_genders = (
    dfd
    .group_by("genres")
    .count()
    .sort(["count", "genres"], descending=[True, False])
)

In [131]:
# Bar chart of the per-genre row counts held in `by_genders`.
fig = px.bar(
    by_genders,
    x="genres",
    y="count",
    title="<b>Summer Movies</b><br>Distribution by genre",
    template="plotly_dark",
    width=800,
)

# Blank out the x-axis title.
fig.update_xaxes(title=dict(text=""))

# Global font first, then the title-specific overrides.
fig.update_layout(
    font=dict(family="Courier New", color="skyblue"),
    title=dict(font=dict(family="Roboto", size=30, color="firebrick")),
)
fig.show()


### Average rating by genres

In [82]:
# Mean rating per genre, rounded to two decimals, lowest first.
# Rounding makes equal averages likely, and neither group_by nor sort fixes
# the order of tied rows, so the genre name is used as a secondary key to keep
# the row order (and therefore the bar order in the chart) the same on re-runs.
avg_rating = (
    dfd
    .group_by("genres")
    .agg(pl.col("rating").mean().round(2).alias("average"))
    .sort(["average", "genres"], descending=[False, True])
)

_**Some font families that can be used in the charts:**_ "Courier New", "Droid Sans", "Droid Serif", "Droid Sans Mono", "Gravitas One", "Old Standard TT", "Open Sans", "Overpass", "PT Sans Narrow", "Raleway", "Times New Roman"

In [122]:
# Horizontal bar chart of the per-genre mean rating held in `avg_rating`.
fig = px.bar(
    avg_rating,
    x="average",
    y="genres",
    template="plotly_dark",
    width=600,
    height=600,
)

# Axis styling: x ticks only get a font family; the y axis loses its title
# and gets gold tick labels.
fig.update_xaxes(tickfont=dict(family="Arial black"))
fig.update_yaxes(
    title_text="",
    tickfont=dict(family="Balto", color="gold"),
)

# Two-line title and its font settings.
fig.update_layout(
    title_text="<b>Summer Movies</b><br><i>Average rating by genre</i>",
    title_font_size=20,
    title_font_family="Balto",
    title_font_color="seagreen",
)

fig.show()