In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [39]:
def plot_bar_chart_univariate(data, column, title, x_axis_title, y_axis_title, height=600, width=800):
    """Draw a bar chart of how often each distinct value of ``column`` occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Column whose values are tallied; one bar is drawn per distinct value.
    title : str
        Chart title (rendered in bold and centred).
    x_axis_title, y_axis_title : str
        Axis labels.
    height, width : int
        Figure size passed to the layout.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with a single red bar trace, bars ordered by descending count.
    """
    # Tally occurrences and give the two resulting columns explicit names.
    tally = data[column].value_counts().reset_index()
    tally.columns = [column, 'count']
    ranked = tally.sort_values(by='count', ascending=False)

    bars = go.Bar(
        x=ranked[column],
        y=ranked['count'],
        marker_color='#db0000'
    )

    # Fully transparent colour, used for both the plot and paper backgrounds.
    transparent = 'rgba(0,0,0,0)'
    layout_options = dict(
        title={'text': f"<b>{title}</b>", 'x': 0.5, 'xanchor': 'center'},
        xaxis_title=x_axis_title,
        yaxis_title=y_axis_title,
        height=height,
        width=width,
        plot_bgcolor=transparent,
        paper_bgcolor=transparent,
        xaxis=dict(showgrid=False),  # no vertical grid lines
        yaxis=dict(showgrid=False),  # no horizontal grid lines
    )

    fig = go.Figure()
    fig.add_trace(bars)
    fig.update_layout(**layout_options)
    return fig

def plot_bar_chart_univariate_y_col(data, x_column, y_column, title, x_axis_title, y_axis_title, height=600, width=800):
    """Draw a bar chart of ``y_column`` against ``x_column``, tallest bar first.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both ``x_column`` and ``y_column``.
    x_column : str
        Column supplying the bar labels.
    y_column : str
        Column supplying the bar heights; rows are sorted by it, descending.
    title : str
        Chart title (rendered in bold and centred).
    x_axis_title, y_axis_title : str
        Axis labels.
    height, width : int
        Figure size passed to the layout.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with a single red bar trace.
    """
    sorted_data = data[[x_column, y_column]].sort_values(by=y_column, ascending=False)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        # Labels and heights must come from the same sorted rows. Deriving the
        # labels from value_counts() orders them by frequency and collapses
        # repeated labels, which can mislabel bars or drop labels entirely.
        x=sorted_data[x_column],
        y=sorted_data[y_column],
        marker_color='#db0000'
    ))
    fig.update_layout(
        title={
            'text': f"<b>{title}</b>",
            'x': 0.5,
            'xanchor': 'center'
        },
        xaxis_title=x_axis_title,
        yaxis_title=y_axis_title,
        height=height,
        width=width,
        plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
        paper_bgcolor='rgba(0,0,0,0)', # Transparent paper background
        xaxis=dict(showgrid=False),    # Hide x-axis grid lines
        yaxis=dict(showgrid=False)     # Hide y-axis grid lines
    )

    return fig

In [40]:
def plot_bar_chart_multivariate_y_col(data, x_column, y_column, group_column,  title, x_axis_title, y_axis_title, height=600, width=800):
    """Draw a grouped bar chart of ``y_column`` by ``x_column``, one trace per group.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``x_column``, ``y_column`` and ``group_column``.
    x_column : str
        Column supplying the bar labels.
    y_column : str
        Column supplying the bar heights; rows are sorted by it, descending.
    group_column : str
        Column whose distinct values each become one bar trace (legend entry).
    title : str
        Chart title (rendered in bold and centred).
    x_axis_title, y_axis_title : str
        Axis labels.
    height, width : int
        Figure size passed to the layout.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure in ``barmode='group'`` with one trace per distinct group value,
        in order of first appearance in ``data``.
    """
    sorted_data = data[[x_column, group_column, y_column]].sort_values(by=y_column, ascending=False)
    color_list = ['#831010', '#564d4d', '#db0000']

    fig = go.Figure()
    for i, group in enumerate(data[group_column].unique()):
        group_data = sorted_data[sorted_data[group_column] == group]
        fig.add_trace(go.Bar(
            x=group_data[x_column],
            y=group_data[y_column],
            name=group,
            # Wrap around the palette so a fourth (or later) group reuses a
            # colour instead of raising IndexError.
            marker_color=color_list[i % len(color_list)]
        ))

    fig.update_layout(
        title={
            'text': f"<b>{title}</b>",
            'x': 0.5,
            'xanchor': 'center'
        },
        xaxis_title=x_axis_title,
        yaxis_title=y_axis_title,
        height=height,
        width=width,
        plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
        paper_bgcolor='rgba(0,0,0,0)', # Transparent paper background
        xaxis=dict(showgrid=False),    # Hide x-axis grid lines
        yaxis=dict(showgrid=False),    # Hide y-axis grid lines
        barmode='group'                # Place the groups' bars side by side
    )

    return fig

# **Netflix Content Analysis**

## **Exploratory Data Analysis (EDA)**

In [9]:
# Read the Netflix content CSV (path is relative to the notebook's working directory).
df = pd.read_csv(
    'dat/netflix_content_2023.csv',
)



In [10]:
# Parse the release date once and reuse it for every derived column.
release_date = pd.to_datetime(df['Release Date'])

# Calendar year and month of release.
df['year'] = release_date.dt.year
df['month'] = release_date.dt.month

# Number of days between the release date and 2023-12-31.
df['days_since_release'] = (pd.to_datetime('2023-12-31') - release_date).dt.days

# Hours Viewed -> integer: strip the comma separators before converting.
# Casting to str first keeps this cell re-runnable: on a second run the column
# is already numeric and the .str accessor would otherwise raise.
df['Hours Viewed'] = (
    df['Hours Viewed']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Keep only rows with Available Globally? == 'Yes'. The .copy() makes the
# result an independent frame, so later column assignments on `df` do not
# trigger SettingWithCopyWarning.
df = df[df['Available Globally?'] == 'Yes'].copy()

In [11]:
# Preview the first rows of df to sanity-check the derived columns.
df.head()

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type,year,month,days_since_release
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,English,Show,2023.0,3.0,283.0
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,English,Show,2023.0,1.0,360.0
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,Korean,Show,2022.0,12.0,366.0
3,Wednesday: Season 1,Yes,2022-11-23,507700000,English,Show,2022.0,11.0,403.0
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,English,Movie,2023.0,5.0,241.0


In [6]:
# List the distinct language labels present in the data.
df['Language Indicator'].unique()

array(['English', 'Korean', 'Non-English', 'Japanese', 'Hindi', 'Russian'],
      dtype=object)

### **Univariate**

#### **Number of Views by Language**

In [12]:
# Count of titles per language, as a bar chart.
plot_bar_chart_univariate(
    data=df,
    column='Language Indicator',
    title='Number of Content by Language',
    x_axis_title='Language',
    y_axis_title='Count',
)

In [19]:
# Total hours viewed per language, as a bar chart.
plot_bar_chart_univariate_y_col(
    data=df.groupby('Language Indicator').agg({'Hours Viewed': 'sum'}).reset_index(),
    x_column='Language Indicator',
    y_column='Hours Viewed',
    title='Number of Hour Viewed by Language',
    x_axis_title='Language',
    y_axis_title='Total Hours Viewed',
)

In [20]:
# Mean hours viewed per title for each language, as a bar chart.
plot_bar_chart_univariate_y_col(
    data=df.groupby('Language Indicator').agg({'Hours Viewed': 'mean'}).reset_index(),
    x_column='Language Indicator',
    y_column='Hours Viewed',
    title='Number of Average Hour Viewed by Language',
    x_axis_title='Language',
    y_axis_title='Mean of Hours Viewed',
)

In [34]:
# Per (language, content type): total hours viewed and number of titles.
df_lang = (
    df.groupby(['Language Indicator', 'Content Type'])
      .agg({'Hours Viewed': 'sum', 'Title': 'count'})
      .reset_index()
)
# Replace the total with the average hours viewed per counted title.
df_lang['Hours Viewed'] = df_lang['Hours Viewed'].div(df_lang['Title'])

In [31]:
# Show the per-language, per-content-type summary table.
df_lang

Unnamed: 0,Language Indicator,Content Type,Hours Viewed,Title
0,English,Movie,7375000.0,2664
1,English,Show,14061990.0,3299
2,Hindi,Movie,4212500.0,56
3,Hindi,Show,5017742.0,62
4,Japanese,Movie,4055208.0,96
5,Japanese,Show,7198684.0,228
6,Korean,Movie,30459760.0,164
7,Korean,Show,21942910.0,282
8,Non-English,Movie,4504177.0,407
9,Non-English,Show,9172680.0,388


In [42]:
# Average hours viewed per language, with one bar per content type.
plot_bar_chart_multivariate_y_col(
    data=df_lang,
    x_column='Language Indicator',
    y_column='Hours Viewed',
    group_column='Content Type',
    title='Number of Average Hour Viewed by Language and Content Type',
    x_axis_title='Language',
    y_axis_title='Mean of Hours Viewed',
)

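Korean is the most popular language on Netflix for both Shows and Movies. However, Russian has the second-highest number of views among TV Shows. (Note: Russian does not appear in the `df_lang` table displayed above, so this claim should be re-checked after a fresh Restart & Run All.)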

### **Multivariate**

### 