In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy.stats import chi2_contingency
from plotter import (plot_bar_chart_univariate,
                     plot_bar_chart_univariate_y_col,
                     plot_bar_chart_multivariate_y_col,
                     plot_scatter_plot)

from categorical_association import (chi_square_test,
                                     cal_cramer_v_adjusted,
                                     cal_cramer_v)

# **Netflix Content Analysis**

## **Exploratory Data Analysis (EDA)**

In [2]:
# Read the Netflix 2023 content CSV into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='dat/netflix_content_2023.csv')



In [3]:
# Feature engineering / cleaning for the raw Netflix frame.
# The cell is written so that it can be re-run safely on the already-processed `df`.

# Parse the release date once and reuse it for every derived column
release_date = pd.to_datetime(df['Release Date'])

# add columns for the release year and month
df['year'] = release_date.dt.year
df['month'] = release_date.dt.month

# add a column for the number of days between the release and the end of 2023
reference_date = pd.to_datetime('2023-12-31')
df['days_since_release'] = (reference_date - release_date).dt.days

# Hours Viewed to numeric: strip the thousands separators, then cast.
# `astype(str)` keeps this working when the column is already numeric (cell re-run),
# and the explicit 'int64' avoids a platform-dependent integer width.
df['Hours Viewed'] = (
    df['Hours Viewed']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int64')
)

# only keep rows where Available Globally? == 'Yes'
# drop=True discards the old row labels instead of inserting them as a stray
# 'index' column (which would also make a second run of this cell raise).
df = df[df['Available Globally?'] == 'Yes'].reset_index(drop=True)

In [4]:
# Distinct values found in the 'Language Indicator' column
df.loc[:, 'Language Indicator'].unique()

array(['English', 'Korean', 'Non-English', 'Japanese', 'Hindi', 'Russian'],
      dtype=object)

### **Univariate**

#### **Content Count and Hours Viewed by Language**

In [5]:
# Bar chart over the 'Language Indicator' column
# (trailing strings are presumably title, x-axis label, y-axis label — confirm in plotter)
plot_bar_chart_univariate(
    df,
    'Language Indicator',
    'Number of Content by Language',
    'Language',
    'Count',
)

In [6]:
# Sum of 'Hours Viewed' per language, with the group key restored as a regular column
total_hours_by_language = (
    df.groupby('Language Indicator')['Hours Viewed']
      .sum()
      .reset_index()
)
plot_bar_chart_univariate_y_col(
    total_hours_by_language,
    'Language Indicator',
    'Hours Viewed',
    'Number of Hour Viewed by Language',
    'Language',
    'Total Hours Viewed',
)

In [7]:
# Mean of 'Hours Viewed' per language, with the group key restored as a regular column
mean_hours_by_language = (
    df.groupby('Language Indicator')['Hours Viewed']
      .mean()
      .reset_index()
)
plot_bar_chart_univariate_y_col(
    mean_hours_by_language,
    'Language Indicator',
    'Hours Viewed',
    'Number of Average Hour Viewed by Language',
    'Language',
    'Mean of Hours Viewed',
)

In [8]:
# Per (language, content type): total hours viewed and number of non-null titles
df_lang = (
    df.groupby(['Language Indicator', 'Content Type'], as_index=False)
      .agg({'Hours Viewed': 'sum', 'Title': 'count'})
)
# Turn the total into an average by dividing by the title count of each group
df_lang['Hours Viewed'] = df_lang['Hours Viewed'].div(df_lang['Title'])

In [9]:
# Display the per-language / per-content-type table
# ('Hours Viewed' now holds total hours divided by the title count of the group)
df_lang

Unnamed: 0,Language Indicator,Content Type,Hours Viewed,Title
0,English,Movie,7375000.0,2664
1,English,Show,14061990.0,3299
2,Hindi,Movie,4212500.0,56
3,Hindi,Show,5017742.0,62
4,Japanese,Movie,4055208.0,96
5,Japanese,Show,7198684.0,228
6,Korean,Movie,30459760.0,164
7,Korean,Show,21942910.0,282
8,Non-English,Movie,4504177.0,407
9,Non-English,Show,9172680.0,388


In [10]:
# Grouped bar chart of df_lang: 'Hours Viewed' against 'Language Indicator', split by 'Content Type'
plot_bar_chart_multivariate_y_col(
    df_lang,
    'Language Indicator',
    'Hours Viewed',
    'Content Type',
    'Number of Average Hour Viewed by Language and Content Type',
    'Language',
    'Mean of Hours Viewed',
)

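Korean content has the highest average hours viewed on Netflix for both shows and movies. However, Russian has the second-highest average hours viewed among TV shows, although it accounts for only a handful of titles, so its average should be read with caution.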

### **Correlation**

#### **Correlation between Days Since Release and Hours Viewed**

In [11]:
# Scatter of 'days_since_release' against 'Hours Viewed', coloured by language
plot_scatter_plot(
    df,
    'days_since_release',
    'Hours Viewed',
    'Correlation Between days_since_release and Hour Viewed',
    color_column='Language Indicator',
)

In [12]:
# Create an ordinal version of hours viewed by cutting it into five quantile bins
quintile_labels = ["0", "0.25", "0.5", "0.75", "1"]
df['Hours Viewed_Ordinal'] = pd.qcut(df['Hours Viewed'], q=5, labels=quintile_labels)

In [13]:
# Cross-tabulate language (rows) against the hours-viewed quantile bucket (columns)
contingency_table = pd.crosstab(
    index=df["Language Indicator"],
    columns=df["Hours Viewed_Ordinal"],
)

In [15]:
# Perform the Chi-Square test on the language x hours-viewed-bucket contingency table.
# The helper prints a summary and also returns a tuple, which the notebook echoes as the cell output.
# NOTE(review): the printed expected frequencies contain one row whose values are all below 1;
# the chi-square approximation is usually considered unreliable for such sparse cells —
# consider merging or excluding that category before interpreting the p-value.
chi_square_test(contingency_table)

Chi-Square Statistic: 347.5110
P-value: 1.4507e-61
Degrees of Freedom: 20
Significance Level (alpha): 0.05
Decision: Reject H0 (The variables are not independent).

Expected Frequencies:
   ['1318.88', '1075.68', '1208.19', '1173.89', '1186.36']
   ['26.10', '21.29', '23.91', '23.23', '23.48']
   ['71.66', '58.45', '65.65', '63.78', '64.46']
   ['98.64', '80.45', '90.37', '87.80', '88.73']
   ['175.84', '143.41', '161.08', '156.51', '158.17']
   ['0.88', '0.72', '0.81', '0.79', '0.80']


(347.5110209594508,
 1.4507123877025034e-61,
 20,
 array([[1.31887529e+03, 1.07567843e+03, 1.20818954e+03, 1.17389255e+03,
         1.18636418e+03],
        [2.60988235e+01, 2.12862745e+01, 2.39084967e+01, 2.32298039e+01,
         2.34766013e+01],
        [7.16611765e+01, 5.84470588e+01, 6.56470588e+01, 6.37835294e+01,
         6.44611765e+01],
        [9.86447059e+01, 8.04549020e+01, 9.03660131e+01, 8.78007843e+01,
         8.87335948e+01],
        [1.75835294e+02, 1.43411765e+02, 1.61078431e+02, 1.56505882e+02,
         1.58168627e+02],
        [8.84705882e-01, 7.21568627e-01, 8.10457516e-01, 7.87450980e-01,
         7.95816993e-01]]))

In [52]:
# Association strength for the same contingency table, via the project helper
# (presumably a bias-corrected Cramér's V — TODO confirm in categorical_association)
cal_cramer_v_adjusted(contingency_table)

0.1289125227185101

### 