# Setup.

## Requirements.

In [None]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


## Imports.

In [2]:
# Typical ML Libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Sample Dataset.

In [3]:
df      = pd.read_csv('data/podcast.csv')
df_num  = df.select_dtypes(include=[np.number])   # Numeric only.

df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


# 1. Description.

## 1.1. `df.info()` : Check columns.

> Note) Memory usage calculation.
> - Memory usage only calculates the size of the ***pointer*** for `object` dtype, by default.
> - `memory_usage = 'deep'` calculates actual size, though could be slower.

In [4]:
df.info(memory_usage  = 'deep',   # Calculate actual memory usage.
        verbose       = True,     # Force to show all columns.
)     

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 316.0 MB


## 1.2. `df.describe()` : Distributions.

In [5]:
df.describe(
    percentiles   = [.25, .5, .75, .9],
    include       = 'all',                 # `all`: numeric and categorical. default: numeric only.
)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
count,750000.0,750000,750000,662907.0,750000,750000.0,750000,750000,603970.0,749999.0,750000,750000.0
unique,,48,100,,10,,7,4,,,3,
top,,Tech Talks,Episode 71,,Sports,,Sunday,Night,,,Neutral,
freq,,22847,10515,,87606,,115946,196849,,,251291,
mean,374999.5,,,64.504738,,59.859901,,,52.236449,1.348855,,45.437406
std,216506.495284,,,32.969603,,22.873098,,,28.451241,1.15113,,27.138306
min,0.0,,,0.0,,1.3,,,0.0,0.0,,0.0
25%,187499.75,,,35.73,,39.41,,,28.38,0.0,,23.17835
50%,374999.5,,,63.84,,60.05,,,53.58,1.0,,43.37946
75%,562499.25,,,94.07,,79.53,,,76.6,2.0,,64.81158


## 1.3. Simple Checks.

In [6]:
df.shape                            # (rows, columns).

(750000, 12)

In [7]:
df.columns                          # Column names.

Index(['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes',
       'Genre', 'Host_Popularity_percentage', 'Publication_Day',
       'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads',
       'Episode_Sentiment', 'Listening_Time_minutes'],
      dtype='object')

In [8]:
df.dtypes                           # Dtypes.

id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes         float64
Genre                           object
Host_Popularity_percentage     float64
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage    float64
Number_of_Ads                  float64
Episode_Sentiment               object
Listening_Time_minutes         float64
dtype: object

In [9]:
df.nunique()                        # Num of uniques.

id                             750000
Podcast_Name                       48
Episode_Title                     100
Episode_Length_minutes          12268
Genre                              10
Host_Popularity_percentage       8038
Publication_Day                     7
Publication_Time                    4
Guest_Popularity_percentage     10019
Number_of_Ads                      12
Episode_Sentiment                   3
Listening_Time_minutes          42807
dtype: int64

In [10]:
df['Podcast_Name'].value_counts()   # Frequency.

Podcast_Name
Tech Talks             22847
Sports Weekly          20053
Funny Folks            19635
Tech Trends            19549
Fitness First          19488
Business Insights      19480
Style Guide            19364
Game Day               19272
Melody Mix             18889
Criminal Minds         17735
Finance Focus          17628
Detective Diaries      17452
Crime Chronicles       17374
Athlete's Arena        17327
Fashion Forward        17280
Tune Time              17254
Business Briefs        17012
Lifestyle Lounge       16661
True Crime Stories     16373
Sports Central         16191
Digital Digest         16171
Humor Hub              16144
Mystery Matters        16002
Comedy Corner          15927
Joke Junction          15074
Wellness Wave          15009
Sport Spot             14778
Gadget Geek            14770
Home & Living          14686
Laugh Line             14673
Life Lessons           14464
World Watch            14043
Sound Waves            13928
Global News            13649
M

In [11]:
df.isna().sum()                     # Num of nulls.

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [12]:
df.duplicated().sum()               # Num of duplicates.

np.int64(0)

In [13]:
df.memory_usage()                   # Memory usage, by each column.

Index                              132
id                             6000000
Podcast_Name                   6000000
Episode_Title                  6000000
Episode_Length_minutes         6000000
Genre                          6000000
Host_Popularity_percentage     6000000
Publication_Day                6000000
Publication_Time               6000000
Guest_Popularity_percentage    6000000
Number_of_Ads                  6000000
Episode_Sentiment              6000000
Listening_Time_minutes         6000000
dtype: int64

## 1.4. Skewness and Kurtosis.

- $\text{Skewness} = \frac{\tfrac{1}{n}\sum (x_i - \bar{x})^3}{\left(\tfrac{1}{n}\sum (x_i - \bar{x})^2\right)^{3/2}}$
  - -1 : right
  - 0 : center
  - 1 : left
  - Try log1p, sqrt, or Box-Cox/Yeo-Johnson.
- $\text{Kurtosis} = \frac{\tfrac{1}{n}\sum (x_i - \bar{x})^4}{\left(\tfrac{1}{n}\sum (x_i - \bar{x})^2\right)^{2}}$
  - \> 3 : sharp
  - = 3 : shape of normal distribution
  - < 3 : flat
  - Consider winsorizing, robust scalers, or tree-based models.

- SE(skew) = √(6/n), SE(kurt) = √(24/n).
  - Flag if |skew| > 2·SE(skew) or |kurt| > 2·SE(kurt).

In [14]:
df_num.skew()

id                            -1.963100e-15
Episode_Length_minutes        -2.005613e-03
Host_Popularity_percentage     4.926275e-03
Guest_Popularity_percentage   -1.070354e-01
Number_of_Ads                  6.032992e+00
Listening_Time_minutes         3.508123e-01
dtype: float64

In [15]:
df_num.kurtosis()

id                              -1.200000
Episode_Length_minutes          -1.203033
Host_Popularity_percentage      -1.206702
Guest_Popularity_percentage     -1.150117
Number_of_Ads                  505.893908
Listening_Time_minutes          -0.661236
dtype: float64

# 2. Correlation Analysis.

- Pearson, $r = cos(X_c, Y_c) = \frac{X_c \cdot Y_c}{||X_c|| \cdot ||Y_c||}$
- Spearman, $\rho = 1 - \frac{6 \sum d_i^2}{n(n^2-1)},\ d_i = rank(x_i) - rank(y_i)$
- Kendall, $\tau = \frac{\text{n\_concordant} - \text{n\_discordant}}{\binom{n}{2}},\quad \text{where } n\_concordant = \#\{(i, j): (x_i - x_j)(y_i - y_j) > 0\}$

In [None]:
pearson = df.corr(method='pearson')   # 'pearson', 'spearman', 'kendall'.
plt.matshow(pearson, 
            cmap="gray_r", 
            vmin=-1, 
            vmax=1)

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
id,1.0,-0.000557,0.000274,0.000637,0.000771,-0.000876
Episode_Length_minutes,-0.000557,1.0,0.023596,-0.009704,-0.054953,0.916749
Host_Popularity_percentage,0.000274,0.023596,1.0,0.022431,-0.017896,0.05087
Guest_Popularity_percentage,0.000637,-0.009704,0.022431,1.0,0.007933,-0.016014
Number_of_Ads,0.000771,-0.054953,-0.017896,0.007933,1.0,-0.118337
Listening_Time_minutes,-0.000876,0.916749,0.05087,-0.016014,-0.118337,1.0


# 3. Feature Importance.

## 3.1. Tree-based Approaches.

In [None]:
from sklearn.ensemble import RandomForestRegressor

X = df_num.drop(columns=['id', 'Listening_Time_minutes'])
y = df['Listening_Time_minutes']

feature_names = X.columns

model = RandomForestRegressor(max_leaf_nodes    = 10, 
                              max_depth         = 3, 
                              n_estimators      = 5,
                              random_state      = 42,)
model.fit(X, y)

0,1,2
,n_estimators,5
,criterion,'squared_error'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,10
,min_impurity_decrease,0.0
,bootstrap,True


### 3.1.1. Impurity-based Importance.

- Sum of decrease of impurity per feature, measured by Gini, entropy, MSE, etc.
- Fast, though could be biased on a feature with wide range or many categories. 

In [18]:
gini_importance = pd.Series(model.feature_importances_, index=feature_names)
print("Gini importance:\n", gini_importance.sort_values(ascending=False).head())

Gini importance:
 Episode_Length_minutes         1.0
Host_Popularity_percentage     0.0
Guest_Popularity_percentage    0.0
Number_of_Ads                  0.0
dtype: float64


### 3.1.2. Permutation Importance.

- After training, randomize each feature and measure the performance decrease.
- Slower, but more accurate and robust on a distribution or scale.

In [19]:
from sklearn.inspection import permutation_importance

perm = permutation_importance(model, X, y, n_repeats=2, random_state=42)
perm_importance = pd.Series(perm.importances_mean, index=feature_names)
print("\nPermutation importance:\n", perm_importance.sort_values(ascending=False).head())


Permutation importance:
 Episode_Length_minutes         1.481069
Host_Popularity_percentage     0.000000
Guest_Popularity_percentage    0.000000
Number_of_Ads                  0.000000
dtype: float64


## 3.2. SHAP.

- SHapely Additive exPlanations.
- Calculate a contribution of each feature.

> Note) conda install -n yana_library -c conda-forge shap

In [20]:
import shap

explainer    = shap.TreeExplainer(model)
shap_values  = explainer(X)

# Global feature importance.
shap_importance = pd.Series(np.abs(shap_values.values).mean(axis=0), index=feature_names)
print("\nSHAP importance:\n", shap_importance.sort_values(ascending=False).head())

# Local explanation for a single prediction.
print("\nSHAP values for first sample:\n", pd.Series(shap_values.values[0], index=feature_names).sort_values(ascending=False).head())

  from .autonotebook import tqdm as notebook_tqdm



SHAP importance:
 Episode_Length_minutes         19.156087
Host_Popularity_percentage      0.000000
Guest_Popularity_percentage     0.000000
Number_of_Ads                   0.000000
dtype: float64

SHAP values for first sample:
 Host_Popularity_percentage     0.000000
Guest_Popularity_percentage    0.000000
Number_of_Ads                  0.000000
Episode_Length_minutes        -1.558802
dtype: float64


# 4. Visualization.