# Introduction
_____________

### Interest & Inspirations
- Is it possible to predict a YouTuber's daily revenue from the channel's daily views, subscribers gained, average view duration, etc.?
- Which factor is most strongly related to an increase in daily revenue?
- How much do the `Total Views of videos` contribute to actual monetization?
- If a YouTuber has more videos posted on the channel, do they tend to earn more?


In [None]:
%%capture
import sys

if 'google.colab' in sys.modules:
    # Install packages in Colab
    !pip install category_encoders==2.*
    !pip install eli5
    !pip install pandas-profiling==2.*
    !pip install pdpbox
    !pip install shap

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from IPython.display import display
%matplotlib inline
plt.style.use('ggplot')

# Import Data
________

In [None]:
# One export file per year, all located in the same input folder.
csv_names = ['Table data 2018.csv', 'Table data 2019.csv', 'Table data 2020.csv']
path = '../input/youtube-revenue-data-20182021/'

# Read every yearly file, slice off its first and last row ([1:-1]), stack the
# years on top of each other and renumber the rows from 0.
# NOTE(review): the dropped rows are presumably a totals/summary row - confirm
# against the raw CSVs.
yearly_frames = [pd.read_csv(path + name)[1:-1] for name in csv_names]
df = pd.concat(yearly_frames).reset_index(drop=True)

df.columns = df.columns.str.replace(' ', '_')
col_map ={'Av­er­age_views_per_view­er':'Average_views_per_viewer',
          'Unique_view­ers':'Unique_viewers',
          'Av­er­age_per­cent­age_viewed_(%)':'Average_viewed',
          'Im­pres­sions':'Impressions',
          'Dis\xadlikes':'Dislikes',
          'Sub­scribers_lost':'Subscribers_lost',
          'Sub­scribers_gained':'Subscribers_gained',
          'Videos_pub­lished':'Videos_added',
          'Videos_ad­ded':'Videos_published', 
          'Sub­scribers':'Subscribers',
          'Im­pres­sions_click-through_rate_(%)':'Click_rate',
          'Com­ments_ad­ded':'Comments',
          'Watch_time_(hours)':'Watch_hours',
          'Av­er­age_view_dur­a­tion':'Average_view_sec',
          'Your_es­tim­ated_rev­en­ue_(USD)':'Revenue'}
df = df.rename(columns=col_map)
df = df.drop(['Likes_(vs._dis­likes)_(%)', 
              'Videos_added', 
              'Subscribers_lost',
              'Subscribers_gained'], axis=1)

print("2018-2020 YouTuber 'PoohinKorea' Daily Data")
print("="*45)
print('Data Size: ', df.shape)
print('Null Values: ', df.isna().sum().sum())
print('Data types: ', df.dtypes.reset_index().groupby(0).count().reset_index().values.tolist())
df.head(2)

# EDA
_______

In [None]:
df.select_dtypes(include=('object')).head(2)

In [None]:
df['Date'] = pd.to_datetime(df['Date'])
df['Average_view_sec'] = pd.to_timedelta(df['Average_view_sec']).dt.seconds

In [None]:
df.isna().sum().reset_index().style.highlight_min()

In [None]:
df.Videos_published.value_counts(dropna=False)

In [None]:
df.Videos_published = df.Videos_published.fillna(0.0, axis=0)
df.isna().sum().sum()

In [None]:
df.columns

In [None]:
df.head(2)

In [None]:
# Names of the 8 columns with the highest correlation to Revenue, in
# descending order. Revenue correlates 1.0 with itself, so it is expected to be
# the first entry and is used as the y variable; the other 7 are the x variables.
top_corr_name = df.corr()['Revenue'].sort_values(ascending=False).reset_index()['index'][:8].values
sns.pairplot(df[top_corr_name], x_vars=top_corr_name[1:], y_vars=top_corr_name[0])
# pairplot draws a grid of subplots; plt.title() would only label the last
# one. suptitle puts the heading on the whole figure, and y > 1 keeps it above
# the panels.
plt.suptitle('Top 7 Correlations with Target', fontsize=15, y=1.05)
plt.show()

In [None]:
plt.figure(figsize=(20, 6))
sns.lineplot(data=df, x='Date', y='Revenue')
plt.axhline(df.Revenue.mean(), color='c', linestyle='--', linewidth=2)
plt.annotate('Mean of Revenue: $ {}'.format(round(df.Revenue.mean(),4)), 
             (df.Date[350], 30), 
             fontsize=12, 
             color='c')
plt.title('YouTube Revenue by Date', fontsize=20)
plt.ylabel('Revenue (USD, $)')
plt.show()

In [None]:
df.corr()['Revenue'].sort_values(ascending=False).reset_index()[1:].style.bar(align='mid')

In [None]:
plt.figure(figsize=(10,6))
sns.regplot(data=df, x='Watch_hours', y='Revenue')
plt.title('Revenue by Watch_hours', fontsize=20)
plt.show()

In [None]:
plt.figure(figsize=(7,5))
sns.histplot(df['Revenue'], kde=True)
plt.title('YouTube Revenue per day', fontsize=15)
plt.xlabel('Revenue (USD, $)')
plt.show()

In [None]:
df = df[(df['Revenue'] <= 150)].reset_index(drop=True)
plt.figure(figsize=(7,5))
sns.histplot(df['Revenue'], kde=True)
plt.title('YouTube Revenue per day', fontsize=15)
plt.xlabel('Revenue (USD, $)')
plt.show()

In [None]:
plt.figure(figsize=(7,5))
# log1p (= log(1 + x)) is the same transform the models below apply to the
# target, and it stays finite for a day with zero revenue, where np.log would
# produce -inf.
sns.histplot(np.log1p(df['Revenue']), kde=True)
plt.title('YouTube Revenue per day', fontsize=15)
plt.xlabel('Revenue (log transformation)')
plt.show()

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), vmin=-1, vmax=1, linewidths=.2)
plt.show()

In [None]:
df.describe().loc[['mean', 'std', 'min', '50%', 'max']]

In [None]:
df.to_csv('./2018-2019 Data.csv')

# Feature Engineering
_________


In [None]:
df.head(2)

In [None]:
# Running total of the daily Subscribers values (cumulative sum), as a plain
# Python list with one entry per row of df. np.cumsum adds the values in row
# order and, like a manual loop, lets a NaN propagate to every later entry.
Subs_accumulated = np.cumsum(df.Subscribers.values).tolist()

In [None]:
df['Subs_accumulated'] = Subs_accumulated
df.head(2)

In [None]:
plt.figure(figsize=(10,6))
sns.regplot(data=df, x='Subs_accumulated', y='Revenue')
plt.title('Revenue by Total Subscribers', fontsize=20)
plt.show()

In [None]:
# Running total of Videos_published (cumulative sum), i.e. how many videos
# have been published up to and including each row, as a plain Python list.
Videos = np.cumsum(df.Videos_published.values).tolist()

In [None]:
df['Videos'] = Videos
df.head(2)

In [None]:
df.Videos_published.value_counts()

In [None]:
df['Videos_published'] = df.Videos_published.mask((df.Videos_published>0), 1).astype('int')

In [None]:
# Daily revenue and cumulative video count over time, with a thin vertical
# line on every publishing day and a dashed horizontal line at the mean revenue.
f, ax = plt.subplots(1, 1, figsize=(24, 6))

ax = sns.lineplot(data=df, x='Date', y='Revenue', label='Revenue', lw=1.5)
ax = sns.lineplot(data=df, x='Date', y='Videos', label='Videos', color='g', lw=2, linestyle='-')

# The line on the first date is always drawn and is the only one carrying the
# legend label, so the legend gets exactly one 'Video Published' entry.
plt.axvline(df.Date[0], color='y', label='Video Published', lw=0.5)

# All remaining rows: an unlabeled line wherever the publish flag is 1.
# (Relies on df having the default 0..n-1 index, as set by the earlier
# reset_index call.)
later_rows = df.iloc[1:]
for publish_date in later_rows.loc[later_rows.Videos_published == 1, 'Date']:
    plt.axvline(publish_date, color='y', lw=0.5)

mean_revenue = df.Revenue.mean()
plt.axhline(mean_revenue, color='c', linestyle='--', linewidth=2, label='Mean of Revenue')
plt.annotate(
    'Mean of Revenue: $ {}'.format(round(mean_revenue, 4)),
    (df.Date[5], 30),
    fontsize=12,
    color='c',
)

plt.title('YouTube Revenue by Date', fontsize=20)
plt.ylabel('Revenue (USD, $)')
plt.xticks(fontsize=15)
ax.legend(fontsize='xx-large')
plt.show()

In [None]:
df = df[(df['Date'] > '20200901')].reset_index(drop=True)
df

### Check Variance Inflation Factor for Collinearity

In [None]:
from sklearn.linear_model import LinearRegression

def calculate_vif(df, features):
    """Compute the variance inflation factor (VIF) and tolerance per feature.

    Every feature is regressed with ordinary least squares on all the other
    entries of `features`. With R^2 being the in-sample score of that fit:
    tolerance = 1 - R^2 and VIF = 1 / tolerance.

    Args:
        df: DataFrame that contains every column named in `features`.
        features: names of the columns to examine.

    Returns:
        DataFrame indexed by feature name with the columns 'VIF' and
        'Tolerance'.
    """
    tolerance = {}
    for target_col in features:
        # Regress this feature on every other chosen feature.
        predictor_cols = [col for col in features if col != target_col]
        predictors, response = df[predictor_cols], df[target_col]
        fitted = LinearRegression().fit(predictors, response)
        tolerance[target_col] = 1 - fitted.score(predictors, response)

    vif = {name: 1 / tol for name, tol in tolerance.items()}
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance})

In [None]:
cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': [('background-color', 'green')]
}

In [None]:
df.corr()['Revenue'].sort_values(ascending=False).reset_index().style.set_table_styles([cell_hover])

In [None]:
df.corr()['Revenue'].sort_values(ascending=False).reset_index()['index'][1:14].values

In [None]:
features_chosen=[
                #  'Watch_hours', 
                #  'Unique_viewers', 
                #  'Views', 
                #  'Likes', 
                #  'Dislikes',
                #  'Impressions', 
                 'Subs_accumulated',
                #  'Subscribers', 
                 'Comments', 
                 'Shares',
                #  'Videos',
                 'Average_view_sec', 
                #  'Average_views_per_viewer', 
                 'Average_viewed'
                 ]
calculate_vif(df.iloc[:,1:], features=features_chosen).style.apply(lambda x: ['background:green' if v < 0.2 else '' for v in x], axis=1)

In [None]:
# cols_remove = [
#                'Average_views_per_viewer', 
#                 'Watch_hours', 
#                 'Unique_viewers', 
#                 'Views', 
#                 'Likes', 
#                 'Dislikes',
#                 'Impressions',
#                 'Subs_accumulated', 
#                 'Date']

cols_remove = [
                    'Date',
                # 'Watch_hours', 
                #  'Unique_viewers', 
                #  'Views', 
                 'Dislikes',
                #  'Impressions', 
                 'Subscribers', 
                 'Comments', 
                 'Shares',
                 'Videos',
                 'Average_view_sec', 
                 'Average_views_per_viewer', 
                 'Average_viewed',
                 'Videos_published',
                 'Click_rate', 
            #    'Subs_accumulated',
                 ]


df_new = df.copy()
df_new = df_new.drop(cols_remove, axis=1)
print('New Data Shape: ', df_new.shape)
df_new.head(2)

In [None]:
df_new.corr()['Revenue']

In [None]:
# One scatter panel per remaining feature, each plotted against Revenue.
sns.pairplot(df_new, x_vars=df_new.columns.drop('Revenue'), y_vars=['Revenue'])
# pairplot draws a grid of subplots; plt.title() would only label the last
# one. suptitle puts the heading on the whole figure, and y > 1 keeps it above
# the panels.
plt.suptitle('Correlations with Target', fontsize=15, y=1.05)
plt.show()

# Split Data
_________

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# train, test = train_test_split(df_new, test_size=0.2, random_state=33)

# train.shape, test.shape

In [None]:
# Ordered (unshuffled) 80/20 split: the first 80% of the rows become the
# training set, the remaining rows the test set.
split_at = int(len(df_new) * 0.8)
train, test = df_new[:split_at], df_new[split_at:]
train.shape, test.shape

In [None]:
target = 'Revenue'

X_train = train.drop(target, axis=1)
y_train = train[target]
X_test = test.drop(target, axis=1)
y_test = test[target]

X_train.shape, y_train.shape

### F-statistic & P-Values

In [None]:
from sklearn.feature_selection import f_regression

pvalue_table = pd.DataFrame(f_regression(X_train, y_train), columns=X_train.columns).T.rename(columns={0:'F_statistic', 1:'p_values'})
pvalue_table.style.apply(lambda x: ["background: green" if v < 0.05 else "" for v in x], axis = 1)

# BaseLine Model
_____

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
def evaluate(title, y, y_):
    """Print MAE, RMSE and R2 of the predictions `y_` against the truth `y`.

    Args:
        title: label shown in the header line of the report.
        y: true target values.
        y_: predicted target values.
    """
    # Compute all three scores first, then print the report.
    scores = [
        ('MAE Score: $', mean_absolute_error(y, y_)),
        ('RMSE Score: $', mean_squared_error(y, y_) ** 0.5),
        ('R2 Score: ', r2_score(y, y_)),
    ]
    print('*{} Result*'.format(title))
    print('=' * 50)
    for label, value in scores:
        print(label, value)
    print()

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Baseline: standardize the features, then fit ordinary least squares directly
# on Revenue (no target transformation).
pipe_lin = make_pipeline(
    StandardScaler(),
    LinearRegression(n_jobs=-1)
)

pipe_lin.fit(X_train, y_train)
# The pipeline was fit on y_train as-is, so its predictions are already on the
# Revenue scale. Wrapping them in np.exp() would score exp(prediction) against
# y_test and make the baseline metrics meaningless.
y_pred = pipe_lin.predict(X_test)
evaluate('Linear Regression', y_test, y_pred)

### Log Transformed Linear Regression

In [None]:
from sklearn.compose import TransformedTargetRegressor

In [None]:
tt_lin = TransformedTargetRegressor(regressor=pipe_lin,
                                func=np.log1p, inverse_func=np.expm1)

tt_lin.fit(X_train, y_train)
y_pred = tt_lin.predict(X_test)
evaluate('Log Transformed Linear Regression', y_test, y_pred)

# Model Selection
_____

### Ridge

In [None]:
pipe_ridge = make_pipeline(
    StandardScaler(),
    RidgeCV(alphas=np.arange(100, 105, 0.01), cv=5)
)

tt_ridge = TransformedTargetRegressor(regressor=pipe_ridge,
                                func=np.log1p, inverse_func=np.expm1)

tt_ridge.fit(X_train, y_train)
y_pred = tt_ridge.predict(X_test)
print('Best alpha: ', tt_ridge.regressor_.named_steps['ridgecv'].alpha_)
evaluate('Log Transformed Ridge', y_test, y_pred)

### Lasso

In [None]:
pipe_lasso = make_pipeline(
    StandardScaler(),
    # Candidate alphas 0.001, 0.002, ..., 0.199. The grid deliberately starts
    # above 0: alpha = 0 is plain least squares, which scikit-learn advises
    # against solving with the Lasso coordinate-descent solver.
    LassoCV(alphas=np.arange(1, 200) * 0.001, cv=5, random_state=33)
)

# Fit on log1p(Revenue); predictions are mapped back to dollars with expm1.
tt_lasso = TransformedTargetRegressor(regressor=pipe_lasso,
                                func=np.log1p, inverse_func=np.expm1)

tt_lasso.fit(X_train, y_train)
y_pred = tt_lasso.predict(X_test)
print('Best alpha: ', tt_lasso.regressor_.named_steps['lassocv'].alpha_)
evaluate('Log Transformed Lasso', y_test, y_pred)

### ElasticNet

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
pipe_elnet = make_pipeline(
    StandardScaler(),
    ElasticNetCV(n_jobs=-1, cv=5, random_state=33)
)

tt_elnet = TransformedTargetRegressor(regressor=pipe_elnet,
                                func=np.log1p, inverse_func=np.expm1)

tt_elnet.fit(X_train, y_train)
y_pred = tt_elnet.predict(X_test)
evaluate('Log Transformed ElasticNet', y_test, y_pred)

### RandomForest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
pipe_rfreg = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=33)
)

tt_rfreg = TransformedTargetRegressor(regressor=pipe_rfreg,
                                func=np.log1p, inverse_func=np.expm1)

tt_rfreg.fit(X_train, y_train)
y_pred = tt_rfreg.predict(X_test)
evaluate('Log Transformed RandomForest Regressor', y_test, y_pred)

### XGB Regressor

In [None]:
from xgboost import XGBRegressor

In [None]:
pipe_xgb = make_pipeline(
    StandardScaler(),
    XGBRegressor(learning_rate=0.1,
                  n_estimators=500,
                  n_jobs=-1,
                  random_state=33)
)

tt_xgb = TransformedTargetRegressor(regressor=pipe_xgb,
                                func=np.log1p, inverse_func=np.expm1)

tt_xgb.fit(X_train, y_train)
y_pred = tt_xgb.predict(X_test)
evaluate('Log Transformed XGB Regressor', y_test, y_pred)

### Light Gradient Boost Machine Regressor

In [None]:
from lightgbm import LGBMRegressor

In [None]:
pipe_lgb = make_pipeline(
    StandardScaler(),
    LGBMRegressor(learning_rate=0.1,
                  n_estimators=500,
                  n_jobs=-1,
                  random_state=33)
)

tt_lgb = TransformedTargetRegressor(regressor=pipe_lgb,
                                func=np.log1p, inverse_func=np.expm1)

tt_lgb.fit(X_train, y_train)
y_pred = tt_lgb.predict(X_test)
evaluate('Log Transformed Light GBM Regressor', y_test, y_pred)

### Model Combined

In [None]:
# Fixed-weight blend of the three tree models' predictions:
# 10% RandomForest + 80% XGBoost + 10% LightGBM.
y_pred = (tt_rfreg.predict(X_test) * 0.1
          + tt_xgb.predict(X_test) * 0.8
          + tt_lgb.predict(X_test) * 0.1)
evaluate('[RF + XGB+ LGBM]', y_test, y_pred)

# Visualization
____

### LinearRegression, Ridge, Lasso, ElasticNet

In [None]:
def get_top_bottom_coef(model):
    """Return the model's coefficients labeled by feature, largest first.

    Args:
        model: fitted estimator exposing `coef_`; the values are labeled with
            the column names of the module-level `X_train`.

    Returns:
        pd.Series indexed by X_train's columns, sorted in descending order.
    """
    labeled = pd.Series(model.coef_, index=X_train.columns)
    return labeled.sort_values(ascending=False)

def visualize_coefficient(models):
    """Draw one horizontal coefficient bar chart per model, side by side.

    Args:
        models: iterable of fitted linear estimators exposing `coef_`. Any
            number of models is accepted; one panel is created per model.
    """
    models = list(models)
    # squeeze=False always yields a 2-D axes array, so a single model works
    # too; max(..., 1) avoids asking matplotlib for a zero-column grid.
    fig, axs = plt.subplots(figsize=(20, 6), nrows=1,
                            ncols=max(len(models), 1), squeeze=False)
    fig.tight_layout()

    for ax, model in zip(axs[0], models):
        coef = get_top_bottom_coef(model)
        sns.barplot(x=coef.values, y=coef.index, ax=ax)
        ax.set_title(model.__class__.__name__ + ' Coefficients', size=20)
        ax.tick_params(axis='y', direction='in', pad=0)
        # Set the label size through tick_params after plotting so it applies
        # to the ticks the bar plot actually created.
        ax.tick_params(axis='both', labelsize=13)

    # The spacing is the same for every panel, so it is applied once.
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=1,
                        top=1,
                        wspace=0.8,
                        hspace=0.4)

In [None]:
lin = tt_lin.regressor_.named_steps['linearregression']
ridge = tt_ridge.regressor_.named_steps['ridgecv']
lasso = tt_lasso.regressor_.named_steps['lassocv']
elnet = tt_elnet.regressor_.named_steps['elasticnetcv']
models = [lin, ridge, lasso, elnet]

visualize_coefficient(models)

### Tree-based Regressor

In [None]:
def get_top_features(model):
    """Return the model's feature importances labeled by feature, largest first.

    Args:
        model: fitted estimator exposing `feature_importances_`; the values
            are labeled with the column names of the module-level `X_train`.

    Returns:
        pd.Series indexed by X_train's columns, sorted in descending order.
    """
    importances = pd.Series(model.feature_importances_, index=X_train.columns)
    return importances.sort_values(ascending=False)

def visualize_ftr_importances(models):
    """Draw one feature-importance bar chart per model, side by side.

    Args:
        models: iterable of fitted estimators exposing
            `feature_importances_`. Any number of models is accepted; one
            panel is created per model.
    """
    models = list(models)
    # squeeze=False always yields a 2-D axes array, so a single model works
    # too; max(..., 1) avoids asking matplotlib for a zero-column grid.
    fig, axs = plt.subplots(figsize=(17, 6), nrows=1,
                            ncols=max(len(models), 1), squeeze=False)
    fig.tight_layout()

    for ax, model in zip(axs[0], models):
        ftr_top = get_top_features(model)
        sns.barplot(x=ftr_top.values, y=ftr_top.index, ax=ax)
        ax.set_title(model.__class__.__name__ + ' Feature Importances', size=20)
        # Set the label size through tick_params after plotting so it applies
        # to the ticks the bar plot actually created.
        ax.tick_params(axis='both', labelsize=12)

    # The spacing is the same for every panel, so it is applied once.
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=1,
                        top=1,
                        wspace=0.8,
                        hspace=0.4)

rf = tt_rfreg.regressor_.named_steps['randomforestregressor']
xgb = tt_xgb.regressor_.named_steps['xgbregressor']
lgbm = tt_lgb.regressor_.named_steps['lgbmregressor']

models = [rf, xgb, lgbm]
visualize_ftr_importances(models)

# Hyperparameters
_____

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [None]:
rf_reg = RandomForestRegressor(n_estimators=100,
                               n_jobs=-1,
                               random_state=33)

xgb_reg = XGBRegressor(n_estimators=100,
                       n_jobs=-1,
                       random_state=33,
                       seed=33)

lgb_reg = LGBMRegressor(n_estimators=100,
                        n_jobs=-1,
                        random_state=33)

In [None]:
def randcv(estimator, params):
    """Tune `estimator` with a randomized search and report test-set scores.

    The search (5 sampled candidates, 3-fold CV, negative RMSE scoring) is
    wrapped in a TransformedTargetRegressor, so it is fitted on
    log1p(y_train) and predictions are mapped back with expm1. Uses the
    module-level X_train, y_train, X_test and y_test.

    Args:
        estimator: unfitted regressor to tune.
        params: parameter distributions passed to RandomizedSearchCV.

    Returns:
        The best estimator found by the search, refitted on the training data.
    """
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=params,
        n_iter=5,
        scoring='neg_root_mean_squared_error',
        cv=3,
        random_state=33,
        n_jobs=-1,
    )
    log_target_model = TransformedTargetRegressor(
        regressor=search, func=np.log1p, inverse_func=np.expm1
    )

    log_target_model.fit(X_train, y_train)
    test_pred = log_target_model.predict(X_test)

    fitted_search = log_target_model.regressor_
    print('Best Params: ', fitted_search.best_params_)
    evaluate(estimator.__class__.__name__, y_test, test_pred)

    return fitted_search.best_estimator_

In [None]:
def gridcv(estimator, params):
    """Tune `estimator` with an exhaustive grid search and report test-set scores.

    The search (3-fold CV, negative RMSE scoring) is wrapped in a
    TransformedTargetRegressor, so it is fitted on log1p(y_train) and
    predictions are mapped back with expm1. Uses the module-level X_train,
    y_train, X_test and y_test.

    Args:
        estimator: unfitted regressor to tune.
        params: parameter grid passed to GridSearchCV.

    Returns:
        The best estimator found by the search, refitted on the training data.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=params,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    log_target_model = TransformedTargetRegressor(
        regressor=search, func=np.log1p, inverse_func=np.expm1
    )

    log_target_model.fit(X_train, y_train)
    test_pred = log_target_model.predict(X_test)

    fitted_search = log_target_model.regressor_
    print('Best Params: ', fitted_search.best_params_)
    evaluate(estimator.__class__.__name__, y_test, test_pred)

    return fitted_search.best_estimator_

### RandomForest Hyperparameters

In [None]:
params_rf = {'max_depth': [18], 
             'min_samples_split': [6], 
             'min_samples_leaf': [1]
             }
# randcv_rf = randcv(rf_reg, params_rf)
gridcv_rf = gridcv(rf_reg, params_rf)

In [None]:
pipe_rfreg = make_pipeline(
    # StandardScaler(),
    RandomForestRegressor(n_estimators=16,
                          min_samples_split=6,
                          min_samples_leaf=5,
                          max_depth=21, 
                          n_jobs=-1,
                          oob_score=True, 
                          random_state=33)
)

tt_rfreg = TransformedTargetRegressor(regressor=pipe_rfreg,
                                func=np.log1p, inverse_func=np.expm1)

tt_rfreg.fit(X_train, y_train)
y_pred = tt_rfreg.predict(X_test)
print('OOB Score: ', tt_rfreg.regressor_.named_steps['randomforestregressor'].oob_score_)
print('='*50)
evaluate('Log Transformed RandomForest Regressor for Training Set', y_train, tt_rfreg.predict(X_train))
evaluate('Log Transformed RandomForest Regressor for Test Set', y_test, y_pred)

### XGBoost Hyperparameters

In [None]:
params_xgb = {
              'max_depth':[3], 
              'learning_rate':[0.001, 0.01, 0.1, 0.15], 
            }

# randcv(xgb_reg, params_xgb)
gridcv(xgb_reg, params_xgb)

In [None]:
xgb_reg = XGBRegressor(n_estimators=1000,
                       learning_rate=0.1,
                       max_depth=3,
                       n_jobs=-1,
                       random_state=33,
                       seed=33)

# The wrapper trains xgb_reg on log1p(y_train) and maps predictions back to
# dollars with expm1.
tt_xgb = TransformedTargetRegressor(regressor=xgb_reg,
                                func=np.log1p, inverse_func=np.expm1)

# The extra fit keywords are forwarded untouched to XGBRegressor.fit, so the
# wrapper does NOT transform the eval_set targets. They are converted to the
# log1p scale here; otherwise early stopping would compare log-scale
# predictions with dollar-scale targets.
# NOTE(review): early stopping on the test set leaks it into model selection;
# a separate validation split would give a cleaner test score.
# NOTE(review): newer xgboost releases expect early_stopping_rounds /
# eval_metric elsewhere - confirm against the installed version.
tt_xgb.fit(X_train, y_train,
        verbose=False,
        early_stopping_rounds=100,
        eval_set=[(X_test, np.log1p(y_test))],
        eval_metric='rmse')
y_pred = tt_xgb.predict(X_test)
evaluate('Log Transformed XGB Regressor for Training Set', y_train, tt_xgb.predict(X_train))
evaluate('Log Transformed XGB Regressor for Test Set', y_test, y_pred)

In [None]:
y_true = y_test.reset_index().copy()
y_true['y_pred'] = y_pred
y_true

### LightGBM Hyperparameters

In [None]:
params_lgb = {
              'num_leaves':[29], 
              'max_depth':[-1], 
              'learning_rate':[0.1], 
              }

# randcv(lgb_reg, params_lgb)
gridcv(lgb_reg, params_lgb)

In [None]:
lgb_reg = LGBMRegressor(n_estimators=1000,
                        num_leaves=11,
                        max_depth=-1,
                        learning_rate=0.271,
                        reg_alpha=0.0003,
                        reg_lambda=1.2,
                        n_jobs=-1,
                        objective='regression',
                        random_state=33)

# The wrapper trains lgb_reg on log1p(y_train) and maps predictions back to
# dollars with expm1.
tt_lgb = TransformedTargetRegressor(regressor=lgb_reg,
                                func=np.log1p, inverse_func=np.expm1)

# The extra fit keywords are forwarded untouched to LGBMRegressor.fit, so the
# wrapper does NOT transform the eval_set targets. They are converted to the
# log1p scale here; otherwise early stopping would compare log-scale
# predictions with dollar-scale targets.
# NOTE(review): early stopping on the test set leaks it into model selection;
# a separate validation split would give a cleaner test score.
# NOTE(review): newer lightgbm releases expect early stopping / verbosity to
# be configured elsewhere - confirm against the installed version.
tt_lgb.fit(X_train, y_train,
        verbose=False,
        early_stopping_rounds=100,
        eval_set=[(X_test, np.log1p(y_test))],
        eval_metric='rmse')
y_pred = tt_lgb.predict(X_test)
evaluate('Log Transformed Light GBM Regressor for Training Set', y_train, tt_lgb.predict(X_train))
evaluate('Log Transformed Light GBM Regressor for Test Set', y_test, y_pred)

### Tuned Models Combined

In [None]:
y_pred = tt_rfreg.predict(X_test)*0.1 + tt_xgb.predict(X_test)*0.8 + tt_lgb.predict(X_test)*0.1
evaluate('[RF + XGB + LGBM]', y_test, y_pred)

# Result
________

### Permutation Importance & Feature Importance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
from IPython.display import display

# Permutation Importance
def show_permutation_importance(model):
    """Compute and display eli5 permutation importances for a fitted model.

    Importances are computed on the module-level X_train / y_train with
    negative RMSE as the score, 5 shuffles per feature and a fixed seed.

    Args:
        model: fitted estimator to inspect.
    """
    permuter = PermutationImportance(
        model,
        scoring='neg_root_mean_squared_error',
        n_iter=5,
        random_state=33,
    )
    permuter.fit(X_train, y_train)

    print('{} Permutation Importance'.format(model.__class__.__name__))
    # top=None lists every feature instead of only the highest-ranked ones.
    weights_view = eli5.show_weights(
        permuter,
        top=None,
        feature_names=X_train.columns.to_list(),
    )
    display(weights_view)

In [None]:
show_permutation_importance(tt_rfreg)
show_permutation_importance(tt_xgb)
show_permutation_importance(tt_lgb)

In [None]:
rf = tt_rfreg.regressor_.named_steps['randomforestregressor']
xgb = tt_xgb.regressor_
lgbm = tt_lgb.regressor_

models = [rf, xgb, lgbm]
visualize_ftr_importances(models)

### Partial Dependence Plot(PDP)

In [None]:
from pdpbox.pdp import pdp_isolate, pdp_plot
from pdpbox.pdp import pdp_interact, pdp_interact_plot
from pdpbox import pdp
import shap

In [None]:
def draw_pdp_plot(feature, model, X_val, num_grid_points=10):
    """Print the model name and draw a partial dependence plot for one feature.

    Args:
        feature: name of the column of `X_val` to vary.
        model: fitted wrapper exposing `.regressor` (e.g. a
            TransformedTargetRegressor); the regressor may be a plain
            estimator or a Pipeline.
        X_val: DataFrame the partial dependence is computed on.
        num_grid_points: number of percentile grid points for the feature.
    """
    isolated = pdp_isolate(
        model=model,
        dataset=X_val,
        model_features=X_val.columns,
        feature=feature,
        grid_type='percentile',
        num_grid_points=num_grid_points
    )
    estimator = model.regressor
    if estimator.__class__.__name__ == 'Pipeline':
        # Report the pipeline's final step (the actual estimator) instead of
        # looking it up under one hard-coded step name, so any pipeline works.
        estimator = estimator.steps[-1][1]
    print('<<<{} MODEL>>>'.format(estimator.__class__.__name__))
    print('='*100)
    pdp.pdp_plot(isolated, feature)

In [None]:
features = X_train.columns.tolist()[2:-1]
features

In [None]:
X = pd.concat((X_train, X_test))

In [None]:
draw_pdp_plot(features[0], tt_xgb, X)

In [None]:
draw_pdp_plot(features[0], tt_rfreg, X)

In [None]:
draw_pdp_plot(features[0], tt_lgb, X)

In [None]:
draw_pdp_plot(features[1], tt_xgb, X)

In [None]:
draw_pdp_plot(features[1], tt_rfreg, X)

In [None]:
draw_pdp_plot(features[1], tt_lgb, X)

In [None]:
draw_pdp_plot(features[2], tt_xgb, X)

In [None]:
draw_pdp_plot(features[2], tt_rfreg, X)

In [None]:
draw_pdp_plot(features[2], tt_lgb, X)

In [None]:
def draw_pdp_interaction(model, features, X_test, plot_type):
    """Print the model name and draw a two-feature partial dependence plot.

    Args:
        model: fitted wrapper exposing `.regressor` (e.g. a
            TransformedTargetRegressor); the regressor may be a plain
            estimator or a Pipeline.
        features: the column names of `X_test` whose interaction is plotted.
        X_test: DataFrame the partial dependence is computed on.
        plot_type: plot style forwarded to pdp_interact_plot (e.g. 'contour').
    """
    interaction = pdp_interact(
        model=model,
        dataset=X_test,
        model_features=X_test.columns,
        features=features
    )
    estimator = model.regressor
    if estimator.__class__.__name__ == 'Pipeline':
        # Report the pipeline's final step (the actual estimator) instead of
        # looking it up under one hard-coded step name, so any pipeline works.
        estimator = estimator.steps[-1][1]
    print('<<<{} MODEL>>>'.format(estimator.__class__.__name__))
    print('='*100)
    pdp_interact_plot(interaction, plot_type=plot_type, feature_names=features)
    plt.show()

features = ['Likes', 'Watch_hours']
draw_pdp_interaction(tt_xgb, features, X, 'contour')
draw_pdp_interaction(tt_rfreg, features, X, 'contour')
draw_pdp_interaction(tt_lgb, features, X, 'contour')

In [None]:
features = ['Views', 'Watch_hours']
draw_pdp_interaction(tt_xgb, features, X, 'contour')
draw_pdp_interaction(tt_rfreg, features, X, 'contour')
draw_pdp_interaction(tt_lgb, features, X, 'contour')

In [None]:
features = ['Likes', 'Watch_hours']
draw_pdp_interaction(tt_xgb, features, X, 'contour')
draw_pdp_interaction(tt_rfreg, features, X, 'contour')
draw_pdp_interaction(tt_lgb, features, X, 'contour')

### SHAP Values

In [None]:
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X)

In [None]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, X)

In [None]:
shap.summary_plot(shap_values, X, plot_size=(15,8))

In [None]:
shap.summary_plot(shap_values, X, plot_type='bar')

In [None]:
def draw_shap_value(sample):
    """Explain one set of feature values with SHAP and print the predicted revenue.

    Uses the module-level `xgb` model and the column names of `X_test`.

    Args:
        sample: feature values, one per column of X_test and in the same order.
    """
    explainer = shap.TreeExplainer(xgb)
    shap.initjs()
    # Single-row DataFrame: each column name is paired with its one value.
    samp_df = pd.DataFrame(dict(zip(X_test.columns, np.array(sample).reshape(-1,1))))
    # Compute the SHAP values once and reuse them for the plot and the table.
    sample_shap = explainer.shap_values(samp_df[:1])
    display(shap.force_plot(explainer.expected_value, sample_shap, samp_df[:]))
    # In this notebook `xgb` is the regressor inside tt_xgb, which is trained
    # on log1p(Revenue); expm1 is the exact inverse of that transform
    # (np.exp would overstate every amount by $1).
    y_pred = np.expm1(xgb.predict(samp_df))[0]
    base_value = np.expm1(explainer.expected_value)
    # Second row of the table: the SHAP contribution of each feature.
    samp_df.loc[1] = sample_shap[0].tolist()
    samp_df.index = ['YouTuber\'s Info', 'SHAP values']
    display(samp_df.style.apply(lambda x: ['background: red' if v>0 else 'background: blue' for v in x], axis=0))
    print('Average YouTube Daily Revenue: $ {}'.format(base_value))
    print('Predicted YouTube Daily Revenue: $ {}'.format(y_pred))

In [None]:
import random

sample = [round(random.uniform(10000,500000)),
          round(random.uniform(100000,10226681)),
          round(random.uniform(1000,40326)),
          round(random.uniform(10000,503655)),
          round(random.uniform(3000,40000), 4),
          round(random.uniform(2000,500000))]
draw_shap_value(sample)

In [None]:
def pred_revenue(unique_viewers, impressions, likes, views, watch_hours, subs_accumulated):
    """Show the SHAP explanation and predicted revenue for one channel-day.

    The six values are handed to draw_shap_value in the order of the
    parameters.
    NOTE(review): that order is presumably the column order of X_test -
    confirm before relying on the output.
    """
    draw_shap_value(
        [unique_viewers, impressions, likes, views, watch_hours, subs_accumulated]
    )

In [None]:
pred_revenue(57626.0, 2586855.0, 4954.0, 141204.0, 10312.5295, 253140.0)
print('Actual Revenue: $ {}'.format(y_test.iloc[0]))