In [87]:
# import library
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import streamlit as st
from pandas.tseries.offsets import DateOffset

In [182]:
def load_data():

    # load data
    df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

    # dropping row Total to avoid issue with calculations
    df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:,:]

    df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

    df_com = pd.read_csv('resource/All_Comments_Final.csv')

    # converting all columns to uppercase
    for df in [df_agg,df_vid,df_agg_sub, df_com]:
        df.columns = df.columns.str.upper()

    # remove \xad
    df_agg.columns = df_agg.columns.str.replace('\xad','')
    
    # cleaning inf val
    df_agg = df_agg.replace([np.inf, -np.inf], np.nan)

    # drop Nan value
    df_agg.dropna(inplace=True)

    # convert date to datetype

    df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

    # rename column

    df_agg.rename(columns={'VIDEO PUBLISH TIME': 'PUBLISH DATE', 'COMMENTS ADDED' : 'COMMENTS'}, inplace=True)
        
    df_agg['AVERAGE VIEW DURATION'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: datetime.strptime(x, '%H:%M:%S'))

    # create new column for df_agg['AVERAGE VIEW SECONDS'
    df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: x.second + x.minute * 60 + x.hour * 60 * 60)

    # engagement ration, every engagement a view could do divided by the number of viewrs
    df_agg['ENGAGEMENT RATIO'] = (df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS']) / df_agg['VIEWS'] 

    # ratio of views to subscribers gained
    df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED'] # how views does it take to gain a subscriber

    # ratio of views to subscribers lost, 
    df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST'] # how views does it take to lose a subscriber

    # sort data by 'VIDEO PUBLISH TIME'
    df_agg.sort_values(by = 'PUBLISH DATE', ascending=False, inplace=True)

    # CONVERTING DATE to datetime
    df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')

    df_com['DATE'] = pd.to_datetime(df_com['DATE'])
    # renaming vidId
    df_com.rename(columns={'VIDID':'VIDEO'}, inplace=True)
    # removing '_'
    df_com.columns = df_com.columns.str.replace('_', ' ')

    # create dataframe
    return df_vid, df_agg, df_agg_sub, df_com

df_vid, df_agg, df_agg_sub, df_com = load_data()

In [154]:
for i in [df_vid, df_agg, df_agg_sub, df_com]:
    print(i.columns.tolist())
    print('*'*40)

['DATE', 'VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO LENGTH', 'THUMBNAIL LINK', 'VIEWS', 'VIDEO LIKES ADDED', 'VIDEO DISLIKES ADDED', 'VIDEO LIKES REMOVED', 'USER SUBSCRIPTIONS ADDED', 'USER SUBSCRIPTIONS REMOVED', 'AVERAGE VIEW PERCENTAGE', 'AVERAGE WATCH TIME', 'USER COMMENTS ADDED']
****************************************
['VIDEO', 'VIDEO TITLE', 'PUBLISH DATE', 'COMMENTS', 'SHARES', 'DISLIKES', 'LIKES', 'SUBSCRIBERS LOST', 'SUBSCRIBERS GAINED', 'RPM (USD)', 'CPM (USD)', 'AVERAGE PERCENTAGE VIEWED (%)', 'AVERAGE VIEW DURATION', 'VIEWS', 'WATCH TIME (HOURS)', 'SUBSCRIBERS', 'YOUR ESTIMATED REVENUE (USD)', 'IMPRESSIONS', 'IMPRESSIONS CLICK-THROUGH RATE (%)', 'AVERAGE VIEW SECONDS', 'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO', 'VIEW TO SUBSCRIBER LOST RATIO']
****************************************
['VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO LENGTH', 'THUMBNAIL LINK', 'COUNTRY CODE', 'IS SUBSCRIBED', 'VIEWS', 'VIDEO LIKES ADDED', 'VIDEO DISLIKES ADDED', 'VIDEO LIKES REMOVED', 'USER

In [90]:
df_merge1 = pd.merge(left=df_agg_sub.loc[:,['VIDEO TITLE', 'EXTERNAL VIDEO ID']], right=df_agg, how='inner')
df_merge1.head(1)

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO,PUBLISH DATE,COMMENTS,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
0,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,OtqQYqRNDGI,2021-01-29,56,25,26,380,18,53,...,8009,79.7725,35,10.292,178890,1.64,35,0.060807,151.113208,444.944444


In [91]:
df_merge1.columns

Index(['VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO', 'PUBLISH DATE', 'COMMENTS',
       'SHARES', 'DISLIKES', 'LIKES', 'SUBSCRIBERS LOST', 'SUBSCRIBERS GAINED',
       'RPM (USD)', 'CPM (USD)', 'AVERAGE PERCENTAGE VIEWED (%)',
       'AVERAGE VIEW DURATION', 'VIEWS', 'WATCH TIME (HOURS)', 'SUBSCRIBERS',
       'YOUR ESTIMATED REVENUE (USD)', 'IMPRESSIONS',
       'IMPRESSIONS CLICK-THROUGH RATE (%)', 'AVERAGE VIEW SECONDS',
       'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO',
       'VIEW TO SUBSCRIBER LOST RATIO'],
      dtype='object')

In [92]:
df_merge2 = pd.merge(left=df_com, right=df_merge1.loc[:,['VIDEO TITLE','VIDEO', 'PUBLISH DATE', 'SHARES', 'DISLIKES', 'LIKES','SUBSCRIBERS']], how='inner')
df_merge2.head(1)

Unnamed: 0,COMMENTS,COMMENT ID,REPLY COUNT,LIKE COUNT,DATE,VIDEO,USER ID,VIDEO TITLE,PUBLISH DATE,SHARES,DISLIKES,LIKES,SUBSCRIBERS
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...,2021-12-27,767,53,4413,2507


In [93]:
df_diff_vi = pd.merge(left=df_agg_sub.loc[:,['VIDEO TITLE', 'EXTERNAL VIDEO ID']], right=df_agg, how='inner')
df_diff_vid = pd.merge(left=df_com, right=df_diff_vi.loc[:,['VIDEO TITLE','VIDEO']], how='inner')
df_diff_vid.dropna(inplace=True)

In [94]:
df_diff_vid.head()

Unnamed: 0,COMMENTS,COMMENT ID,REPLY COUNT,LIKE COUNT,DATE,VIDEO,USER ID,VIDEO TITLE
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...
1,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...
2,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...
3,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...
4,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...


In [95]:
df_diff_vid.shape

(3085140, 8)

In [96]:
filtered_agg_sub = df_diff_vid[df_diff_vid['VIDEO TITLE'] == selected_vid]
filtered_agg_sub.shape

(35435, 8)

In [103]:
filtered_agg_sub.head()

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
34713,How I Would Learn Data Science in 2022 (If I H...,xpIFS6jZbe8,734,https://i.ytimg.com/vi/xpIFS6jZbe8/hqdefault.jpg,BZ,False,7,1,0,0,0,0,0.064502,47.344286,0
34714,How I Would Learn Data Science in 2022 (If I H...,xpIFS6jZbe8,734,https://i.ytimg.com/vi/xpIFS6jZbe8/hqdefault.jpg,FJ,False,13,1,1,0,0,0,0.371296,272.531615,0
34715,How I Would Learn Data Science in 2022 (If I H...,xpIFS6jZbe8,734,https://i.ytimg.com/vi/xpIFS6jZbe8/hqdefault.jpg,KY,False,2,0,0,0,0,0,0.349795,256.7495,0
34716,How I Would Learn Data Science in 2022 (If I H...,xpIFS6jZbe8,734,https://i.ytimg.com/vi/xpIFS6jZbe8/hqdefault.jpg,SO,True,6,1,0,0,0,0,0.253822,186.305167,0
34717,How I Would Learn Data Science in 2022 (If I H...,xpIFS6jZbe8,734,https://i.ytimg.com/vi/xpIFS6jZbe8/hqdefault.jpg,LY,False,7,0,0,0,0,0,0.388529,285.180429,0


In [98]:
numeric_col = df_diff_vid.select_dtypes(include=['number'])


In [99]:
selected_vid = df_merge2['VIDEO TITLE'][9]

In [100]:
filtered_agg_sub = df_agg_sub[df_agg_sub['VIDEO TITLE'] == selected_vid]
filtered_agg_sub.shape

(373, 15)

In [123]:
filtered_agg_sub.describe()

Unnamed: 0,VIDEO LENGTH,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
count,373.0,373.0,373.0,373.0,373.0,373.0,373.0,363.0,363.0,373.0
mean,734.0,207.257373,11.844504,0.19571,0.268097,6.589812,0.120643,0.372155,273.162082,0.0
std,0.0,971.366189,49.857889,1.00364,1.34723,38.03508,0.941246,0.172536,126.641141,0.0
min,734.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001082,0.794,0.0
25%,734.0,5.0,0.0,0.0,0.0,0.0,0.0,0.317124,232.76897,0.0
50%,734.0,25.0,1.0,0.0,0.0,0.0,0.0,0.37558,275.675375,0.0
75%,734.0,126.0,7.0,0.0,0.0,2.0,0.0,0.428968,314.862801,0.0
max,734.0,12216.0,500.0,11.0,17.0,506.0,16.0,1.138646,835.766,0.0


In [134]:
ind_col = filtered_agg_sub.loc[:,[
        'VIDEO LENGTH', 
        'VIEWS', 
        'VIDEO LIKES ADDED', 
        'VIDEO DISLIKES ADDED', 
        'USER SUBSCRIPTIONS ADDED',
        'AVERAGE WATCH TIME', 
        'USER COMMENTS ADDED']
    ]


ind_col

Unnamed: 0,VIDEO LENGTH,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,USER SUBSCRIPTIONS ADDED,AVERAGE WATCH TIME,USER COMMENTS ADDED
34713,734,7,1,0,0,47.344286,0
34714,734,13,1,1,0,272.531615,0
34715,734,2,0,0,0,256.749500,0
34716,734,6,1,0,0,186.305167,0
34717,734,7,0,0,0,285.180429,0
...,...,...,...,...,...,...,...
35081,734,3,0,0,0,85.050000,0
35082,734,1,0,0,0,736.516000,0
35083,734,21,1,0,2,374.342524,0
35084,734,592,26,1,0,279.496821,0


In [135]:
ind_col.describe().median()

VIDEO LENGTH                734.000000
VIEWS                       166.628686
VIDEO LIKES ADDED             9.422252
VIDEO DISLIKES ADDED          0.097855
USER SUBSCRIPTIONS ADDED      4.294906
AVERAGE WATCH TIME          274.418729
USER COMMENTS ADDED           0.000000
dtype: float64

In [136]:
ind_col.describe().mean()

VIDEO LENGTH                 597.125000
VIEWS                       1740.452945
VIDEO LIKES ADDED            117.837799
VIDEO DISLIKES ADDED          48.149919
USER SUBSCRIPTIONS ADDED     115.703112
AVERAGE WATCH TIME           302.833796
USER COMMENTS ADDED           46.625000
dtype: float64

In [131]:
video length, mean
views, sum
like, sum 
dislike, sum
commet, sum
average watch time, mean
subscription, sum

array([734])

In [148]:
ind_col = filtered_agg_sub.loc[:,[
        'VIDEO LENGTH', 
        'VIEWS', 
        'VIDEO LIKES ADDED', 
        'VIDEO DISLIKES ADDED', 
        'USER SUBSCRIPTIONS ADDED',
        'AVERAGE WATCH TIME', 
        'USER COMMENTS ADDED']
    ]

col1, col2, col3,col4,col5,col6,col7 = st.columns(7)
columns = [col1, col2, col3,col4,col5,col6,col7]

count = 0
for i in ind_col.index:
    with columns[count]:
        if i not in ['VIDEO LENGTH', 'AVERAGE WATCH TIME']:   
            st.metric(label = i, value =round(ind_col[i].sum()))
        else:
            st.metric(label = i, value = ind_col[i].mean())
        count += 1
        if count >= 3:
            count = 0

KeyError: 34713

In [146]:
for i in ind_col.columns:
    if i not in ['VIDEO LENGTH', 'AVERAGE WATCH TIME']:   
        print(i,round(ind_col[i].sum()))
    else:
        print(i,ind_col[i].mean())

VIDEO LENGTH 734.0
VIEWS 77307
VIDEO LIKES ADDED 4418
VIDEO DISLIKES ADDED 73
USER SUBSCRIPTIONS ADDED 2458
AVERAGE WATCH TIME 273.162082274703
USER COMMENTS ADDED 0


In [187]:
# cleaning inf val
df_agg = df_agg.replace([np.inf, -np.inf], np.nan).dropna()

In [188]:
for i, df in enumerate([df_vid, df_agg, df_agg_sub, df_com]):
    df = df.select_dtypes(include=['number'])
    print(f"DataFrame {i+1}:", np.isinf(df).any().any())

DataFrame 1: False
DataFrame 2: False
DataFrame 3: False
DataFrame 4: False


Unnamed: 0,VIDEO,VIDEO TITLE,PUBLISH DATE,COMMENTS,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4.055,...,4383,192.5779,4,16.549,65130,2.95,158,0.080995,243.500000,313.071429
187,2RWwN5ZT4tA,Should @Luke Barousse Take This Data Analyst ...,2022-01-14,12,2,3,78,1,1,1.882,...,2401,25.9375,0,1.720,25094,2.64,38,0.039567,2401.000000,2401.000000
64,rEWPqw6rMGI,The Only Data Science Explanation You Need,2022-01-10,62,141,5,722,28,136,5.971,...,10277,801.5549,108,60.498,215491,2.22,280,0.090493,75.566176,367.035714
59,o-wsyxWbPOw,We Need to Talk About The LinkedIn Machine Lea...,2022-01-03,65,36,12,592,10,78,5.321,...,11808,545.6332,68,62.568,166915,3.32,166,0.059705,151.384615,1180.800000
32,xpIFS6jZbe8,How I Would Learn Data Science in 2022 (If I H...,2021-12-27,109,767,53,4413,46,2553,6.836,...,79283,5945.5420,2507,528.286,1420968,3.31,269,0.067379,31.054837,1723.543478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,FBgs-BSTIJE,Demystifying Data Science Roles,2018-11-30,3,5,1,48,1,8,5.218,...,978,56.5930,7,5.103,26202,2.24,208,0.058282,122.250000,978.000000
158,irjTWNV0eAY,How to Simulate NBA Games in Python,2018-11-19,22,105,2,308,2,152,3.363,...,13742,629.6323,150,46.214,149844,5.45,164,0.031800,90.407895,6871.000000
75,RRSRKf9eQxc,Should You Get A Masters in Data Science?,2018-11-14,56,41,10,276,2,81,7.398,...,18488,782.5937,79,136.708,173610,8.40,152,0.020716,228.246914,9244.000000
204,Y_SMU701qlA,Predicting Season Long NBA Wins Using Multiple...,2018-07-10,7,45,2,159,1,34,2.883,...,6863,276.7257,33,19.772,53865,4.03,145,0.031036,201.852941,6863.000000


In [195]:
df_agg.loc[['VIDEO' == 'scSc6YSanQ0'],:]

IndexError: Boolean index has wrong length: 1 instead of 199