In [2]:
# import library
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import streamlit as st
from pandas.tseries.offsets import DateOffset

In [3]:
# define functions

# load data
df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

# dropping row Total to avoid issue with calculations
df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:,:]

df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

df_com = pd.read_csv('resource/All_Comments_Final.csv')

In [4]:
# clean our data

In [5]:
# converting all columns to uppercase
for df in [df_agg,df_vid,df_agg_sub, df_com]:
    df.columns = df.columns.str.upper()


In [6]:
# checking
df_com.columns

Index(['COMMENTS', 'COMMENT_ID', 'REPLY_COUNT', 'LIKE_COUNT', 'DATE', 'VIDID',
       'USER_ID'],
      dtype='object')

In [158]:
df_com.rename(columns={'VIDID':'VIDEO'}, inplace=True)

In [7]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 1 to 223
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   VIDEO                                 223 non-null    object 
 1   VIDEO TITLE                           223 non-null    object 
 2   VIDEO PUB­LISH TIME                   223 non-null    object 
 3   COM­MENTS AD­DED                      223 non-null    int64  
 4   SHARES                                223 non-null    int64  
 5   DIS­LIKES                             223 non-null    int64  
 6   LIKES                                 223 non-null    int64  
 7   SUB­SCRIBERS LOST                     223 non-null    int64  
 8   SUB­SCRIBERS GAINED                   223 non-null    int64  
 9   RPM (USD)                             223 non-null    float64
 10  CPM (USD)                             221 non-null    float64
 11  AV­ER­AGE PER­CENT­

In [8]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUB­LISH TIME,COM­MENTS AD­DED,SHARES,DIS­LIKES,LIKES,SUB­SCRIBERS LOST,SUB­SCRIBERS GAINED,RPM (USD),CPM (USD),AV­ER­AGE PER­CENT­AGE VIEWED (%),AV­ER­AGE VIEW DUR­A­TION,VIEWS,WATCH TIME (HOURS),SUB­SCRIBERS,YOUR ES­TIM­ATED REV­EN­UE (USD),IM­PRES­SIONS,IM­PRES­SIONS CLICK-THROUGH RATE (%)
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,"May 8, 2020",907,9583,942,46903,451,46904,6.353,12.835,36.65,0:03:09,1253559,65850.7042,46453,7959.533,26498799,3.14


In [9]:
# checking columns
df_agg.columns.tolist() # notice \xad in our columns

['VIDEO',
 'VIDEO TITLE',
 'VIDEO PUB\xadLISH TIME',
 'COM\xadMENTS AD\xadDED',
 'SHARES',
 'DIS\xadLIKES',
 'LIKES',
 'SUB\xadSCRIBERS LOST',
 'SUB\xadSCRIBERS GAINED',
 'RPM (USD)',
 'CPM (USD)',
 'AV\xadER\xadAGE PER\xadCENT\xadAGE VIEWED (%)',
 'AV\xadER\xadAGE VIEW DUR\xadA\xadTION',
 'VIEWS',
 'WATCH TIME (HOURS)',
 'SUB\xadSCRIBERS',
 'YOUR ES\xadTIM\xadATED REV\xadEN\xadUE (USD)',
 'IM\xadPRES\xadSIONS',
 'IM\xadPRES\xadSIONS CLICK-THROUGH RATE (%)']

In [10]:
# remove \xad
df_agg.columns = df_agg.columns.str.replace('\xad','')

In [11]:
# check 
df_agg.columns.tolist()

['VIDEO',
 'VIDEO TITLE',
 'VIDEO PUBLISH TIME',
 'COMMENTS ADDED',
 'SHARES',
 'DISLIKES',
 'LIKES',
 'SUBSCRIBERS LOST',
 'SUBSCRIBERS GAINED',
 'RPM (USD)',
 'CPM (USD)',
 'AVERAGE PERCENTAGE VIEWED (%)',
 'AVERAGE VIEW DURATION',
 'VIEWS',
 'WATCH TIME (HOURS)',
 'SUBSCRIBERS',
 'YOUR ESTIMATED REVENUE (USD)',
 'IMPRESSIONS',
 'IMPRESSIONS CLICK-THROUGH RATE (%)']

In [12]:
# convert date to datetype

df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

In [13]:
df_agg['VIDEO PUBLISH TIME']

1     2020-05-08
2     2020-11-12
3     2020-07-16
4     2020-08-29
5     2020-08-05
         ...    
219   2018-11-30
220   2019-05-25
221   2018-12-18
222   2019-05-05
223   2017-06-06
Name: VIDEO PUBLISH TIME, Length: 223, dtype: datetime64[ns]

In [14]:
df_agg['VIDEO PUBLISH TIME'][9]

Timestamp('2020-07-01 00:00:00')

In [15]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),CPM (USD),AVERAGE PERCENTAGE VIEWED (%),AVERAGE VIEW DURATION,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%)
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,2020-05-08,907,9583,942,46903,451,46904,6.353,12.835,36.65,0:03:09,1253559,65850.7042,46453,7959.533,26498799,3.14


In [16]:
df_agg.isna().sum()

VIDEO                                 0
VIDEO TITLE                           0
VIDEO PUBLISH TIME                    0
COMMENTS ADDED                        0
SHARES                                0
DISLIKES                              0
LIKES                                 0
SUBSCRIBERS LOST                      0
SUBSCRIBERS GAINED                    0
RPM (USD)                             0
CPM (USD)                             2
AVERAGE PERCENTAGE VIEWED (%)         0
AVERAGE VIEW DURATION                 0
VIEWS                                 0
WATCH TIME (HOURS)                    0
SUBSCRIBERS                           0
YOUR ESTIMATED REVENUE (USD)          0
IMPRESSIONS                           0
IMPRESSIONS CLICK-THROUGH RATE (%)    0
dtype: int64

In [17]:
# dropping Nan value
df_agg.dropna(inplace=True)

In [18]:
df_agg.isna().sum()

VIDEO                                 0
VIDEO TITLE                           0
VIDEO PUBLISH TIME                    0
COMMENTS ADDED                        0
SHARES                                0
DISLIKES                              0
LIKES                                 0
SUBSCRIBERS LOST                      0
SUBSCRIBERS GAINED                    0
RPM (USD)                             0
CPM (USD)                             0
AVERAGE PERCENTAGE VIEWED (%)         0
AVERAGE VIEW DURATION                 0
VIEWS                                 0
WATCH TIME (HOURS)                    0
SUBSCRIBERS                           0
YOUR ESTIMATED REVENUE (USD)          0
IMPRESSIONS                           0
IMPRESSIONS CLICK-THROUGH RATE (%)    0
dtype: int64

In [19]:
# AVERAGE VIEW DURATION is an object meaning it contains numbers and string
df_agg['AVERAGE VIEW DURATION'][9] # a string

'0:04:56'

In [20]:
df_agg['AVERAGE VIEW DURATION'] = pd.to_datetime(df_agg['AVERAGE VIEW DURATION']).dt.time


# check
df_agg['AVERAGE VIEW DURATION'][9]

  df_agg['AVERAGE VIEW DURATION'] = pd.to_datetime(df_agg['AVERAGE VIEW DURATION']).dt.time


datetime.time(0, 4, 56)

In [21]:
# create new column for df_agg['AVERAGE VIEW SECONDS'
df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: x.second + x.minute * 60 + x.hour * 60 * 60)

# check
df_agg['AVERAGE VIEW SECONDS'][9]

296

In [22]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),CPM (USD),AVERAGE PERCENTAGE VIEWED (%),AVERAGE VIEW DURATION,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,2020-05-08,907,9583,942,46903,451,46904,6.353,12.835,36.65,00:03:09,1253559,65850.7042,46453,7959.533,26498799,3.14,189


In [23]:
df_agg.info() # our data type is corrected

<class 'pandas.core.frame.DataFrame'>
Index: 221 entries, 1 to 222
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   VIDEO                               221 non-null    object        
 1   VIDEO TITLE                         221 non-null    object        
 2   VIDEO PUBLISH TIME                  221 non-null    datetime64[ns]
 3   COMMENTS ADDED                      221 non-null    int64         
 4   SHARES                              221 non-null    int64         
 5   DISLIKES                            221 non-null    int64         
 6   LIKES                               221 non-null    int64         
 7   SUBSCRIBERS LOST                    221 non-null    int64         
 8   SUBSCRIBERS GAINED                  221 non-null    int64         
 9   RPM (USD)                           221 non-null    float64       
 10  CPM (USD)                      

In [24]:
# engagement ration, every engagement a view could do divided by the number of viewrs
df_agg['ENGAGEMENT RATIO'] = (df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']) / df_agg['VIEWS'] 

# ratio of views to subscribers gained
df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED'] # how views does it take to gain a subscriber

# ratio of views to subscribers lost, 
df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST'] # how views does it take to lose a subscriber

In [25]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,2020-05-08,907,9583,942,46903,451,46904,6.353,...,1253559,65850.7042,46453,7959.533,26498799,3.14,189,0.046536,26.726057,2779.509978


In [26]:
# sort data by 'VIDEO PUBLISH TIME'
df_agg.sort_values(by = 'VIDEO PUBLISH TIME', ascending=False, inplace=True)

df_agg = df_agg.reset_index(drop=True)

In [27]:
df_agg.head(2)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
0,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4.055,...,4383,192.5779,4,16.549,65130,2.95,158,0.080995,243.5,313.071429
1,2RWwN5ZT4tA,Should @Luke Barousse Take This Data Analyst ...,2022-01-14,12,2,3,78,1,1,1.882,...,2401,25.9375,0,1.72,25094,2.64,38,0.039567,2401.0,2401.0


In [28]:
df_vid.head(1)

Unnamed: 0,DATE,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
0,19 Jan 2022,Kaggle Project From Scratch - Part 2 (Explorat...,KQ80oD_boBM,2191,https://i.ytimg.com/vi/KQ80oD_boBM/hqdefault.jpg,13,0,0,0,0,0,0.069055,151.300154,0


In [29]:
# CONVERTING DATE to datetime
df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed').dt.date

In [30]:
df_agg_sub.head(1)

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
0,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,59,https://i.ytimg.com/vi/OtqQYqRNDGI/hqdefault.jpg,HK,True,23,1,0,0,2,0,0.67187,39.640348,0


In [31]:
df_com.head(1)

Unnamed: 0,COMMENTS,COMMENT_ID,REPLY_COUNT,LIKE_COUNT,DATE,VIDID,USER_ID
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22T08:13:29Z,xpIFS6jZbe8,user_981


In [32]:
df_com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   COMMENTS     10239 non-null  object
 1   COMMENT_ID   10240 non-null  object
 2   REPLY_COUNT  10240 non-null  int64 
 3   LIKE_COUNT   10240 non-null  int64 
 4   DATE         10240 non-null  object
 5   VIDID        10240 non-null  object
 6   USER_ID      10240 non-null  object
dtypes: int64(2), object(5)
memory usage: 560.1+ KB


In [33]:
df_com['DATE'] = pd.to_datetime(df_com['DATE']).dt.date

In [34]:
df_com.head(1)

Unnamed: 0,COMMENTS,COMMENT_ID,REPLY_COUNT,LIKE_COUNT,DATE,VIDID,USER_ID
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22,xpIFS6jZbe8,user_981


In [35]:
df_com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   COMMENTS     10239 non-null  object
 1   COMMENT_ID   10240 non-null  object
 2   REPLY_COUNT  10240 non-null  int64 
 3   LIKE_COUNT   10240 non-null  int64 
 4   DATE         10240 non-null  object
 5   VIDID        10240 non-null  object
 6   USER_ID      10240 non-null  object
dtypes: int64(2), object(5)
memory usage: 560.1+ KB


In [36]:
# engineer data
## what metrics wil be relevant
## difference from baseline
## percent change

# build dashboard
## local picture
## individual video

# improvement

In [37]:
def load_data():

    # load data
    df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

    # dropping row Total to avoid issue with calculations
    df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:,:]

    df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

    df_com = pd.read_csv('resource/All_Comments_Final.csv')

    # converting all columns to uppercase
    for df in [df_agg,df_vid,df_agg_sub, df_com]:
        df.columns = df.columns.str.upper()
        
    # remove \xad
    df_agg.columns = df_agg.columns.str.replace('\xad','')
    # convert date to datetype

    df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

    df_agg['AVERAGE VIEW DURATION'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: datetime.strptime(x, '%H:%M:%S'))

    # create new column for df_agg['AVERAGE VIEW SECONDS'
    df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: x.second + x.minute * 60 + x.hour * 60 * 60)

    # engagement ration, every engagement a view could do divided by the number of viewrs
    df_agg['ENGAGEMENT RATIO'] = (df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']) / df_agg['VIEWS'] 

    # ratio of views to subscribers gained
    df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED'] # how views does it take to gain a subscriber

    # ratio of views to subscribers lost, 
    df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST'] # how views does it take to lose a subscriber

    # sort data by 'VIDEO PUBLISH TIME'
    df_agg.sort_values(by = 'VIDEO PUBLISH TIME', ascending=False, inplace=True)

    # CONVERTING DATE to datetime
    df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')

    df_com['DATE'] = pd.to_datetime(df_com['DATE'])
    
    return df_vid, df_agg, df_agg_sub, df_com

In [38]:
    
df_vid, df_agg, df_agg_sub, df_com = load_data()

In [39]:
# aggregated differential 

# create a copy of our dataframe
df_agg_diff = df_agg.copy()

In [40]:
# for the last 12 months, most recently date back to 12 months
metric_date_12mo = df_agg_diff['VIDEO PUBLISH TIME'].max() - DateOffset(months=12)
metric_date_12mo # 12 months early date

Timestamp('2021-01-17 00:00:00')

In [41]:
# dataframe from metric_date_12mo to df_agg_diff['VIDEO PUBLISH TIME'].max()
# that is, from 12 monts early to current date

df_agg_diff_12mo = df_agg_diff[df_agg_diff['VIDEO PUBLISH TIME'] >= metric_date_12mo] # we have 48 rows
df_agg_diff_12mo.shape

(48, 23)

In [42]:
median_agg = df_agg_diff_12mo[df_agg_diff_12mo.columns[2:]].median()
median_agg

VIDEO PUBLISH TIME                    2021-07-06 00:00:00
COMMENTS ADDED                                       43.5
SHARES                                               42.5
DISLIKES                                              5.0
LIKES                                               382.5
SUBSCRIBERS LOST                                     13.0
SUBSCRIBERS GAINED                                   56.5
RPM (USD)                                            4.37
CPM (USD)                                          10.573
AVERAGE PERCENTAGE VIEWED (%)                      41.175
AVERAGE VIEW DURATION                 1900-01-01 00:02:46
VIEWS                                              7417.0
WATCH TIME (HOURS)                               279.9851
SUBSCRIBERS                                          38.5
YOUR ESTIMATED REVENUE (USD)                      24.7995
IMPRESSIONS                                      155102.5
IMPRESSIONS CLICK-THROUGH RATE (%)                   2.43
AVERAGE VIEW S

In [43]:
# aggregated differential 

# create a copy of our dataframe
df_agg_diff = df_agg.copy()

In [44]:
df_agg['VIDEO PUBLISH TIME']

111   2022-01-17
187   2022-01-14
64    2022-01-10
59    2022-01-03
32    2021-12-27
         ...    
75    2018-11-14
190   2018-11-12
204   2018-07-10
138   2017-11-18
223   2017-06-06
Name: VIDEO PUBLISH TIME, Length: 223, dtype: datetime64[ns]

In [45]:
# for the last 12 months, most recently date back to 12 months
metric_date_12mo = df_agg_diff['VIDEO PUBLISH TIME'].max() - DateOffset(months=12)

metric_date_12mo = metric_date_12mo

# dataframe from metric_date_12mo to df_agg_diff['VIDEO PUBLISH TIME'].max()
# that is, from 12 months early to current date

df_agg_diff_12mo = df_agg_diff[df_agg_diff['VIDEO PUBLISH TIME'] >= metric_date_12mo]
# median 
median_agg = df_agg_diff_12mo[df_agg_diff_12mo.columns[2:]].median()

In [46]:
median_agg

VIDEO PUBLISH TIME                    2021-07-06 00:00:00
COMMENTS ADDED                                       43.5
SHARES                                               42.5
DISLIKES                                              5.0
LIKES                                               382.5
SUBSCRIBERS LOST                                     13.0
SUBSCRIBERS GAINED                                   56.5
RPM (USD)                                            4.37
CPM (USD)                                          10.573
AVERAGE PERCENTAGE VIEWED (%)                      41.175
AVERAGE VIEW DURATION                 1900-01-01 00:02:46
VIEWS                                              7417.0
WATCH TIME (HOURS)                               279.9851
SUBSCRIBERS                                          38.5
YOUR ESTIMATED REVENUE (USD)                      24.7995
IMPRESSIONS                                      155102.5
IMPRESSIONS CLICK-THROUGH RATE (%)                   2.43
AVERAGE VIEW S

In [47]:
# local picture

metric_agg = df_agg[[
    'VIDEO PUBLISH TIME',
    'COMMENTS ADDED', 
    'SHARES', 
    'DISLIKES', 
    'LIKES', 
    'SUBSCRIBERS GAINED', 
    'RPM (USD)', 
    'VIEWS', 
    'YOUR ESTIMATED REVENUE (USD)',
    'AVERAGE VIEW SECONDS', 
    'ENGAGEMENT RATIO', 
    'VIEW TO SUBSCRIBER RATIO',
]]

def metric_median(n):
    # 
    metric_date_n = metric_agg['VIDEO PUBLISH TIME'].max() - DateOffset(months=n)
    median_date_n = metric_agg[metric_agg['VIDEO PUBLISH TIME'] >= metric_date_n].median()
    
    return metric_date_n,median_date_n

metric_12mo, median_12mo = metric_median(12)
metric_6mo, median_6mo = metric_median(6)

In [48]:
len(metric_agg.columns)

12

In [49]:
median_6mo.index

Index(['VIDEO PUBLISH TIME', 'COMMENTS ADDED', 'SHARES', 'DISLIKES', 'LIKES',
       'SUBSCRIBERS GAINED', 'RPM (USD)', 'VIEWS',
       'YOUR ESTIMATED REVENUE (USD)', 'AVERAGE VIEW SECONDS',
       'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO'],
      dtype='object')

In [50]:
median_6mo['VIEWS']

6062.0

In [51]:
for i in median_6mo.index:
    if i != 'VIDEO PUBLISH TIME':
        pass
        print(median_6mo[i] - median_12mo[i])
        #print((median_6mo[i] - median_12mo[i])/median_12mo[i])
    else:
        delta = median_6mo[i] - median_12mo[i]
        print(delta)
        print(delta.days)
        
        #st.metric(label = 'Duration', value = delta, delta=f"{delta} Days")

125 days 12:00:00
125
-7.5
-1.5
-0.5
-77.5
-21.0
0.024999999999999467
-1355.0
0.0
9.0
0.00682825754484416
6.723321127484667


In [52]:
df_agg_diff.dtypes

VIDEO                                         object
VIDEO TITLE                                   object
VIDEO PUBLISH TIME                    datetime64[ns]
COMMENTS ADDED                                 int64
SHARES                                         int64
DISLIKES                                       int64
LIKES                                          int64
SUBSCRIBERS LOST                               int64
SUBSCRIBERS GAINED                             int64
RPM (USD)                                    float64
CPM (USD)                                    float64
AVERAGE PERCENTAGE VIEWED (%)                float64
AVERAGE VIEW DURATION                 datetime64[ns]
VIEWS                                          int64
WATCH TIME (HOURS)                           float64
SUBSCRIBERS                                    int64
YOUR ESTIMATED REVENUE (USD)                 float64
IMPRESSIONS                                    int64
IMPRESSIONS CLICK-THROUGH RATE (%)           f

In [53]:
df_agg_diff.columns.tolist()

['VIDEO',
 'VIDEO TITLE',
 'VIDEO PUBLISH TIME',
 'COMMENTS ADDED',
 'SHARES',
 'DISLIKES',
 'LIKES',
 'SUBSCRIBERS LOST',
 'SUBSCRIBERS GAINED',
 'RPM (USD)',
 'CPM (USD)',
 'AVERAGE PERCENTAGE VIEWED (%)',
 'AVERAGE VIEW DURATION',
 'VIEWS',
 'WATCH TIME (HOURS)',
 'SUBSCRIBERS',
 'YOUR ESTIMATED REVENUE (USD)',
 'IMPRESSIONS',
 'IMPRESSIONS CLICK-THROUGH RATE (%)',
 'AVERAGE VIEW SECONDS',
 'ENGAGEMENT RATIO',
 'VIEW TO SUBSCRIBER RATIO',
 'VIEW TO SUBSCRIBER LOST RATIO']

In [54]:
df_agg_diff_final = df_agg_diff.loc[:,[
 'VIDEO',
 'VIDEO TITLE',
 'VIDEO PUBLISH TIME',
 'COMMENTS ADDED',
 'SHARES',
 'DISLIKES',
 'LIKES',
 'SUBSCRIBERS LOST',
 'SUBSCRIBERS GAINED',
 'VIEWS',
 'SUBSCRIBERS',
 'YOUR ESTIMATED REVENUE (USD)',
 'IMPRESSIONS',
 'IMPRESSIONS CLICK-THROUGH RATE (%)',
 'AVERAGE VIEW SECONDS',
 'ENGAGEMENT RATIO',
 'VIEW TO SUBSCRIBER RATIO',
 'VIEW TO SUBSCRIBER LOST RATIO']
]
df_agg_diff_final.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,VIEWS,SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4383,4,16.549,65130,2.95,158,0.080995,243.5,313.071429


In [55]:
df_agg['PUBLISH DATE'] = df_agg.loc[:,'VIDEO PUBLISH TIME']

In [56]:
df_agg['VIDEO PUBLISH TIME'][9]

Timestamp('2020-07-01 00:00:00')

In [57]:
df_agg['PUBLISH DATE'][9]

Timestamp('2020-07-01 00:00:00')

In [58]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO,PUBLISH DATE
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4.055,...,192.5779,4,16.549,65130,2.95,158,0.080995,243.5,313.071429,2022-01-17


In [59]:
# list of numerical columns
df_agg_numeric_lst = df_agg_diff_final[df_agg_diff_final.columns[2:]].columns.tolist()
df_agg_numeric_lst

['VIDEO PUBLISH TIME',
 'COMMENTS ADDED',
 'SHARES',
 'DISLIKES',
 'LIKES',
 'SUBSCRIBERS LOST',
 'SUBSCRIBERS GAINED',
 'VIEWS',
 'SUBSCRIBERS',
 'YOUR ESTIMATED REVENUE (USD)',
 'IMPRESSIONS',
 'IMPRESSIONS CLICK-THROUGH RATE (%)',
 'AVERAGE VIEW SECONDS',
 'ENGAGEMENT RATIO',
 'VIEW TO SUBSCRIBER RATIO',
 'VIEW TO SUBSCRIBER LOST RATIO']

In [60]:
df_to_percent = {}
for i in df_agg_numeric_lst:
    df_to_percent[i] = '{:.1%}'.format

In [61]:
df_to_percent

{'VIDEO PUBLISH TIME': <function str.format>,
 'COMMENTS ADDED': <function str.format>,
 'SHARES': <function str.format>,
 'DISLIKES': <function str.format>,
 'LIKES': <function str.format>,
 'SUBSCRIBERS LOST': <function str.format>,
 'SUBSCRIBERS GAINED': <function str.format>,
 'VIEWS': <function str.format>,
 'SUBSCRIBERS': <function str.format>,
 'YOUR ESTIMATED REVENUE (USD)': <function str.format>,
 'IMPRESSIONS': <function str.format>,
 'IMPRESSIONS CLICK-THROUGH RATE (%)': <function str.format>,
 'AVERAGE VIEW SECONDS': <function str.format>,
 'ENGAGEMENT RATIO': <function str.format>,
 'VIEW TO SUBSCRIBER RATIO': <function str.format>,
 'VIEW TO SUBSCRIBER LOST RATIO': <function str.format>}

In [62]:
numeric_columns = df_agg_diff_final.select_dtypes(include=['number'])

In [63]:
# rename column
df_agg.rename(columns={'VIDEO PUBLISH TIME': 'PUBLISH DATE', 'COMMENTS ADDED' : 'COMMENTS'}, inplace=True)

In [64]:
df_agg.columns

Index(['VIDEO', 'VIDEO TITLE', 'PUBLISH DATE', 'COMMENTS', 'SHARES',
       'DISLIKES', 'LIKES', 'SUBSCRIBERS LOST', 'SUBSCRIBERS GAINED',
       'RPM (USD)', 'CPM (USD)', 'AVERAGE PERCENTAGE VIEWED (%)',
       'AVERAGE VIEW DURATION', 'VIEWS', 'WATCH TIME (HOURS)', 'SUBSCRIBERS',
       'YOUR ESTIMATED REVENUE (USD)', 'IMPRESSIONS',
       'IMPRESSIONS CLICK-THROUGH RATE (%)', 'AVERAGE VIEW SECONDS',
       'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO',
       'VIEW TO SUBSCRIBER LOST RATIO', 'PUBLISH DATE'],
      dtype='object')

In [65]:
# list of numerical columns
df_agg_numeric_lst = df_agg_diff_final[df_agg_diff_final.columns[2:]].columns.tolist()

df_to_percent = {}
for i in df_agg_numeric_lst:
    df_to_percent[i] = '{:.1%}'.format
    
df_to_percent

{'VIDEO PUBLISH TIME': <function str.format>,
 'COMMENTS ADDED': <function str.format>,
 'SHARES': <function str.format>,
 'DISLIKES': <function str.format>,
 'LIKES': <function str.format>,
 'SUBSCRIBERS LOST': <function str.format>,
 'SUBSCRIBERS GAINED': <function str.format>,
 'VIEWS': <function str.format>,
 'SUBSCRIBERS': <function str.format>,
 'YOUR ESTIMATED REVENUE (USD)': <function str.format>,
 'IMPRESSIONS': <function str.format>,
 'IMPRESSIONS CLICK-THROUGH RATE (%)': <function str.format>,
 'AVERAGE VIEW SECONDS': <function str.format>,
 'ENGAGEMENT RATIO': <function str.format>,
 'VIEW TO SUBSCRIBER RATIO': <function str.format>,
 'VIEW TO SUBSCRIBER LOST RATIO': <function str.format>}

In [66]:
df_agg_diff_final['VIEWS'].sum()

5567998

In [67]:
def ratio_percent(col):
    col = col / col.sum()
    return col

In [68]:
df = df_agg_diff_final

# Select only the numeric columns
numeric_columns = df.select_dtypes(include=['number'])

# Calculate column sums
column_sums = numeric_columns.sum()

# Divide each element by its column sum and multiply by 100 to get percentages
df_percentage = (numeric_columns / column_sums) * 100

# Round the percentages to 2 decimal places
df_percentage = df_percentage.round(4)

# Include the previously excluded non-numeric columns
non_numeric_columns = df.select_dtypes(exclude=['number'])

# Concatenate numeric and non-numeric columns
df_result = pd.concat([non_numeric_columns, df_percentage], axis=1)

df_result

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,VIEWS,SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,0.2607,0.1085,0.2050,0.1187,0.4735,0.0137,0.0787,0.0031,0.0569,0.0645,0.4290,0.3531,0.6290,0.0,0.0
187,2RWwN5ZT4tA,Should @Luke Barousse Take This Data Analyst ...,2022-01-14,0.0845,0.0050,0.0769,0.0347,0.0338,0.0008,0.0431,0.0000,0.0059,0.0249,0.3839,0.0849,0.3073,0.0,0.0
64,rEWPqw6rMGI,The Only Data Science Explanation You Need,2022-01-10,0.4368,0.3557,0.1281,0.3209,0.9469,0.1038,0.1846,0.0844,0.2081,0.2135,0.3228,0.6257,0.7027,0.0,0.0
59,o-wsyxWbPOw,We Need to Talk About The LinkedIn Machine Lea...,2022-01-03,0.4579,0.0908,0.3075,0.2631,0.3382,0.0596,0.2121,0.0531,0.2152,0.1653,0.4828,0.3710,0.4637,0.0,0.0
32,xpIFS6jZbe8,How I Would Learn Data Science in 2022 (If I H...,2021-12-27,0.7679,1.9350,1.3583,1.9615,1.5556,1.9493,1.4239,1.9584,1.8174,1.4076,0.4813,0.6012,0.5232,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,RRSRKf9eQxc,Should You Get A Masters in Data Science?,2018-11-14,0.3945,0.1034,0.2563,0.1227,0.0676,0.0618,0.3320,0.0617,0.4703,0.1720,1.2215,0.3397,0.1609,0.0,0.0
190,IFceyuL6GZY,How I Became A Data Scientist From a Business ...,2018-11-12,0.0775,0.0833,0.1025,0.0747,0.0000,0.0618,0.0990,0.0633,0.0838,0.0583,0.8318,0.5296,0.3042,0.0,
204,Y_SMU701qlA,Predicting Season Long NBA Wins Using Multiple...,2018-07-10,0.0493,0.1135,0.0513,0.0707,0.0338,0.0260,0.1233,0.0258,0.0680,0.0534,0.5860,0.3240,0.2410,0.0,0.0
138,qfRhKHV8-t4,Predicting Crypto-Currency Price Using RNN lST...,2017-11-18,0.1973,0.2876,0.4613,0.1098,0.0338,0.0848,0.2974,0.0859,0.0755,0.1669,0.8216,0.2347,0.1909,0.0,0.0


In [69]:
df.select_dtypes(include=['number','datetime'])

Unnamed: 0,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,VIEWS,SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
111,2022-01-17,37,43,8,267,14,18,4383,4,16.549,65130,2.95,158,0.080995,243.500000,3.130714e+02
187,2022-01-14,12,2,3,78,1,1,2401,0,1.720,25094,2.64,38,0.039567,2401.000000,2.401000e+03
64,2022-01-10,62,141,5,722,28,136,10277,108,60.498,215491,2.22,280,0.090493,75.566176,3.670357e+02
59,2022-01-03,65,36,12,592,10,78,11808,68,62.568,166915,3.32,166,0.059705,151.384615,1.180800e+03
32,2021-12-27,109,767,53,4413,46,2553,79283,2507,528.286,1420968,3.31,269,0.067379,31.054837,1.723543e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,2018-11-14,56,41,10,276,2,81,18488,79,136.708,173610,8.40,152,0.020716,228.246914,9.244000e+03
190,2018-11-12,11,33,4,168,0,81,5515,81,24.358,58816,5.72,237,0.039166,68.086420,inf
204,2018-07-10,7,45,2,159,1,34,6863,33,19.772,53865,4.03,145,0.031036,201.852941,6.863000e+03
138,2017-11-18,28,114,18,247,1,111,16558,110,21.944,168508,5.65,105,0.024580,149.171171,1.655800e+04


In [70]:
x = '45%'
int(x.split('%')[0])

45

In [71]:
if '%' in x:
    print('x')

x


In [72]:
int(x.split('%')[0]) if '%' in x else x

45

In [79]:
df.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,VIEWS,SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4383,4,16.549,65130,2.95,158,0.080995,243.5,313.071429


In [81]:
df['VIDEO PUBLISH TIME'].describe()

count                              223
mean     2020-06-08 18:37:07.802690560
min                2017-06-06 00:00:00
25%                2019-12-21 12:00:00
50%                2020-06-20 00:00:00
75%                2020-11-16 12:00:00
max                2022-01-17 00:00:00
Name: VIDEO PUBLISH TIME, dtype: object

In [93]:
df['VIDEO PUBLISH TIME'].max()

Timestamp('2022-01-17 00:00:00')

In [94]:
df['VIDEO PUBLISH TIME'].min()

Timestamp('2017-06-06 00:00:00')

In [86]:
(df['VIDEO PUBLISH TIME'].max() - df['VIDEO PUBLISH TIME'].min())/30

Timedelta('56 days 04:48:00')

In [95]:
df['VIDEO PUBLISH TIME'].max() - DateOffset(months=4)

Timestamp('2021-09-17 00:00:00')

In [73]:
items = [1,23,'de','56%']
for item in items:
    if isinstance(item, int):
        if item >= 0:
            print(item, 'greater than zero')
    else:
        if '%' in item:
            item = int(item.split('%')[0])
            if item >= 0:
                print(item, 'was splited then converted')
        else:
            None

1 greater than zero
23 greater than zero
56 was splited then converted


In [74]:
if isinstance(value, int):
    if value >= 0:
        return props
else:
    if '%' in value:
        value = int(value.split('%')[0])
        if value >= 0:
            return props
    else:
        None

SyntaxError: 'return' outside function (2727128901.py, line 3)

In [75]:
# define functions


In [76]:
# load data
@st.cache_resource # loads this function once and doesn't reload everytime we reload our page
def load_data():

    # load data
    df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

    # dropping row Total to avoid issue with calculations
    df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:,:]

    df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

    df_com = pd.read_csv('resource/All_Comments_Final.csv')

    # converting all columns to uppercase
    for df in [df_agg,df_vid,df_agg_sub, df_com]:
        df.columns = df.columns.str.upper()

    # remove \xad
    df_agg.columns = df_agg.columns.str.replace('\xad','')
    # convert date to datetype

    df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')
        
    df_agg['AVERAGE VIEW DURATION'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: datetime.strptime(x, '%H:%M:%S'))

    # create new column for df_agg['AVERAGE VIEW SECONDS'
    df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: x.second + x.minute * 60 + x.hour * 60 * 60)

    # engagement ration, every engagement a view could do divided by the number of viewrs
    df_agg['ENGAGEMENT RATIO'] = (df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']) / df_agg['VIEWS'] 

    # ratio of views to subscribers gained
    df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED'] # how views does it take to gain a subscriber

    # ratio of views to subscribers lost, 
    df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST'] # how views does it take to lose a subscriber

    # sort data by 'VIDEO PUBLISH TIME'
    df_agg.sort_values(by = 'VIDEO PUBLISH TIME', ascending=False, inplace=True)

    # CONVERTING DATE to datetime
    df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')

    df_com['DATE'] = pd.to_datetime(df_com['DATE'])

    # create dataframe
    return df_vid, df_agg, df_agg_sub, df_com

df_vid, df_agg, df_agg_sub, df_com = load_data()


# engineer data

# aggregated differential 

# create a copy of our dataframe
df_agg_diff = df_agg.copy()

# for the last 12 months, most recently date back to 12 months
metric_date_12mo = df_agg_diff['VIDEO PUBLISH TIME'].max() - DateOffset(months=12)

# dataframe from metric_date_12mo to df_agg_diff['VIDEO PUBLISH TIME'].max()
# that is, from 12 months early to current date

df_agg_diff_12mo = df_agg_diff[df_agg_diff['VIDEO PUBLISH TIME'] >= metric_date_12mo]

# median 
median_agg = df_agg_diff_12mo[df_agg_diff_12mo.columns[2:]].median()

## what metrics wil be relevant
## difference from baseline
## percent change

# build dashboard

# sidebar
add_sidebar = st.sidebar.selectbox("Aggregate or Individual Video", ("Aggregate Metrics", "Individual Video Analysis"))

# styling dataframe
def styling_positive(value, props):
    try:
        return props if value >= 0 else None
    except:
        pass


def styling_negative(value, props):
    try:
        return props if value < 0 else None
    except:
        pass

## local picture
if add_sidebar == "Aggregate Metrics":

    metric_agg = df_agg[[
        'VIDEO PUBLISH TIME',
        'COMMENTS ADDED', 
        'SHARES', 
        'DISLIKES', 
        'LIKES', 
        'SUBSCRIBERS GAINED', 
        'RPM (USD)', 
        'VIEWS', 
        'YOUR ESTIMATED REVENUE (USD)',
        'AVERAGE VIEW SECONDS', 
        'ENGAGEMENT RATIO', 
        'VIEW TO SUBSCRIBER RATIO',
    ]]  

    def metric_median(n):
        # date range
        metric_date_n = metric_agg['VIDEO PUBLISH TIME'].max() - DateOffset(months=n)
        median_date_n = metric_agg[metric_agg['VIDEO PUBLISH TIME'] >= metric_date_n].median()
        
        return metric_date_n,median_date_n

    metric_12mo, median_12mo = metric_median(12)
    metric_6mo, median_6mo = metric_median(6)

    col1, col2, col3,col4,col5,col6 = st.columns(6)
    columns = [col1, col2, col3,col4,col5,col6]

    count = 0
    for i in median_6mo.index:
        with columns[count]:
            if i != 'VIDEO PUBLISH TIME':
                delta = (median_6mo[i] - median_12mo[i])/median_12mo[i]
                st.metric(label = i, value =round(median_6mo[i]), delta="{:.2%}".format(delta))
            else:
                delta = median_6mo[i] - median_12mo[i]
                st.metric(label = 'Duration', value = delta.days, delta=f"{(delta//30)} Months")
            count += 1
            if count >= 6:
                count = 0


    df_agg_diff_final = df_agg_diff.loc[:,[
    'VIDEO',
    'VIDEO TITLE',
    'VIDEO PUBLISH TIME',
    'COMMENTS ADDED',
    'SHARES',
    'DISLIKES',
    'LIKES',
    'SUBSCRIBERS LOST',
    'SUBSCRIBERS GAINED',
    'VIEWS',
    'SUBSCRIBERS',
    'YOUR ESTIMATED REVENUE (USD)',
    'IMPRESSIONS',
    'IMPRESSIONS CLICK-THROUGH RATE (%)',
    'AVERAGE VIEW DURATION',
    'AVERAGE VIEW SECONDS',
    'ENGAGEMENT RATIO',
    'VIEW TO SUBSCRIBER RATIO',
    'VIEW TO SUBSCRIBER LOST RATIO']
    ]
    
    # extract only date
    df_agg_diff_final['VIDEO PUBLISH TIME'] = df_agg_diff_final['VIDEO PUBLISH TIME'].dt.date

    # rename column

    df_agg_diff_final.rename(columns={'VIDEO PUBLISH TIME': 'PUBLISH DATE'}, inplace=True)

    # extracting time
    df_agg_diff_final['AVERAGE VIEW DURATION'] = df_agg_diff_final['AVERAGE VIEW DURATION'].dt.time


    # formating each data to percentage
    

    st.dataframe(df_agg_diff_final.style.hide().map(styling_positive, props = 'color:green;').map(styling_negative, props = 'color:red;'))

elif add_sidebar =="Individual Video Analysis":
    st.write('Ind')


## individual video

# improvement

# styling

2024-03-15 19:28:44.777 
  command:

    streamlit run /home/onscript/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [98]:
tuple(df_agg['VIDEO TITLE'])[9]

'Is Data Science Dying?'

In [99]:
# videos title list
selected_video = tuple(df_agg['VIDEO TITLE'])[9]
# filtered by title
filtered_video = df_agg[df_agg['VIDEO TITLE'] == selected_video]

filtered_agg_sub = df_agg_sub[df_agg_sub['VIDEO TITLE'] == selected_video]

In [100]:
filtered_video

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
24,2qVWurPFwfc,Is Data Science Dying?,2021-11-19,148,408,56,2378,34,981,5.272,...,70043,3736.819,947,368.53,1071989,3.78,192,0.042688,71.399592,2060.088235


In [102]:
filtered_agg_sub.head(1)

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
27543,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,MD,False,10,0,0,0,0,0,0.368369,202.2347,0


In [105]:
filtered_agg_sub.columns

Index(['VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO LENGTH', 'THUMBNAIL LINK',
       'COUNTRY CODE', 'IS SUBSCRIBED', 'VIEWS', 'VIDEO LIKES ADDED',
       'VIDEO DISLIKES ADDED', 'VIDEO LIKES REMOVED',
       'USER SUBSCRIPTIONS ADDED', 'USER SUBSCRIPTIONS REMOVED',
       'AVERAGE VIEW PERCENTAGE', 'AVERAGE WATCH TIME', 'USER COMMENTS ADDED'],
      dtype='object')

In [123]:
filtered_agg_sub['COUNTRY CODE'].unique()

array(['MD', 'BD', 'BA', 'ZM', 'FJ', 'GU', 'BW', 'EG', 'BY', 'CM', 'VE',
       'NL', 'IS', 'MK', 'CU', 'UZ', 'MC', 'BR', 'UA', 'SV', 'PT', 'SY',
       'IN', 'LC', 'MR', 'UG', 'BG', 'GA', 'TJ', 'ES', 'MZ', 'AZ', 'HU',
       'CI', 'HN', 'SG', 'BB', 'CR', 'MO', 'SZ', 'ET', 'NZ', 'TZ', 'BM',
       'PY', 'BO', 'CA', 'PL', 'CY', 'BF', 'JO', 'HK', 'GH', 'ML', 'ID',
       'AF', 'AW', 'LB', 'CZ', 'MM', 'GQ', 'TN', 'DO', 'KW', 'BN', 'NI',
       'KG', 'ZA', 'JM', 'AO', 'RU', 'GD', 'LK', 'IR', 'SE', 'IT', 'CL',
       'DJ', 'MN', 'AR', 'MT', 'PE', 'CW', 'MU', 'TD', 'TR', 'YE', 'RW',
       'FR', 'EE', 'CH', 'DZ', 'SO', 'SL', 'MQ', 'AG', 'LS', 'UY', 'TW',
       'AD', 'FI', 'OM', 'EC', 'ZW', 'VI', 'EH', 'SN', 'CG', 'BT', 'KY',
       'LA', 'KE', 'BJ', 'BZ', 'NE', 'NP', 'BH', 'TC', 'CN', 'HT', 'LT',
       'DE', 'LI', 'IL', 'PS', 'SD', 'LY', 'PK', 'WS', 'AT', 'KR', 'AM',
       'MX', 'TG', 'MA', 'AE', 'TH', 'BE', 'NG', 'KZ', 'GT', 'GN', 'US',
       'PA', 'MV', 'GM', 'GB', 'ZZ', 'JE', 'LU', 'F

In [136]:
filtered_agg_sub.head()

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED,COUNTRY
27543,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,MD,False,10,0,0,0,0,0,0.368369,202.2347,0,OTHERS
27544,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,BD,False,494,10,0,0,0,0,0.310578,170.507051,0,OTHERS
27545,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,BA,False,25,1,0,0,0,0,0.288771,158.53508,0,OTHERS
27546,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,ZM,False,21,0,0,0,0,0,0.504413,276.922524,0,OTHERS
27547,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,FJ,True,1,1,0,0,1,0,0.99745,547.6,0,OTHERS


In [117]:
df_agg_sub[df_agg_sub['COUNTRY CODE'] == 'US']

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
3,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,59,https://i.ytimg.com/vi/OtqQYqRNDGI/hqdefault.jpg,US,True,979,81,6,8,16,4,0.694854,40.996389,0
12,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,59,https://i.ytimg.com/vi/OtqQYqRNDGI/hqdefault.jpg,US,False,881,20,10,1,0,0,0.614361,36.247291,0
330,git for Data Science Made Simple... (Hopefully),_0rHU6qAQe0,392,https://i.ytimg.com/vi/_0rHU6qAQe0/hqdefault.jpg,US,False,2083,25,0,2,0,0,0.271922,106.593408,0
395,git for Data Science Made Simple... (Hopefully),_0rHU6qAQe0,392,https://i.ytimg.com/vi/_0rHU6qAQe0/hqdefault.jpg,US,True,1570,144,1,2,23,6,0.457398,179.300191,0
599,Work From Home Data Scientist: Day in the Life,4CpmB4TR2C4,331,https://i.ytimg.com/vi/4CpmB4TR2C4/hqdefault.jpg,US,False,6079,110,13,7,0,0,0.503921,166.797816,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54837,#66DaysOfData Round 3 Live Event! (feat. @Stat...,qUK5Vk4NvBw,3735,https://i.ytimg.com/vi/qUK5Vk4NvBw/hqdefault.jpg,US,True,299,23,0,0,1,3,0.086261,322.183716,0
55005,#66DaysOfData - What is it? #shorts,iiSZqsQKNX8,47,https://i.ytimg.com/vi/iiSZqsQKNX8/hqdefault.jpg,US,True,727,41,0,1,6,0,0.692768,32.560109,0
55014,#66DaysOfData - What is it? #shorts,iiSZqsQKNX8,47,https://i.ytimg.com/vi/iiSZqsQKNX8/hqdefault.jpg,US,False,294,7,1,3,0,0,0.685302,32.209211,0
55130,#66DaysOfData - 3 Reasons to Start!,sICJ6a2wX5g,53,https://i.ytimg.com/vi/sICJ6a2wX5g/hqdefault.jpg,US,False,122,1,0,0,0,0,0.585581,31.035811,0


In [137]:
filtered_agg_sub.describe()

Unnamed: 0,VIDEO LENGTH,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
count,353.0,353.0,353.0,353.0,353.0,353.0,353.0,348.0,348.0,353.0
mean,549.0,198.016997,6.886686,0.235127,0.172805,2.773371,0.096317,0.403631,221.593625,0.0
std,0.0,1212.977212,35.702506,1.437608,1.400547,21.315399,0.667265,0.165549,90.886294,0.0
min,549.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001614,0.886,0.0
25%,549.0,3.0,0.0,0.0,0.0,0.0,0.0,0.328741,180.478609,0.0
50%,549.0,18.0,1.0,0.0,0.0,0.0,0.0,0.402617,221.036922,0.0
75%,549.0,98.0,4.0,0.0,0.0,1.0,0.0,0.470506,258.307785,0.0
max,549.0,20490.0,527.0,23.0,24.0,381.0,9.0,0.99745,547.6,0.0


In [140]:
df_com.head(1)

Unnamed: 0,COMMENTS,COMMENT_ID,REPLY_COUNT,LIKE_COUNT,DATE,VIDID,USER_ID
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981


In [144]:
df_com['DATE']

0       2022-01-22 08:13:29+00:00
1       2022-01-22 01:09:46+00:00
2       2022-01-21 23:59:05+00:00
3       2022-01-21 23:00:37+00:00
4       2022-01-21 20:24:20+00:00
                   ...           
10235   2018-07-06 07:08:39+00:00
10236   2018-05-15 00:01:12+00:00
10237   2018-05-04 20:58:54+00:00
10238   2018-02-20 16:46:19+00:00
10239   2017-12-03 18:52:35+00:00
Name: DATE, Length: 10240, dtype: datetime64[ns, UTC]

In [146]:
df_agg.columns

Index(['VIDEO', 'VIDEO TITLE', 'VIDEO PUBLISH TIME', 'COMMENTS ADDED',
       'SHARES', 'DISLIKES', 'LIKES', 'SUBSCRIBERS LOST', 'SUBSCRIBERS GAINED',
       'RPM (USD)', 'CPM (USD)', 'AVERAGE PERCENTAGE VIEWED (%)',
       'AVERAGE VIEW DURATION', 'VIEWS', 'WATCH TIME (HOURS)', 'SUBSCRIBERS',
       'YOUR ESTIMATED REVENUE (USD)', 'IMPRESSIONS',
       'IMPRESSIONS CLICK-THROUGH RATE (%)', 'AVERAGE VIEW SECONDS',
       'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO',
       'VIEW TO SUBSCRIBER LOST RATIO'],
      dtype='object')

In [170]:
df_com.columns

Index(['COMMENTS', 'COMMENT_ID', 'REPLY_COUNT', 'LIKE_COUNT', 'DATE', 'VIDEO',
       'USER_ID'],
      dtype='object')

In [150]:
df_agg.loc[:, ['VIDEO', 'VIDEO PUBLISH TIME', 'SHARES']]

Unnamed: 0,VIDEO,VIDEO PUBLISH TIME,SHARES
111,0jTtHYie3CU,2022-01-17,43
187,2RWwN5ZT4tA,2022-01-14,2
64,rEWPqw6rMGI,2022-01-10,141
59,o-wsyxWbPOw,2022-01-03,36
32,xpIFS6jZbe8,2021-12-27,767
...,...,...,...
75,RRSRKf9eQxc,2018-11-14,41
190,IFceyuL6GZY,2018-11-12,33
204,Y_SMU701qlA,2018-07-10,45
138,qfRhKHV8-t4,2017-11-18,114


In [162]:
p

In [163]:
df_diff_time.columns

Index(['COMMENTS', 'COMMENT_ID', 'REPLY_COUNT', 'LIKE_COUNT', 'DATE', 'VIDEO',
       'USER_ID', 'VIDEO PUBLISH TIME', 'SHARES'],
      dtype='object')

In [166]:
df_diff_time.describe().columns

Index(['REPLY_COUNT', 'LIKE_COUNT', 'VIDEO PUBLISH TIME', 'SHARES'], dtype='object')

In [None]:
# numeic columns
df_diff_time.describe().columns

col1,col2,col3,col4 = st.columns
count = 0
for i in df_diff_time.describe().columns:
    with columns[count]:
        delta = df_diff_time.[i].count()
        st.metric(label = i.replace('_', ' '), value=delta, delta=delta)
        coun +=1
        if count >= 2:
            count = 0

In [None]:
display_col = [
    [
        'PUBLISH DATE',
        'VIDEO LENGTH',
        'SHARES',
        'LIKES',
        'DISLIKES',
        'SUBSCRIPTIONS',
        'COMMENTS',
        '% AVERAGE VIEW',
    ]
]

In [119]:
def audience_sample(country):
    if country == 'US':
        return 'USA'
    elif country == 'IN':
        return 'INDIA'
    elif country == 'CI':
        return 'CHINA'
    else:
        return 'OTHERS'
    
filtered_agg_sub['COUNTRY'] = filtered_agg_sub['COUNTRY CODE'].apply(audience_sample)

In [120]:
filtered_agg_sub['COUNTRY'] = filtered_agg_sub['COUNTRY CODE'].apply(audience_sample)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_agg_sub['COUNTRY'] = filtered_agg_sub['COUNTRY CODE'].apply(audience_sample)


In [121]:
filtered_agg_sub.head(1)

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED,COUNTRY
27543,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,MD,False,10,0,0,0,0,0,0.368369,202.2347,0,OTHERS


In [192]:
for i in [df_vid, df_agg, df_agg_sub, df_com]:
    print(i.columns.tolist())
    print('-'* 10)

['DATE', 'VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO LENGTH', 'THUMBNAIL LINK', 'VIEWS', 'VIDEO LIKES ADDED', 'VIDEO DISLIKES ADDED', 'VIDEO LIKES REMOVED', 'USER SUBSCRIPTIONS ADDED', 'USER SUBSCRIPTIONS REMOVED', 'AVERAGE VIEW PERCENTAGE', 'AVERAGE WATCH TIME', 'USER COMMENTS ADDED']
----------
['VIDEO', 'VIDEO TITLE', 'VIDEO PUBLISH TIME', 'COMMENTS ADDED', 'SHARES', 'DISLIKES', 'LIKES', 'SUBSCRIBERS LOST', 'SUBSCRIBERS GAINED', 'RPM (USD)', 'CPM (USD)', 'AVERAGE PERCENTAGE VIEWED (%)', 'AVERAGE VIEW DURATION', 'VIEWS', 'WATCH TIME (HOURS)', 'SUBSCRIBERS', 'YOUR ESTIMATED REVENUE (USD)', 'IMPRESSIONS', 'IMPRESSIONS CLICK-THROUGH RATE (%)', 'AVERAGE VIEW SECONDS', 'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO', 'VIEW TO SUBSCRIBER LOST RATIO']
----------
['VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO LENGTH', 'THUMBNAIL LINK', 'COUNTRY CODE', 'IS SUBSCRIBED', 'VIEWS', 'VIDEO LIKES ADDED', 'VIDEO DISLIKES ADDED', 'VIDEO LIKES REMOVED', 'USER SUBSCRIPTIONS ADDED', 'USER SUBSCRIPTIONS REMOV

In [173]:
filtered_agg_sub.head()

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED,COUNTRY
27543,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,MD,False,10,0,0,0,0,0,0.368369,202.2347,0,OTHERS
27544,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,BD,False,494,10,0,0,0,0,0.310578,170.507051,0,OTHERS
27545,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,BA,False,25,1,0,0,0,0,0.288771,158.53508,0,OTHERS
27546,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,ZM,False,21,0,0,0,0,0,0.504413,276.922524,0,OTHERS
27547,Is Data Science Dying?,2qVWurPFwfc,549,https://i.ytimg.com/vi/2qVWurPFwfc/hqdefault.jpg,FJ,True,1,1,0,0,1,0,0.99745,547.6,0,OTHERS


In [175]:
filtered_agg_sub['EXTERNAL VIDEO ID']

array(['2qVWurPFwfc'], dtype=object)

In [177]:
df_diff_time['VIDEO'] == filtered_agg_sub['EXTERNAL VIDEO ID'].unique()

ValueError: ('Lengths must match to compare', (10095,), (1,))

In [182]:
if '2qVWurPFwfc' in df_diff_time['VIDEO']:
    print('yes')
else:
    print('no')

no


In [183]:
df_diff_time.head(1)

Unnamed: 0,COMMENTS,COMMENT_ID,REPLY_COUNT,LIKE_COUNT,DATE,VIDEO,USER_ID,VIDEO PUBLISH TIME,SHARES
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,2021-12-27,767
