In [224]:
# import library
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import streamlit as st

In [225]:
# Read the four raw CSV exports from the local `resource/` folder.

# dated performance rows
df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

# per-video aggregates; the first data row is skipped by position because it is
# the "Total" row, which would otherwise interfere with per-video calculations
df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:]

# aggregates split by country and subscriber status
df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

# individual comments
df_com = pd.read_csv('resource/All_Comments_Final.csv')

In [226]:
# clean our data

In [227]:
# Upper-case the header of every frame so later column lookups do not depend
# on the capitalisation used in each CSV file.
for df in (df_agg, df_vid, df_agg_sub, df_com):
    df.columns = df.columns.str.upper()


In [228]:
# checking
df_com.columns

Index(['COMMENTS', 'COMMENT_ID', 'REPLY_COUNT', 'LIKE_COUNT', 'DATE', 'VIDID',
       'USER_ID'],
      dtype='object')

In [229]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 1 to 223
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   VIDEO                                 223 non-null    object 
 1   VIDEO TITLE                           223 non-null    object 
 2   VIDEO PUB­LISH TIME                   223 non-null    object 
 3   COM­MENTS AD­DED                      223 non-null    int64  
 4   SHARES                                223 non-null    int64  
 5   DIS­LIKES                             223 non-null    int64  
 6   LIKES                                 223 non-null    int64  
 7   SUB­SCRIBERS LOST                     223 non-null    int64  
 8   SUB­SCRIBERS GAINED                   223 non-null    int64  
 9   RPM (USD)                             223 non-null    float64
 10  CPM (USD)                             221 non-null    float64
 11  AV­ER­AGE PER­CENT­

In [230]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUB­LISH TIME,COM­MENTS AD­DED,SHARES,DIS­LIKES,LIKES,SUB­SCRIBERS LOST,SUB­SCRIBERS GAINED,RPM (USD),CPM (USD),AV­ER­AGE PER­CENT­AGE VIEWED (%),AV­ER­AGE VIEW DUR­A­TION,VIEWS,WATCH TIME (HOURS),SUB­SCRIBERS,YOUR ES­TIM­ATED REV­EN­UE (USD),IM­PRES­SIONS,IM­PRES­SIONS CLICK-THROUGH RATE (%)
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,"May 8, 2020",907,9583,942,46903,451,46904,6.353,12.835,36.65,0:03:09,1253559,65850.7042,46453,7959.533,26498799,3.14


In [231]:
# checking columns
df_agg.columns.tolist() # notice \xad in our columns

['VIDEO',
 'VIDEO TITLE',
 'VIDEO PUB\xadLISH TIME',
 'COM\xadMENTS AD\xadDED',
 'SHARES',
 'DIS\xadLIKES',
 'LIKES',
 'SUB\xadSCRIBERS LOST',
 'SUB\xadSCRIBERS GAINED',
 'RPM (USD)',
 'CPM (USD)',
 'AV\xadER\xadAGE PER\xadCENT\xadAGE VIEWED (%)',
 'AV\xadER\xadAGE VIEW DUR\xadA\xadTION',
 'VIEWS',
 'WATCH TIME (HOURS)',
 'SUB\xadSCRIBERS',
 'YOUR ES\xadTIM\xadATED REV\xadEN\xadUE (USD)',
 'IM\xadPRES\xadSIONS',
 'IM\xadPRES\xadSIONS CLICK-THROUGH RATE (%)']

In [232]:
# Strip the soft-hyphen character (U+00AD, displayed as \xad) from the headers.
# regex=False states explicitly that this is a literal, not a pattern, replacement.
df_agg.columns = df_agg.columns.str.replace('\xad', '', regex=False)

In [233]:
# check 
df_agg.columns.tolist()

['VIDEO',
 'VIDEO TITLE',
 'VIDEO PUBLISH TIME',
 'COMMENTS ADDED',
 'SHARES',
 'DISLIKES',
 'LIKES',
 'SUBSCRIBERS LOST',
 'SUBSCRIBERS GAINED',
 'RPM (USD)',
 'CPM (USD)',
 'AVERAGE PERCENTAGE VIEWED (%)',
 'AVERAGE VIEW DURATION',
 'VIEWS',
 'WATCH TIME (HOURS)',
 'SUBSCRIBERS',
 'YOUR ESTIMATED REVENUE (USD)',
 'IMPRESSIONS',
 'IMPRESSIONS CLICK-THROUGH RATE (%)']

In [234]:
# Convert the publish date strings (e.g. "May 8, 2020") to a datetime dtype.
# format='mixed' tells pandas to infer the format of each value individually.

df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

In [235]:
df_agg['VIDEO PUBLISH TIME'].dtype #datetime dtype

dtype('<M8[ns]')

In [236]:
# AVERAGE VIEW DURATION has dtype object because it holds 'H:MM:SS' text, not numbers;
# look at the value stored under index label 9
df_agg.loc[9, 'AVERAGE VIEW DURATION']  # a string

'0:04:56'

In [237]:
# Parse the 'H:MM:SS' strings with one vectorised call instead of a row-wise strptime.
# As with strptime, the missing date part defaults to 1900-01-01.
# Unlike the apply/strptime version, this cell is safe to re-run: values that are
# already datetimes pass through unchanged instead of raising a TypeError.
df_agg['AVERAGE VIEW DURATION'] = pd.to_datetime(df_agg['AVERAGE VIEW DURATION'], format='%H:%M:%S')

In [238]:
# check: the value under index label 9 should now be a Timestamp
df_agg.loc[9, 'AVERAGE VIEW DURATION']

Timestamp('1900-01-01 00:04:56')

In [239]:
# Create AVERAGE VIEW SECONDS: the parsed duration expressed as a number of seconds.
# The vectorised .dt accessors replace the per-row lambda; the final cast keeps the
# int64 dtype that the apply-based version produced.
df_agg['AVERAGE VIEW SECONDS'] = (
    df_agg['AVERAGE VIEW DURATION'].dt.hour * 60 * 60
    + df_agg['AVERAGE VIEW DURATION'].dt.minute * 60
    + df_agg['AVERAGE VIEW DURATION'].dt.second
).astype('int64')

In [240]:
# check: derived seconds for index label 9
df_agg.loc[9, 'AVERAGE VIEW SECONDS']

296

In [241]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),CPM (USD),AVERAGE PERCENTAGE VIEWED (%),AVERAGE VIEW DURATION,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,2020-05-08,907,9583,942,46903,451,46904,6.353,12.835,36.65,1900-01-01 00:03:09,1253559,65850.7042,46453,7959.533,26498799,3.14,189


In [242]:
df_agg.info() # our data type is corrected

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 1 to 223
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   VIDEO                               223 non-null    object        
 1   VIDEO TITLE                         223 non-null    object        
 2   VIDEO PUBLISH TIME                  223 non-null    datetime64[ns]
 3   COMMENTS ADDED                      223 non-null    int64         
 4   SHARES                              223 non-null    int64         
 5   DISLIKES                            223 non-null    int64         
 6   LIKES                               223 non-null    int64         
 7   SUBSCRIBERS LOST                    223 non-null    int64         
 8   SUBSCRIBERS GAINED                  223 non-null    int64         
 9   RPM (USD)                           223 non-null    float64       
 10  CPM (USD)                 

In [243]:
# A zero denominator would make the division return inf, which then distorts any
# mean/median/plot built on these columns. Zeros are therefore replaced by NaN
# before dividing, so the affected ratios come out as NaN instead.

# engagement ratio: every interaction a viewer can make (share, like, dislike, comment)
# divided by the number of views
df_agg['ENGAGEMENT RATIO'] = (
    df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']
) / df_agg['VIEWS'].replace(0, np.nan)

# ratio of views to subscribers gained: how many views it takes to gain a subscriber
df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED'].replace(0, np.nan)

# ratio of views to subscribers lost: how many views it takes to lose a subscriber
df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST'].replace(0, np.nan)

In [244]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,2020-05-08,907,9583,942,46903,451,46904,6.353,...,1253559,65850.7042,46453,7959.533,26498799,3.14,189,0.046536,26.726057,2779.509978


In [278]:
# Order the videos from newest to oldest publish date.
# The sorted frame is rebound to the name rather than sorted with inplace=True,
# which is the form pandas recommends and keeps the statement chainable.
df_agg = df_agg.sort_values(by='VIDEO PUBLISH TIME', ascending=False)

In [279]:
df_agg.head(2)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4.055,...,4383,192.5779,4,16.549,65130,2.95,158,0.080995,243.5,313.071429
187,2RWwN5ZT4tA,Should @Luke Barousse Take This Data Analyst ...,2022-01-14,12,2,3,78,1,1,1.882,...,2401,25.9375,0,1.72,25094,2.64,38,0.039567,2401.0,2401.0


In [280]:
df_vid.head(1)

Unnamed: 0,DATE,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
0,19 Jan 2022,Kaggle Project From Scratch - Part 2 (Explorat...,KQ80oD_boBM,2191,https://i.ytimg.com/vi/KQ80oD_boBM/hqdefault.jpg,13,0,0,0,0,0,0.069055,151.300154,0


In [288]:
# Convert the DATE column from strings (e.g. "19 Jan 2022") to datetime values.
# format='mixed' tells pandas to infer the format of each value individually.
df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')

In [290]:
df_agg_sub.head(1)

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
0,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,59,https://i.ytimg.com/vi/OtqQYqRNDGI/hqdefault.jpg,HK,True,23,1,0,0,2,0,0.67187,39.640348,0


In [291]:
df_com.head(1)

Unnamed: 0,COMMENTS,COMMENT_ID,REPLY_COUNT,LIKE_COUNT,DATE,VIDID,USER_ID
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22T08:13:29Z,xpIFS6jZbe8,user_981


In [292]:
df_com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   COMMENTS     10239 non-null  object
 1   COMMENT_ID   10240 non-null  object
 2   REPLY_COUNT  10240 non-null  int64 
 3   LIKE_COUNT   10240 non-null  int64 
 4   DATE         10240 non-null  object
 5   VIDID        10240 non-null  object
 6   USER_ID      10240 non-null  object
dtypes: int64(2), object(5)
memory usage: 560.1+ KB


In [296]:
# Parse the comment timestamps: strings such as '2022-01-22T08:13:29Z' become
# timezone-aware (UTC) datetime values.
df_com['DATE'] = pd.to_datetime(df_com['DATE'])

In [300]:
df_com.head(1)

Unnamed: 0,COMMENTS,COMMENT_ID,REPLY_COUNT,LIKE_COUNT,DATE,VIDID,USER_ID
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981


In [299]:
df_com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   COMMENTS     10239 non-null  object             
 1   COMMENT_ID   10240 non-null  object             
 2   REPLY_COUNT  10240 non-null  int64              
 3   LIKE_COUNT   10240 non-null  int64              
 4   DATE         10240 non-null  datetime64[ns, UTC]
 5   VIDID        10240 non-null  object             
 6   USER_ID      10240 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(2), object(4)
memory usage: 560.1+ KB


In [256]:
# engineer data
## what metrics will be relevant
## difference from baseline
## percent change

# build dashboard
## local picture
## individual video

# improvement

In [304]:
def _ratio_or_nan(numerator, denominator):
    """Return numerator / denominator element-wise, with NaN where denominator is 0."""
    # Masking the zeros first avoids the inf that pandas returns for x / 0.
    return numerator / denominator.where(denominator != 0)


def load_data(data_dir='resource'):
    """Read the four CSV exports and return them cleaned and typed.

    Parameters
    ----------
    data_dir : str or path-like, default 'resource'
        Folder that contains the four CSV files. The default is the location
        that used to be hard-coded, so ``load_data()`` reads the same files
        as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_agg, df_agg_sub, df_com, df_vid)``, in that order. Every frame
        has upper-cased column names. In addition:

        * ``df_agg`` has its first row (the "Total" row) removed, soft hyphens
          stripped from the column names, 'VIDEO PUBLISH TIME' and
          'AVERAGE VIEW DURATION' parsed to datetimes, the derived columns
          'AVERAGE VIEW SECONDS', 'ENGAGEMENT RATIO',
          'VIEW TO SUBSCRIBER RATIO' and 'VIEW TO SUBSCRIBER LOST RATIO'
          added, and is sorted by publish time, newest first.
        * ``df_vid`` and ``df_com`` have their 'DATE' column parsed to datetimes.

    Notes
    -----
    A ratio whose denominator is 0 is NaN rather than inf, so that it does not
    distort aggregates computed from these columns.
    """
    # load data
    df_vid = pd.read_csv(f'{data_dir}/Video_Performance_Over_Time.csv')

    # dropping row Total (the first row) to avoid issues with calculations
    df_agg = pd.read_csv(f'{data_dir}/Aggregated_Metrics_By_Video.csv').iloc[1:, :]

    df_agg_sub = pd.read_csv(f'{data_dir}/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

    df_com = pd.read_csv(f'{data_dir}/All_Comments_Final.csv')

    # convert all column names to upper case
    for df in (df_agg, df_vid, df_agg_sub, df_com):
        df.columns = df.columns.str.upper()

    # remove the soft-hyphen character (\xad) from the aggregate headers
    df_agg.columns = df_agg.columns.str.replace('\xad', '', regex=False)

    # convert the publish date to a datetime dtype
    df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

    # parse 'H:MM:SS' durations in one vectorised call; the date part defaults to 1900-01-01
    df_agg['AVERAGE VIEW DURATION'] = pd.to_datetime(df_agg['AVERAGE VIEW DURATION'], format='%H:%M:%S')

    # duration expressed in seconds; the cast keeps a plain int64 column
    duration = df_agg['AVERAGE VIEW DURATION'].dt
    df_agg['AVERAGE VIEW SECONDS'] = (
        duration.hour * 60 * 60 + duration.minute * 60 + duration.second
    ).astype('int64')

    # engagement ratio: every interaction a viewer can make divided by the number of views
    interactions = df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']
    df_agg['ENGAGEMENT RATIO'] = _ratio_or_nan(interactions, df_agg['VIEWS'])

    # how many views it takes to gain a subscriber
    df_agg['VIEW TO SUBSCRIBER RATIO'] = _ratio_or_nan(df_agg['VIEWS'], df_agg['SUBSCRIBERS GAINED'])

    # how many views it takes to lose a subscriber
    df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = _ratio_or_nan(df_agg['VIEWS'], df_agg['SUBSCRIBERS LOST'])

    # newest videos first
    df_agg = df_agg.sort_values(by='VIDEO PUBLISH TIME', ascending=False)

    # convert the DATE columns to datetime
    df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')
    df_com['DATE'] = pd.to_datetime(df_com['DATE'])

    return df_agg, df_agg_sub, df_com, df_vid

load_data()

(           VIDEO                                        VIDEO TITLE  \
 111  0jTtHYie3CU  Should You Be Excited About Web 3? (As a Data ...   
 187  2RWwN5ZT4tA  Should  @Luke Barousse Take This Data Analyst ...   
 64   rEWPqw6rMGI         The Only Data Science Explanation You Need   
 59   o-wsyxWbPOw  We Need to Talk About The LinkedIn Machine Lea...   
 32   xpIFS6jZbe8  How I Would Learn Data Science in 2022 (If I H...   
 ..           ...                                                ...   
 75   RRSRKf9eQxc          Should You Get A Masters in Data Science?   
 190  IFceyuL6GZY  How I Became A Data Scientist From a Business ...   
 204  Y_SMU701qlA  Predicting Season Long NBA Wins Using Multiple...   
 138  qfRhKHV8-t4  Predicting Crypto-Currency Price Using RNN lST...   
 223  5p73cIRYCZg                     ProjectDemoCSC478_UFCFightData   
 
     VIDEO PUBLISH TIME  COMMENTS ADDED  SHARES  DISLIKES  LIKES  \
 111         2022-01-17              37      43         8    267  