In [224]:
# import library
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import streamlit as st

In [225]:
# define functions

# load data
df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

# dropping row Total to avoid issue with calculations
df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:,:]

df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

df_com = pd.read_csv('resource/All_Comments_Final.csv')

In [226]:
# clean our data

In [227]:
# converting all columns to uppercase
for df in [df_agg,df_vid,df_agg_sub, df_com]:
    df.columns = df.columns.str.upper()


In [228]:
# checking
df_com.columns

Index(['COMMENTS', 'COMMENT_ID', 'REPLY_COUNT', 'LIKE_COUNT', 'DATE', 'VIDID',
       'USER_ID'],
      dtype='object')

__Cleaning df_agg__ dataset

In [229]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 1 to 223
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   VIDEO                                 223 non-null    object 
 1   VIDEO TITLE                           223 non-null    object 
 2   VIDEO PUB­LISH TIME                   223 non-null    object 
 3   COM­MENTS AD­DED                      223 non-null    int64  
 4   SHARES                                223 non-null    int64  
 5   DIS­LIKES                             223 non-null    int64  
 6   LIKES                                 223 non-null    int64  
 7   SUB­SCRIBERS LOST                     223 non-null    int64  
 8   SUB­SCRIBERS GAINED                   223 non-null    int64  
 9   RPM (USD)                             223 non-null    float64
 10  CPM (USD)                             221 non-null    float64
 11  AV­ER­AGE PER­CENT­

In [230]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUB­LISH TIME,COM­MENTS AD­DED,SHARES,DIS­LIKES,LIKES,SUB­SCRIBERS LOST,SUB­SCRIBERS GAINED,RPM (USD),CPM (USD),AV­ER­AGE PER­CENT­AGE VIEWED (%),AV­ER­AGE VIEW DUR­A­TION,VIEWS,WATCH TIME (HOURS),SUB­SCRIBERS,YOUR ES­TIM­ATED REV­EN­UE (USD),IM­PRES­SIONS,IM­PRES­SIONS CLICK-THROUGH RATE (%)
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,"May 8, 2020",907,9583,942,46903,451,46904,6.353,12.835,36.65,0:03:09,1253559,65850.7042,46453,7959.533,26498799,3.14


In [231]:
# checking columns
df_agg.columns.tolist() # notice \xad in our columns

['VIDEO',
 'VIDEO TITLE',
 'VIDEO PUB\xadLISH TIME',
 'COM\xadMENTS AD\xadDED',
 'SHARES',
 'DIS\xadLIKES',
 'LIKES',
 'SUB\xadSCRIBERS LOST',
 'SUB\xadSCRIBERS GAINED',
 'RPM (USD)',
 'CPM (USD)',
 'AV\xadER\xadAGE PER\xadCENT\xadAGE VIEWED (%)',
 'AV\xadER\xadAGE VIEW DUR\xadA\xadTION',
 'VIEWS',
 'WATCH TIME (HOURS)',
 'SUB\xadSCRIBERS',
 'YOUR ES\xadTIM\xadATED REV\xadEN\xadUE (USD)',
 'IM\xadPRES\xadSIONS',
 'IM\xadPRES\xadSIONS CLICK-THROUGH RATE (%)']

In [232]:
# remove \xad
df_agg.columns = df_agg.columns.str.replace('\xad','')

In [233]:
# check 
df_agg.columns.tolist()

['VIDEO',
 'VIDEO TITLE',
 'VIDEO PUBLISH TIME',
 'COMMENTS ADDED',
 'SHARES',
 'DISLIKES',
 'LIKES',
 'SUBSCRIBERS LOST',
 'SUBSCRIBERS GAINED',
 'RPM (USD)',
 'CPM (USD)',
 'AVERAGE PERCENTAGE VIEWED (%)',
 'AVERAGE VIEW DURATION',
 'VIEWS',
 'WATCH TIME (HOURS)',
 'SUBSCRIBERS',
 'YOUR ESTIMATED REVENUE (USD)',
 'IMPRESSIONS',
 'IMPRESSIONS CLICK-THROUGH RATE (%)']

In [234]:
# convert date to datetype

df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

In [235]:
df_agg['VIDEO PUBLISH TIME'].dtype #datetime dtype

dtype('<M8[ns]')

In [236]:
# AVERAGE VIEW DURATION is an object meaning it contains numbers and string
df_agg['AVERAGE VIEW DURATION'][9] # a string

'0:04:56'

In [237]:
df_agg['AVERAGE VIEW DURATION'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: datetime.strptime(x, '%H:%M:%S'))

In [238]:
# check
df_agg['AVERAGE VIEW DURATION'][9]

Timestamp('1900-01-01 00:04:56')

In [239]:
# create new column for df_agg['AVERAGE VIEW SECONDS'
df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: x.second + x.minute * 60 + x.hour * 60 * 60)

In [240]:
# check
df_agg['AVERAGE VIEW SECONDS'][9]

296

In [241]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),CPM (USD),AVERAGE PERCENTAGE VIEWED (%),AVERAGE VIEW DURATION,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,2020-05-08,907,9583,942,46903,451,46904,6.353,12.835,36.65,1900-01-01 00:03:09,1253559,65850.7042,46453,7959.533,26498799,3.14,189


In [242]:
df_agg.info() # our data type is corrected

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 1 to 223
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   VIDEO                               223 non-null    object        
 1   VIDEO TITLE                         223 non-null    object        
 2   VIDEO PUBLISH TIME                  223 non-null    datetime64[ns]
 3   COMMENTS ADDED                      223 non-null    int64         
 4   SHARES                              223 non-null    int64         
 5   DISLIKES                            223 non-null    int64         
 6   LIKES                               223 non-null    int64         
 7   SUBSCRIBERS LOST                    223 non-null    int64         
 8   SUBSCRIBERS GAINED                  223 non-null    int64         
 9   RPM (USD)                           223 non-null    float64       
 10  CPM (USD)                 

In [243]:
# engagement ration, every engagement a view could do divided by the number of viewrs
df_agg['ENGAGEMENT RATIO'] = (df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']) / df_agg['VIEWS'] 

# ratio of views to subscribers gained
df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED'] # how views does it take to gain a subscriber

# ratio of views to subscribers lost, 
df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST'] # how views does it take to lose a subscriber

In [244]:
df_agg.head(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
1,4OZip0cgOho,How I Would Learn Data Science (If I Had to St...,2020-05-08,907,9583,942,46903,451,46904,6.353,...,1253559,65850.7042,46453,7959.533,26498799,3.14,189,0.046536,26.726057,2779.509978


In [274]:
# sort data by 'VIDEO PUBLISH TIME'
df_agg.sort_values(by = 'VIDEO PUBLISH TIME', ascending=True, inplace=True)

In [275]:
df_agg.head























































































































































































































































































































































































(1)

Unnamed: 0,VIDEO,VIDEO TITLE,VIDEO PUBLISH TIME,COMMENTS ADDED,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,RPM (USD),...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
223,5p73cIRYCZg,ProjectDemoCSC478_UFCFightData,2017-06-06,0,2,0,1,0,0,0.05,...,60,1.0684,0,0.003,365,11.51,64,0.05,inf,inf


In [245]:
# checking for missing value

df_agg.isna().sum()

VIDEO                                 0
VIDEO TITLE                           0
VIDEO PUBLISH TIME                    0
COMMENTS ADDED                        0
SHARES                                0
DISLIKES                              0
LIKES                                 0
SUBSCRIBERS LOST                      0
SUBSCRIBERS GAINED                    0
RPM (USD)                             0
CPM (USD)                             2
AVERAGE PERCENTAGE VIEWED (%)         0
AVERAGE VIEW DURATION                 0
VIEWS                                 0
WATCH TIME (HOURS)                    0
SUBSCRIBERS                           0
YOUR ESTIMATED REVENUE (USD)          0
IMPRESSIONS                           0
IMPRESSIONS CLICK-THROUGH RATE (%)    0
AVERAGE VIEW SECONDS                  0
ENGAGEMENT RATIO                      0
VIEW TO SUBSCRIBER RATIO              0
VIEW TO SUBSCRIBER LOST RATIO         0
dtype: int64

__cleaning df_vid__

In [247]:
df_vid.shape

(111857, 14)

In [248]:
df_vid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111857 entries, 0 to 111856
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   DATE                        111857 non-null  object 
 1   VIDEO TITLE                 111857 non-null  object 
 2   EXTERNAL VIDEO ID           111857 non-null  object 
 3   VIDEO LENGTH                111857 non-null  int64  
 4   THUMBNAIL LINK              111857 non-null  object 
 5   VIEWS                       111857 non-null  int64  
 6   VIDEO LIKES ADDED           111857 non-null  int64  
 7   VIDEO DISLIKES ADDED        111857 non-null  int64  
 8   VIDEO LIKES REMOVED         111857 non-null  int64  
 9   USER SUBSCRIPTIONS ADDED    111857 non-null  int64  
 10  USER SUBSCRIPTIONS REMOVED  111857 non-null  int64  
 11  AVERAGE VIEW PERCENTAGE     110510 non-null  float64
 12  AVERAGE WATCH TIME          110510 non-null  float64
 13  USER COMMENTS 

In [249]:
df_vid.columns.tolist()

['DATE',
 'VIDEO TITLE',
 'EXTERNAL VIDEO ID',
 'VIDEO LENGTH',
 'THUMBNAIL LINK',
 'VIEWS',
 'VIDEO LIKES ADDED',
 'VIDEO DISLIKES ADDED',
 'VIDEO LIKES REMOVED',
 'USER SUBSCRIPTIONS ADDED',
 'USER SUBSCRIPTIONS REMOVED',
 'AVERAGE VIEW PERCENTAGE',
 'AVERAGE WATCH TIME',
 'USER COMMENTS ADDED']

In [250]:
df_vid.isna().sum()

DATE                             0
VIDEO TITLE                      0
EXTERNAL VIDEO ID                0
VIDEO LENGTH                     0
THUMBNAIL LINK                   0
VIEWS                            0
VIDEO LIKES ADDED                0
VIDEO DISLIKES ADDED             0
VIDEO LIKES REMOVED              0
USER SUBSCRIPTIONS ADDED         0
USER SUBSCRIPTIONS REMOVED       0
AVERAGE VIEW PERCENTAGE       1347
AVERAGE WATCH TIME            1347
USER COMMENTS ADDED              0
dtype: int64

In [251]:
df_vid.head(1)

Unnamed: 0,DATE,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
0,19 Jan 2022,Kaggle Project From Scratch - Part 2 (Explorat...,KQ80oD_boBM,2191,https://i.ytimg.com/vi/KQ80oD_boBM/hqdefault.jpg,13,0,0,0,0,0,0.069055,151.300154,0


In [252]:
df_vid[df_vid['AVERAGE WATCH TIME'].isna()].head(5) # Nan values row

Unnamed: 0,DATE,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
215,18 Jan 2022,Reviewing Your Data Science Projects - Episode...,jam0xApfC-U,804,https://i.ytimg.com/vi/jam0xApfC-U/hqdefault.jpg,0,0,0,0,0,0,,,0
253,18 Jan 2022,Can You Learn Data Science Without a Computer?,Fs_LG5Y8GIU,199,https://i.ytimg.com/vi/Fs_LG5Y8GIU/hqdefault.jpg,0,0,0,0,0,0,,,0
259,18 Jan 2022,His Startup Will Land You a Data Science Job (...,7VcdvSyoxnc,3706,https://i.ytimg.com/vi/7VcdvSyoxnc/hqdefault.jpg,0,0,0,0,0,0,,,0
293,18 Jan 2022,Is it Important to Share Your Data Science Wor...,ELFGsNqZrlM,1181,https://i.ytimg.com/vi/ELFGsNqZrlM/hqdefault.jpg,0,0,0,0,0,0,,,0
296,18 Jan 2022,Building a Deep Learning BEAST (NVIDIA TITAN R...,_rbrdNTpZGo,578,https://i.ytimg.com/vi/_rbrdNTpZGo/hqdefault.jpg,0,0,0,0,0,0,,,0


In [253]:
df_vid[df_vid['AVERAGE WATCH TIME'].isna()].tail(5) # Nan values row

Unnamed: 0,DATE,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
111416,12 Nov 2018,ProjectDemoCSC478_UFCFightData,5p73cIRYCZg,729,https://i.ytimg.com/vi/5p73cIRYCZg/hqdefault.jpg,0,0,0,0,0,0,,,0
111440,29 Oct 2018,Predicting Season Long NBA Wins Using Multiple...,Y_SMU701qlA,539,https://i.ytimg.com/vi/Y_SMU701qlA/hqdefault.jpg,0,1,0,0,0,0,,,0
111488,22 Sept 2018,Predicting Season Long NBA Wins Using Multiple...,Y_SMU701qlA,539,https://i.ytimg.com/vi/Y_SMU701qlA/hqdefault.jpg,0,0,0,0,0,0,,,0
111493,20 Sept 2018,Predicting Season Long NBA Wins Using Multiple...,Y_SMU701qlA,539,https://i.ytimg.com/vi/Y_SMU701qlA/hqdefault.jpg,0,0,0,0,0,0,,,0
111650,19 May 2018,ProjectDemoCSC478_UFCFightData,5p73cIRYCZg,729,https://i.ytimg.com/vi/5p73cIRYCZg/hqdefault.jpg,0,0,0,0,0,0,,,0


In [254]:
for col in df_vid.columns[5:]:
    print('-'*20)
    print(col)
    print(df_vid[col].unique())
    
    
# this shows that we cant just drop these rows since they contain unique data

--------------------
VIEWS
[ 13   2  10 ... 811 389 595]
--------------------
VIDEO LIKES ADDED
[   0    2    1    4    3   26   16    8    9    6    5   14   80   19
   12   11   95   36    7   21   10   22  199   17   78  107   25   13
   24  103   53   20   30   41   15  113   18   63   94  104  146  131
   23  346   33  110  101   76   32  143   38   28  175   27  218   70
  191  362  293  194  299   40  356  404   43  657   34   37   45   31
   35   88   29   59   89  124   44  321   60   81  111   99  258  176
   48   52  172  123   51  137  134  208  340  140   54   42   47   58
   55   69   64   49   73   67   57   96   82   62   50   39  106   46
   56   61   83  169  118   85  114   65  108  284  135  139   68  109
  231  235  158  155  165   71   74  136  343  322   90  294   72   75
  198  138  170  167  491  115  116  129  261   77   79  210   87  102
  142  161  200  467   86   93   98   84  207   66  151  205  195  122
  127  132  372  149  227  360  119   91  144  325  

In [255]:
df_vid.duplicated().sum() # no duplicates

0

In [256]:
# engineer data
## what metrics wil be relevant
## difference from baseline
## percent change

# build dashboard
## local picture
## individual video

# improvement