In [15]:
# import library
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import streamlit as st
from pandas.tseries.offsets import DateOffset

In [20]:
def load_data():

    # load data
    df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

    # dropping row Total to avoid issue with calculations
    df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:,:]

    df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

    df_com = pd.read_csv('resource/All_Comments_Final.csv')

    # converting all columns to uppercase
    for df in [df_agg,df_vid,df_agg_sub, df_com]:
        df.columns = df.columns.str.upper()

    # remove \xad
    df_agg.columns = df_agg.columns.str.replace('\xad','')

    # drop Nan value
    df_agg.dropna(inplace=True)

    # convert date to datetype

    df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

    # rename column

    df_agg.rename(columns={'VIDEO PUBLISH TIME': 'PUBLISH DATE', 'COMMENTS ADDED' : 'COMMENTS'}, inplace=True)
        
    df_agg['AVERAGE VIEW DURATION'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: datetime.strptime(x, '%H:%M:%S'))

    # create new column for df_agg['AVERAGE VIEW SECONDS'
    df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: x.second + x.minute * 60 + x.hour * 60 * 60)

    # engagement ration, every engagement a view could do divided by the number of viewrs
    df_agg['ENGAGEMENT RATIO'] = (df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS']) / df_agg['VIEWS'] 

    # ratio of views to subscribers gained
    df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED'] # how views does it take to gain a subscriber

    # ratio of views to subscribers lost, 
    df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST'] # how views does it take to lose a subscriber

    # sort data by 'VIDEO PUBLISH TIME'
    df_agg.sort_values(by = 'PUBLISH DATE', ascending=False, inplace=True)

    # CONVERTING DATE to datetime
    df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')

    df_com['DATE'] = pd.to_datetime(df_com['DATE'])
    # renaming vidId
    df_com.rename(columns={'VIDID':'VIDEO'}, inplace=True)
    # removing '_'
    df_com.columns = df_com.columns.str.replace('_', ' ')

    # create dataframe
    return df_vid, df_agg, df_agg_sub, df_com

df_vid, df_agg, df_agg_sub, df_com = load_data()

In [22]:
for i in [df_vid, df_agg, df_agg_sub, df_com]:
    print(i.columns.tolist())
    print('*'*40)

['DATE', 'VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO LENGTH', 'THUMBNAIL LINK', 'VIEWS', 'VIDEO LIKES ADDED', 'VIDEO DISLIKES ADDED', 'VIDEO LIKES REMOVED', 'USER SUBSCRIPTIONS ADDED', 'USER SUBSCRIPTIONS REMOVED', 'AVERAGE VIEW PERCENTAGE', 'AVERAGE WATCH TIME', 'USER COMMENTS ADDED']
****************************************
['VIDEO', 'VIDEO TITLE', 'PUBLISH DATE', 'COMMENTS', 'SHARES', 'DISLIKES', 'LIKES', 'SUBSCRIBERS LOST', 'SUBSCRIBERS GAINED', 'RPM (USD)', 'CPM (USD)', 'AVERAGE PERCENTAGE VIEWED (%)', 'AVERAGE VIEW DURATION', 'VIEWS', 'WATCH TIME (HOURS)', 'SUBSCRIBERS', 'YOUR ESTIMATED REVENUE (USD)', 'IMPRESSIONS', 'IMPRESSIONS CLICK-THROUGH RATE (%)', 'AVERAGE VIEW SECONDS', 'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO', 'VIEW TO SUBSCRIBER LOST RATIO']
****************************************
['VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO LENGTH', 'THUMBNAIL LINK', 'COUNTRY CODE', 'IS SUBSCRIBED', 'VIEWS', 'VIDEO LIKES ADDED', 'VIDEO DISLIKES ADDED', 'VIDEO LIKES REMOVED', 'USER

In [26]:
df_merge1 = pd.merge(left=df_agg_sub.loc[:,['VIDEO TITLE', 'EXTERNAL VIDEO ID']], right=df_agg, how='inner')
df_merge1.head(1)

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO,PUBLISH DATE,COMMENTS,SHARES,DISLIKES,LIKES,SUBSCRIBERS LOST,SUBSCRIBERS GAINED,...,VIEWS,WATCH TIME (HOURS),SUBSCRIBERS,YOUR ESTIMATED REVENUE (USD),IMPRESSIONS,IMPRESSIONS CLICK-THROUGH RATE (%),AVERAGE VIEW SECONDS,ENGAGEMENT RATIO,VIEW TO SUBSCRIBER RATIO,VIEW TO SUBSCRIBER LOST RATIO
0,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,OtqQYqRNDGI,2021-01-29,56,25,26,380,18,53,...,8009,79.7725,35,10.292,178890,1.64,35,0.060807,151.113208,444.944444


In [29]:
df_merge1.columns

Index(['VIDEO TITLE', 'EXTERNAL VIDEO ID', 'VIDEO', 'PUBLISH DATE', 'COMMENTS',
       'SHARES', 'DISLIKES', 'LIKES', 'SUBSCRIBERS LOST', 'SUBSCRIBERS GAINED',
       'RPM (USD)', 'CPM (USD)', 'AVERAGE PERCENTAGE VIEWED (%)',
       'AVERAGE VIEW DURATION', 'VIEWS', 'WATCH TIME (HOURS)', 'SUBSCRIBERS',
       'YOUR ESTIMATED REVENUE (USD)', 'IMPRESSIONS',
       'IMPRESSIONS CLICK-THROUGH RATE (%)', 'AVERAGE VIEW SECONDS',
       'ENGAGEMENT RATIO', 'VIEW TO SUBSCRIBER RATIO',
       'VIEW TO SUBSCRIBER LOST RATIO'],
      dtype='object')

In [31]:
df_merge2 = pd.merge(left=df_com, right=df_merge1.loc[:,['VIDEO TITLE','VIDEO', 'PUBLISH DATE', 'SHARES', 'DISLIKES', 'LIKES','SUBSCRIBERS']], how='inner')
df_merge2.head(1)

Unnamed: 0,COMMENTS,COMMENT ID,REPLY COUNT,LIKE COUNT,DATE,VIDEO,USER ID,VIDEO TITLE,PUBLISH DATE,SHARES,DISLIKES,LIKES,SUBSCRIBERS
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...,2021-12-27,767,53,4413,2507


In [58]:
df_diff_vi = pd.merge(left=df_agg_sub.loc[:,['VIDEO TITLE', 'EXTERNAL VIDEO ID']], right=df_agg, how='inner')
df_diff_vid = pd.merge(left=df_com, right=df_diff_vi.loc[:,['VIDEO TITLE','VIDEO', 'PUBLISH DATE', 'SHARES', 'DISLIKES', 'LIKES','SUBSCRIBERS']], how='inner')
df_diff_vid.dropna(inplace=True)

In [59]:
df_diff_vid.select_dtypes(include=['number'])

Unnamed: 0,REPLY COUNT,LIKE COUNT,SHARES,DISLIKES,LIKES,SUBSCRIBERS
0,0,0,767,53,4413,2507
1,0,0,767,53,4413,2507
2,0,0,767,53,4413,2507
3,0,0,767,53,4413,2507
4,0,0,767,53,4413,2507
...,...,...,...,...,...,...
3085359,1,0,13,1,82,27
3085360,1,0,13,1,82,27
3085361,1,0,13,1,82,27
3085362,1,0,13,1,82,27


In [60]:
numeric_col = df_diff_vid.select_dtypes(include=['number'])


In [61]:
selected_vid = df_merge2['VIDEO TITLE'][9]

In [62]:
filtered_agg_sub = df_agg_sub[df_agg_sub['VIDEO TITLE'] == selected_vid]

In [63]:
filtered_agg_sub.head(1)

Unnamed: 0,VIDEO TITLE,EXTERNAL VIDEO ID,VIDEO LENGTH,THUMBNAIL LINK,COUNTRY CODE,IS SUBSCRIBED,VIEWS,VIDEO LIKES ADDED,VIDEO DISLIKES ADDED,VIDEO LIKES REMOVED,USER SUBSCRIPTIONS ADDED,USER SUBSCRIPTIONS REMOVED,AVERAGE VIEW PERCENTAGE,AVERAGE WATCH TIME,USER COMMENTS ADDED
34713,How I Would Learn Data Science in 2022 (If I H...,xpIFS6jZbe8,734,https://i.ytimg.com/vi/xpIFS6jZbe8/hqdefault.jpg,BZ,False,7,1,0,0,0,0,0.064502,47.344286,0


In [64]:
df_diff_vid.head(1)

Unnamed: 0,COMMENTS,COMMENT ID,REPLY COUNT,LIKE COUNT,DATE,VIDEO,USER ID,VIDEO TITLE,PUBLISH DATE,SHARES,DISLIKES,LIKES,SUBSCRIBERS
0,Thanks for this video Ken.\n\nI decided to go ...,UgxFZTIzC4UPyhhX_PZ4AaABAg,0,0,2022-01-22 08:13:29+00:00,xpIFS6jZbe8,user_981,How I Would Learn Data Science in 2022 (If I H...,2021-12-27,767,53,4413,2507


In [65]:
df_diff_vid.describe()

Unnamed: 0,REPLY COUNT,LIKE COUNT,PUBLISH DATE,SHARES,DISLIKES,LIKES,SUBSCRIBERS
count,3085140.0,3085140.0,3085140,3085140.0,3085140.0,3085140.0,3085140.0
mean,1.352441,2.950825,2020-07-24 01:43:10.132052224,1485.088,134.2455,7535.465,6407.918
min,0.0,0.0,2017-11-18 00:00:00,0.0,0.0,18.0,-21.0
25%,1.0,1.0,2020-03-09 00:00:00,60.0,7.0,527.0,123.0
50%,1.0,1.0,2020-06-05 00:00:00,243.0,16.0,1559.0,603.0
75%,2.0,2.0,2020-12-19 00:00:00,584.0,64.0,3225.0,2362.0
max,21.0,501.0,2022-01-17 00:00:00,9583.0,942.0,46903.0,46453.0
std,1.121572,12.04192,,2986.207,288.1231,14451.55,14316.3


In [67]:
df_diff_vid[df_diff_vid['VIDEO TITLE'] == selected_vid]['COMMENT ID'].value_counts()

COMMENT ID
UgxFZTIzC4UPyhhX_PZ4AaABAg    373
Ugxdn_a08AFhYattkeh4AaABAg    373
UgxVLb6HxNYkx7fO0vl4AaABAg    373
UgzrvkS08yItWd0GI_d4AaABAg    373
UgwnBAE8tycx5R0PAxR4AaABAg    373
                             ... 
Ugw5552q-ZLAytk8Ljh4AaABAg    373
UgxPiRMrlMYGo_QFzTx4AaABAg    373
UgwbfK869LFHGbZYgB14AaABAg    373
Ugy8k-serm7iktXuHQZ4AaABAg    373
UgxrNlcsez6W2cqMqW94AaABAg    373
Name: count, Length: 95, dtype: int64