In [1]:
# import library
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import streamlit as st
from pandas.tseries.offsets import DateOffset

In [2]:
# read the raw CSV exports into dataframes

# read each CSV export into its own dataframe
df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')
df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')
df_com = pd.read_csv('resource/All_Comments_Final.csv')

# the first data row of the per-video export (the "Total" row, per the original note)
# is skipped so it cannot interfere with later calculations
df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:]

In [3]:
# clean our data

In [4]:
# upper-case every column label so all four frames share one naming style
for df in (df_agg, df_vid, df_agg_sub, df_com):
    df.columns = [label.upper() for label in df.columns]


In [5]:
# checking
df_com.columns

In [6]:
df_agg.info()

In [7]:
df_agg.head(1)

In [8]:
# checking columns
df_agg.columns.tolist() # notice \xad in our columns

In [9]:
# remove \xad
df_agg.columns = df_agg.columns.str.replace('\xad','')

In [10]:
# check 
df_agg.columns.tolist()

In [11]:
# convert date to datetype

df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

In [12]:
df_agg['VIDEO PUBLISH TIME']

In [13]:
df_agg['VIDEO PUBLISH TIME'][9]

In [14]:
df_agg.head(1)

In [15]:
# AVERAGE VIEW DURATION is an object meaning it contains numbers and string
df_agg['AVERAGE VIEW DURATION'][9] # a string

In [16]:
# Parse the duration strings with an explicit hours:minutes:seconds format instead of
# relying on pandas' format inference, then keep only the time-of-day part.
# NOTE: this cell is not idempotent -- once the column holds datetime.time objects it
# cannot be parsed again, so re-load df_agg before re-running it.
df_agg['AVERAGE VIEW DURATION'] = pd.to_datetime(
    df_agg['AVERAGE VIEW DURATION'], format='%H:%M:%S'
).dt.time


# check
df_agg['AVERAGE VIEW DURATION'][9]

In [17]:
# create a new column df_agg['AVERAGE VIEW SECONDS']: the average view duration expressed in seconds
df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(lambda x: x.second + x.minute * 60 + x.hour * 60 * 60)

# check
df_agg['AVERAGE VIEW SECONDS'][9]

In [18]:
df_agg.head(1)

In [19]:
df_agg.info() # our data type is corrected

In [20]:
# engagement ratio: every interaction a viewer can make (share, like, dislike, comment)
# divided by the number of views
df_agg['ENGAGEMENT RATIO'] = (
    df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']
).div(df_agg['VIEWS'])

# how many views it takes to gain one subscriber
df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'].div(df_agg['SUBSCRIBERS GAINED'])

# how many views it takes to lose one subscriber
df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'].div(df_agg['SUBSCRIBERS LOST'])

In [21]:
df_agg.head(1)

In [22]:
# newest videos first, then renumber the rows 0..n-1
df_agg = (
    df_agg
    .sort_values(by='VIDEO PUBLISH TIME', ascending=False)
    .reset_index(drop=True)
)

In [23]:
df_agg.head(2)

In [24]:
df_vid.head(1)

In [25]:
# convert the DATE strings to datetime.date objects (parsed as datetimes, then the time part is dropped)
df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed').dt.date

In [26]:
df_agg_sub.head(1)

In [27]:
df_com.head(1)

In [28]:
df_com.info()

In [29]:
df_com['DATE'] = pd.to_datetime(df_com['DATE']).dt.date

In [30]:
df_com.head(1)

In [31]:
df_com.info()

In [32]:
# engineer data
## what metrics wil be relevant
## difference from baseline
## percent change

# build dashboard
## local picture
## individual video

# improvement

In [33]:
def load_data():
    """Read the four CSV exports under ``resource/`` and return them cleaned.

    Cleaning applied:
      * column names of every frame are upper-cased;
      * soft hyphens (U+00AD) are stripped from the ``df_agg`` column names;
      * ``VIDEO PUBLISH TIME`` and ``AVERAGE VIEW DURATION`` in ``df_agg`` and the
        ``DATE`` columns of ``df_vid`` / ``df_com`` are parsed to datetimes;
      * ``AVERAGE VIEW SECONDS``, ``ENGAGEMENT RATIO``, ``VIEW TO SUBSCRIBER RATIO``
        and ``VIEW TO SUBSCRIBER LOST RATIO`` are added to ``df_agg``;
      * ``df_agg`` is sorted newest-first and re-indexed from 0.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_vid, df_agg, df_agg_sub, df_com)``.
    """
    # load data
    df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

    # drop the first row (the "Total" row) to avoid issues with calculations
    df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:, :]

    df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

    df_com = pd.read_csv('resource/All_Comments_Final.csv')

    # convert all column names to uppercase
    for df in [df_agg, df_vid, df_agg_sub, df_com]:
        df.columns = df.columns.str.upper()

    # remove soft hyphens hidden inside the column names
    df_agg.columns = df_agg.columns.str.replace('\xad', '')

    # the publish time strings are not in a single format, hence format='mixed'
    df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

    # vectorised parse of the hours:minutes:seconds strings (same format as before)
    df_agg['AVERAGE VIEW DURATION'] = pd.to_datetime(
        df_agg['AVERAGE VIEW DURATION'], format='%H:%M:%S'
    )

    # average view duration expressed in seconds
    df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(
        lambda x: x.second + x.minute * 60 + x.hour * 60 * 60
    )

    # engagement ratio: every interaction a viewer can make divided by the number of views
    df_agg['ENGAGEMENT RATIO'] = (
        df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']
    ) / df_agg['VIEWS']

    # how many views it takes to gain a subscriber
    df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED']

    # how many views it takes to lose a subscriber
    df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST']

    # newest videos first; the index is reset so positions and labels agree,
    # matching the exploratory cells that sort and then reset the index
    df_agg = (
        df_agg
        .sort_values(by='VIDEO PUBLISH TIME', ascending=False)
        .reset_index(drop=True)
    )

    # convert the DATE columns to datetime
    df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')

    df_com['DATE'] = pd.to_datetime(df_com['DATE'])

    return df_vid, df_agg, df_agg_sub, df_com

In [34]:
    
df_vid, df_agg, df_agg_sub, df_com = load_data()

In [35]:
# aggregated differential 

# create a copy of our dataframe
df_agg_diff = df_agg.copy()

In [36]:
# for the last 12 months, most recently date back to 12 months
metric_date_12mo = df_agg_diff['VIDEO PUBLISH TIME'].max() - DateOffset(months=12)
metric_date_12mo # 12 months early date

In [37]:
# dataframe from metric_date_12mo to df_agg_diff['VIDEO PUBLISH TIME'].max()
# that is, from 12 months before the latest publish time up to the latest publish time

df_agg_diff_12mo = df_agg_diff[df_agg_diff['VIDEO PUBLISH TIME'] >= metric_date_12mo] # we have 48 rows
df_agg_diff_12mo.shape

In [38]:
median_agg = df_agg_diff_12mo[df_agg_diff_12mo.columns[2:]].median()
median_agg

In [39]:
# aggregated differential 

# create a copy of our dataframe
df_agg_diff = df_agg.copy()

In [40]:
df_agg['VIDEO PUBLISH TIME']

In [41]:
# cutoff: twelve months before the most recent publish time
metric_date_12mo = df_agg_diff['VIDEO PUBLISH TIME'].max() - DateOffset(months=12)

# keep only the videos published on or after the cutoff
df_agg_diff_12mo = df_agg_diff.loc[df_agg_diff['VIDEO PUBLISH TIME'] >= metric_date_12mo]

# per-column median, skipping the first two columns
# (presumably the VIDEO / VIDEO TITLE identifier columns -- confirm against the CSV)
median_agg = df_agg_diff_12mo.iloc[:, 2:].median()

In [42]:
median_agg

In [43]:
# local picture

metric_agg = df_agg[[
    'VIDEO PUBLISH TIME',
    'COMMENTS ADDED', 
    'SHARES', 
    'DISLIKES', 
    'LIKES', 
    'SUBSCRIBERS GAINED', 
    'RPM (USD)', 
    'VIEWS', 
    'YOUR ESTIMATED REVENUE (USD)',
    'AVERAGE VIEW SECONDS', 
    'ENGAGEMENT RATIO', 
    'VIEW TO SUBSCRIBER RATIO',
]]

def metric_median(n, frame=None):
    """Median of every column over the most recent ``n`` months.

    Parameters
    ----------
    n : int
        Window length in months, counted back from the latest
        'VIDEO PUBLISH TIME' in the table.
    frame : pandas.DataFrame, optional
        Table to summarise; it must contain a 'VIDEO PUBLISH TIME' column.
        When omitted, the notebook-level ``metric_agg`` is used, which is the
        behaviour callers relied on before this parameter existed.

    Returns
    -------
    tuple
        ``(metric_date_n, median_date_n)``: the start of the window, and the
        column medians of the rows published on or after it.
    """
    # fall back to the notebook global only when no table is passed explicitly
    source = metric_agg if frame is None else frame
    publish_time = source['VIDEO PUBLISH TIME']

    # start of the window: n months before the newest video
    metric_date_n = publish_time.max() - DateOffset(months=n)
    # medians of the rows inside the window
    median_date_n = source[publish_time >= metric_date_n].median()

    return metric_date_n, median_date_n

metric_12mo, median_12mo = metric_median(12)
metric_6mo, median_6mo = metric_median(6)

In [44]:
len(metric_agg.columns)

In [45]:
median_6mo.index

In [46]:
median_6mo['VIEWS']

In [47]:
# show how each 6-month median differs from its 12-month counterpart
for i in median_6mo.index:
    if i == 'VIDEO PUBLISH TIME':
        # for the publish time the difference is also shown as a whole number of days
        delta = median_6mo[i] - median_12mo[i]
        print(delta)
        print(delta.days)
        continue
    print(median_6mo[i] - median_12mo[i])

In [48]:
df_agg_diff.dtypes

In [49]:
df_agg_diff.columns.tolist()

In [50]:
df_agg_diff_final = df_agg_diff.loc[:,[
 'VIDEO',
 'VIDEO TITLE',
 'VIDEO PUBLISH TIME',
 'COMMENTS ADDED',
 'SHARES',
 'DISLIKES',
 'LIKES',
 'SUBSCRIBERS LOST',
 'SUBSCRIBERS GAINED',
 'VIEWS',
 'SUBSCRIBERS',
 'YOUR ESTIMATED REVENUE (USD)',
 'IMPRESSIONS',
 'IMPRESSIONS CLICK-THROUGH RATE (%)',
 'AVERAGE VIEW SECONDS',
 'ENGAGEMENT RATIO',
 'VIEW TO SUBSCRIBER RATIO',
 'VIEW TO SUBSCRIBER LOST RATIO']
]
df_agg_diff_final.head(1)

In [51]:
df_agg['PUBLISH DATE'] = df_agg.loc[:,'VIDEO PUBLISH TIME']

In [52]:
df_agg['VIDEO PUBLISH TIME'][9]

In [53]:
df_agg['PUBLISH DATE'][9]

In [54]:
df_agg.head(1)

In [55]:
def load_data():
    """Read the four CSV exports under ``resource/`` and return them cleaned.

    Cleaning applied:
      * column names of every frame are upper-cased;
      * soft hyphens (U+00AD) are stripped from the ``df_agg`` column names;
      * ``VIDEO PUBLISH TIME`` in ``df_agg`` and the ``DATE`` columns of
        ``df_vid`` / ``df_com`` are parsed to datetimes;
      * ``AVERAGE VIEW DURATION`` in ``df_agg`` becomes ``datetime.time`` objects;
      * ``AVERAGE VIEW SECONDS``, ``ENGAGEMENT RATIO``, ``VIEW TO SUBSCRIBER RATIO``
        and ``VIEW TO SUBSCRIBER LOST RATIO`` are added to ``df_agg``;
      * ``df_agg`` is sorted newest-first and re-indexed from 0.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_vid, df_agg, df_agg_sub, df_com)``.
    """
    # load data
    df_vid = pd.read_csv('resource/Video_Performance_Over_Time.csv')

    # drop the first row (the "Total" row) to avoid issues with calculations
    df_agg = pd.read_csv('resource/Aggregated_Metrics_By_Video.csv').iloc[1:, :]

    df_agg_sub = pd.read_csv('resource/Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')

    df_com = pd.read_csv('resource/All_Comments_Final.csv')

    # convert all column names to uppercase
    for df in [df_agg, df_vid, df_agg_sub, df_com]:
        df.columns = df.columns.str.upper()

    # remove soft hyphens hidden inside the column names
    df_agg.columns = df_agg.columns.str.replace('\xad', '')

    # the publish time strings are not in a single format, hence format='mixed'
    df_agg['VIDEO PUBLISH TIME'] = pd.to_datetime(df_agg['VIDEO PUBLISH TIME'], format='mixed')

    # explicit hours:minutes:seconds format instead of format inference;
    # only the time-of-day part is kept
    df_agg['AVERAGE VIEW DURATION'] = pd.to_datetime(
        df_agg['AVERAGE VIEW DURATION'], format='%H:%M:%S'
    ).dt.time

    # average view duration expressed in seconds
    df_agg['AVERAGE VIEW SECONDS'] = df_agg['AVERAGE VIEW DURATION'].apply(
        lambda x: x.second + x.minute * 60 + x.hour * 60 * 60
    )

    # engagement ratio: every interaction a viewer can make divided by the number of views
    df_agg['ENGAGEMENT RATIO'] = (
        df_agg['SHARES'] + df_agg['LIKES'] + df_agg['DISLIKES'] + df_agg['COMMENTS ADDED']
    ) / df_agg['VIEWS']

    # how many views it takes to gain a subscriber
    df_agg['VIEW TO SUBSCRIBER RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS GAINED']

    # how many views it takes to lose a subscriber
    df_agg['VIEW TO SUBSCRIBER LOST RATIO'] = df_agg['VIEWS'] / df_agg['SUBSCRIBERS LOST']

    # newest videos first, then renumber the rows 0..n-1
    df_agg = (
        df_agg
        .sort_values(by='VIDEO PUBLISH TIME', ascending=False)
        .reset_index(drop=True)
    )

    # convert the DATE columns to datetime
    df_vid['DATE'] = pd.to_datetime(df_vid['DATE'], format='mixed')

    df_com['DATE'] = pd.to_datetime(df_com['DATE'])

    return df_vid, df_agg, df_agg_sub, df_com

df_vid, df_agg, df_agg_sub, df_com = load_data()

In [56]:
df_agg.head(1)

In [57]:
# engineer data

# aggregated differential 

# create a copy of our dataframe
df_agg_diff = df_agg.copy()

# for the last 12 months, most recently date back to 12 months
metric_date_12mo = df_agg_diff['VIDEO PUBLISH TIME'].max() - DateOffset(months=12)

# dataframe from metric_date_12mo to df_agg_diff['VIDEO PUBLISH TIME'].max()
# that is, from 12 months early to current date

df_agg_diff_12mo = df_agg_diff[df_agg_diff['VIDEO PUBLISH TIME'] >= metric_date_12mo]

In [58]:
df_agg_diff_12mo[df_agg_diff_12mo.columns[2:]].describe()

In [59]:
# median of the numeric / datetime columns of the 12-month window.
# After load_data(), 'AVERAGE VIEW DURATION' holds datetime.time objects (object dtype);
# with pandas >= 2.0 (needed anyway for format='mixed') DataFrame.median() no longer
# silently skips such a column and raises TypeError, so object columns are excluded first.
median_agg = (
    df_agg_diff_12mo[df_agg_diff_12mo.columns[2:]]
    .select_dtypes(exclude='object')
    .median()
)

In [60]:
# needed columns to display

# convert the publish timestamp to datetime.date objects
df_agg['PUBLISH DATE'] = df_agg['VIDEO PUBLISH TIME'].dt.date

# 'VIDEO PUBLISH TIME' (not 'PUBLISH TIME', which does not exist in df_agg and raised
# KeyError) must be selected: metric_median() filters metric_agg on that column.
metric_agg = df_agg[[
    'VIDEO PUBLISH TIME',
    'COMMENTS ADDED', 
    'SHARES', 
    'DISLIKES', 
    'LIKES', 
    'SUBSCRIBERS GAINED', 
    'RPM (USD)', 
    'VIEWS', 
    'YOUR ESTIMATED REVENUE (USD)',
    'AVERAGE VIEW SECONDS', 
    'ENGAGEMENT RATIO', 
    'VIEW TO SUBSCRIBER RATIO',
]]  

def metric_median(n, frame=None):
    """Median of every column over the most recent ``n`` months.

    Parameters
    ----------
    n : int
        Window length in months, counted back from the latest
        'VIDEO PUBLISH TIME' in the table.
    frame : pandas.DataFrame, optional
        Table to summarise; it must contain a 'VIDEO PUBLISH TIME' column.
        When omitted, the notebook-level ``metric_agg`` is used, which is the
        behaviour callers relied on before this parameter existed.

    Returns
    -------
    tuple
        ``(metric_date_n, median_date_n)``: the start of the window, and the
        column medians of the rows published on or after it.
    """
    # fall back to the notebook global only when no table is passed explicitly
    source = metric_agg if frame is None else frame
    publish_time = source['VIDEO PUBLISH TIME']

    # date range: the window starts n months before the newest video
    metric_date_n = publish_time.max() - DateOffset(months=n)
    # median of date range
    median_date_n = source[publish_time >= metric_date_n].median()

    return metric_date_n, median_date_n

metric_12mo, median_12mo = metric_median(12)
metric_6mo, median_6mo = metric_median(6)

# six side-by-side containers; the metrics fill them left to right and wrap
# back to the first container after every sixth metric
columns = st.columns(6)

for slot, i in enumerate(median_6mo.index):
    with columns[slot % 6]:
        if i == 'VIDEO PUBLISH TIME':
            # the publish-time medians are timestamps, so their difference is a Timedelta
            delta = median_6mo[i] - median_12mo[i]

            st.metric(label = 'Duration', value = delta.days, delta=f"{(delta.days)//30} Months")
        else:
            # relative change of the 6-month median against the 12-month median
            delta = (median_6mo[i] - median_12mo[i])/median_12mo[i]

            st.metric(label = i, value = round(median_12mo[i], 1), delta="{:.2%}".format(delta))


df_agg_diff_final = df_agg_diff.loc[:,[
'VIDEO',
'VIDEO TITLE',
'VIDEO PUBLISH TIME',
'COMMENTS ADDED',
'SHARES',
'DISLIKES',
'LIKES',
'SUBSCRIBERS LOST',
'SUBSCRIBERS GAINED',
'VIEWS',
'SUBSCRIBERS',
'YOUR ESTIMATED REVENUE (USD)',
'IMPRESSIONS',
'IMPRESSIONS CLICK-THROUGH RATE (%)',
'AVERAGE VIEW SECONDS',
'ENGAGEMENT RATIO',
'VIEW TO SUBSCRIBER RATIO',
'VIEW TO SUBSCRIBER LOST RATIO']
]
#st.dataframe(df_agg_diff_final)