In [144]:
# These are some common libraries which are required to do any kind of data analysis 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline


In [145]:
# This function will help reading data from a CSV file
# It will return the data as a Panda Data frame
def load_data_from_csv(filePath, fileName):
    df = pd.read_csv(filePath + fileName)
    return df


In [146]:
# This is where we are loading the data from a CSV file
df = load_data_from_csv('./data/', 'youtube.csv')

# removing the comma from the original data set
df = df.replace(',', '', regex=True)

# converting certain columns to integer
columns_to_exclude = ['Month', 'Month.1']
df.loc[:, df.columns.difference(columns_to_exclude)] = df.loc[:, df.columns.difference(columns_to_exclude)].astype(float)
df['Month.1'] = pd.to_datetime(df['Month.1'], format='%m/%d/%Y').dt.strftime('%m').astype(int)
df['Views'] = df['Views'].astype(int)
df = df.drop('Month', axis=1)


In [147]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Month.1                11 non-null     int64  
 1   Views                  11 non-null     int64  
 2   Views increased        11 non-null     object 
 3   Watch time (hours)     11 non-null     object 
 4   Watch time increase    11 non-null     int64  
 5   Subscribers            11 non-null     int64  
 6   Subscribers increased  11 non-null     int64  
 7   Revenue                11 non-null     float64
 8   Revenue Increase       11 non-null     float64
 9   Videos                 11 non-null     int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 1012.0+ bytes


In [148]:
df.head(20)


Unnamed: 0,Month.1,Views,Views increased,Watch time (hours),Watch time increase,Subscribers,Subscribers increased,Revenue,Revenue Increase,Videos
0,1,44700,0.0,2200.0,0,206,0,7281.85,0.0,9
1,2,44400,-300.0,2100.0,-100,201,-5,7290.49,8.64,7
2,3,54900,10500.0,2400.0,300,248,47,8040.0,749.51,10
3,4,51200,-3700.0,2200.0,-200,200,-48,7434.57,-605.43,9
4,5,52700,1500.0,2400.0,200,211,11,8266.61,832.04,9
5,6,49202,-3498.0,2200.0,-200,206,-5,7836.71,-429.9,8
6,7,62007,12805.0,2200.0,0,242,36,7898.54,61.83,8
7,8,65989,3982.0,2969.0,769,357,115,9950.24,2051.7,10
8,9,57122,-8867.0,2615.0,-354,353,-4,8476.46,-1473.78,10
9,10,51236,-5886.0,2221.0,-394,212,-141,8056.71,-419.75,9


In [163]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

df['VideosK'] = sc.fit_transform(df['Videos'].values.reshape(-1, 1))
df['ViewsK'] = sc.fit_transform(df['Views'].values.reshape(-1, 1))

df.head(10)


Unnamed: 0,Month.1,Views,Views increased,Watch time (hours),Watch time increase,Subscribers,Subscribers increased,Revenue,Revenue Increase,Videos,VideosK,ViewsK
0,1,44700,0.0,2200.0,0,206,0,7281.85,0.0,9,0.316228,-1.233033
1,2,44400,-300.0,2100.0,-100,201,-5,7290.49,8.64,7,-1.075174,-1.278912
2,3,54900,10500.0,2400.0,300,248,47,8040.0,749.51,10,1.011929,0.326853
3,4,51200,-3700.0,2200.0,-200,200,-48,7434.57,-605.43,9,0.316228,-0.238988
4,5,52700,1500.0,2400.0,200,211,11,8266.61,832.04,9,0.316228,-0.009593
5,6,49202,-3498.0,2200.0,-200,206,-5,7836.71,-429.9,8,-0.379473,-0.544542
6,7,62007,12805.0,2200.0,0,242,36,7898.54,61.83,8,-0.379473,1.413727
7,8,65989,3982.0,2969.0,769,357,115,9950.24,2051.7,10,1.011929,2.022695
8,9,57122,-8867.0,2615.0,-354,353,-4,8476.46,-1473.78,10,1.011929,0.666664
9,10,51236,-5886.0,2221.0,-394,212,-141,8056.71,-419.75,9,0.316228,-0.233482


In [165]:
fig = px.line(df, 
              x='Month.1', 
              y=['ViewsK', 'VideosK'], 
              line_group='variable', 
              title='Comparing Views vs Videos created',
              labels={'value': 'ASd'}
              )
fig.update_layout(
    xaxis_title='Month of 2023', 
    yaxis_title='Count of Views and Videos'
)
fig.show()
