In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide settings: silence every warning, use the ggplot matplotlib
# theme, and show all DataFrame columns when a frame is rendered.
# NOTE(review): ignoring all warnings also hides library deprecation warnings.
warnings.filterwarnings(action="ignore")
plt.style.use("ggplot")
pd.options.display.max_columns = None

In [None]:
# Read the channel statistics (file is decoded as latin1) and discard the
# 'rank' column.
data = (
    pd.read_csv('Global YouTube Statistics.csv', encoding='latin1')
    .drop(columns='rank')
)

In [None]:
# First five rows of the freshly loaded frame.
data.head(n=5)

In [None]:
# Remove the rank helpers, the creation-date parts and the country-level
# attributes from the working frame.
data = data.drop(
    columns=[
        'video_views_rank',
        'country_rank',
        'channel_type_rank',
        'created_year',
        'created_month',
        'created_date',
        'Gross tertiary education enrollment (%)',
        'Population',
        'Unemployment rate',
        'Urban_population',
        'Latitude',
        'Longitude',
    ]
)

In [None]:
# Print the column names, non-null counts and dtypes of the working frame.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Rows that have no category.
data[data['category'].isna()]

In [None]:
# Label missing categories with a placeholder. The result is assigned back
# instead of calling fillna(inplace=True) on the column selection: under
# pandas copy-on-write the in-place call only changes a temporary object and
# leaves `data` untouched.
data['category'] = data['category'].fillna('No Category Found')

In [None]:
# Rows that have no country.
data[data['Country'].isna()]

In [None]:
# Label missing countries with a placeholder. Assigned back rather than
# filled in place on the column selection, which does not modify `data`
# under pandas copy-on-write.
data['Country'] = data['Country'].fillna('No Country Found')

In [None]:
# Missing country abbreviations get the same placeholder text as missing
# countries. Assigned back rather than filled in place on the column
# selection, which does not modify `data` under pandas copy-on-write.
data['Abbreviation'] = data['Abbreviation'].fillna('No Country Found')

In [None]:
# Label missing channel types with a placeholder. Assigned back rather than
# filled in place on the column selection, which does not modify `data`
# under pandas copy-on-write.
data['channel_type'] = data['channel_type'].fillna('No Channel Found')

In [None]:
# Rows with no value for the last-30-days video views.
data.loc[data['video_views_for_the_last_30_days'].isna()]

In [None]:
# Treat missing last-30-days video views as 0. Assigned back rather than
# filled in place on the column selection, which does not modify `data`
# under pandas copy-on-write.
data['video_views_for_the_last_30_days'] = (
    data['video_views_for_the_last_30_days'].fillna(0)
)

In [None]:
# Treat missing last-30-days subscriber gains as 0. Assigned back rather than
# filled in place on the column selection, which does not modify `data`
# under pandas copy-on-write.
data['subscribers_for_last_30_days'] = (
    data['subscribers_for_last_30_days'].fillna(0)
)

In [None]:
# Number of rows that repeat an earlier row in full.
data.duplicated(keep='first').sum()

In [None]:
from dash import Dash, html, dcc
from jupyter_dash import JupyterDash
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate

In [None]:
#General Data Visualization 

In [None]:
# dtype of the channel-name column.
data['Youtuber'].dtype

In [None]:
# Static seaborn version (kept for reference): top 15 YouTubers for each numeric column
# for x in data.columns:
#     if data[x].dtype != 'O':
#         df = data.sort_values(x, ascending=False)[:15]
#         plt.figure(figsize=(16, 4))
#         sns.barplot(
#             data= df.loc[:, ['Youtuber', x]],
#             x='Youtuber',
#             y=x)
#         plt.xticks(rotation=45)
#         plt.show()
    

In [None]:
# Top 15 YouTubers ranked by a user-selected numeric column.

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(
        id='dropdown',
        # Only numeric columns can be ranked. select_dtypes does not depend on
        # how pandas stores text columns (object vs. a dedicated string dtype).
        options=[
            {'label': column.title().replace('_', ' '), 'value': column}
            for column in data.select_dtypes(include='number').columns
        ],
        value='subscribers',
    ),
    dcc.Graph(id='visual'),
])


@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(variable):
    """Build a bar chart of the 15 channels with the largest `variable`.

    Parameters
    ----------
    variable : str or None
        Column chosen in the dropdown; None when the dropdown is cleared.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    PreventUpdate
        When no column is selected, so the current figure is left as it is
        instead of failing inside sort_values.
    """
    if variable is None:
        raise PreventUpdate

    label = variable.title().replace('_', ' ')
    top_channels = data.sort_values(variable, ascending=False).head(15)

    fig = px.bar(
        top_channels,
        x='Youtuber',
        y=variable,
        color='Youtuber',
        labels={variable: label},
        title=f'Top 15 Youtuber with Highest {label}',
        text_auto=True,
    )
    # Every bar already carries its channel name on the x axis.
    fig.update_traces(showlegend=False)
    fig.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=600,
    )
    return fig


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8065)

In [None]:
# Share of channels per category / channel type.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(
        id='dropdown',
        options=['category', 'channel_type'],
        value='category',
    ),
    dcc.Graph(id='visual'),
])


@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(variable):
    """Build a pie chart of how often each value of `variable` occurs.

    Parameters
    ----------
    variable : str or None
        'category' or 'channel_type'; None when the dropdown is cleared.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    PreventUpdate
        When no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    # Name both columns explicitly: the default names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    counts = (
        data[variable]
        .value_counts()
        .rename_axis(variable)
        .reset_index(name='count')
    )

    fig = px.pie(
        counts,
        values='count',
        names=variable,
        title=f'{variable.title()} Value Counts in Percentage',
    )
    fig.update_layout(
        title={'x': 0.438, 'y': 0.90, 'xanchor': 'center'},
        width=800,
        height=600,
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    return fig


if __name__ == '__main__':
    # Port 8065 is already used by the app in the previous cell; use a port no
    # other cell of this notebook uses so that app is not displaced.
    app.run_server(mode='inline', debug=True, port=8074)

In [None]:
# The 15 countries with the most channels in the data set.
# Both columns are named explicitly because the default names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
# value_counts() already returns the counts in descending order.
df = (
    data['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='count')
    .head(15)
)

px.pie(
    df,
    values='count',
    names='Country',
    # The title now states the number of slices actually drawn.
    title='Top 15 Countries with Most Youtuber Value Counts'
    ).update_layout(
            title={
            'x':0.438,
            'y':0.90,
            'xanchor':'center'},
            width=800, height=600)\
    .update_traces(textposition='inside', textinfo='percent+label')

In [None]:
# Country name -> three-letter country code, used as map locations below.
country_codes = {
    'India': 'IND',
    'United States': 'USA',
    'Japan': 'JPN',
    'Russia': 'RUS',
    'South Korea': 'KOR',
    'United Kingdom': 'GBR',
    'Canada': 'CAN',
    'Brazil': 'BRA',
    'Argentina': 'ARG',
    'Chile': 'CHL',
    'Cuba': 'CUB',
    'El Salvador': 'SLV',
    'Pakistan': 'PAK',
    'Philippines': 'PHL',
    'Thailand': 'THA',
    'Colombia': 'COL',
    'Barbados': 'BRB',
    'Mexico': 'MEX',
    'United Arab Emirates': 'ARE',
    'Spain': 'ESP',
    'Saudi Arabia': 'SAU',
    'Indonesia': 'IDN',
    'Turkey': 'TUR',
    'Venezuela': 'VEN',
    'Kuwait': 'KWT',
    'Jordan': 'JOR',
    'Netherlands': 'NLD',
    'Singapore': 'SGP',
    'Australia': 'AUS',
    'Italy': 'ITA',
    'Germany': 'DEU',
    'France': 'FRA',
    'Sweden': 'SWE',
    'Afghanistan': 'AFG',
    'Ukraine': 'UKR',
    'Latvia': 'LVA',
    'Switzerland': 'CHE',
    'Vietnam': 'VNM',
    'Malaysia': 'MYS',
    'China': 'CHN',
    'Iraq': 'IRQ',
    'Egypt': 'EGY',
    'Andorra': 'AND',
    'Ecuador': 'ECU',
    'Morocco': 'MAR',
    'Peru': 'PER',
    'Bangladesh': 'BGD',
    'Finland': 'FIN',
    'Samoa': 'WSM'
}

# Channel count per country, largest first. The columns are named explicitly
# ('index' = country name, 'Country' = number of channels) because the default
# names of value_counts().reset_index() differ between pandas versions and
# the following cells read df['index'] and the 'Country' column.
df = (
    data['Country']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Country')
)

# Remove the placeholder used for channels without a country by its label
# rather than by a hard-coded row position (presumably what the original
# `df.drop(2)` targeted). If the placeholder is absent, no real country is
# dropped. The original row labels are kept; .copy() gives an independent
# frame so a column can be added to it later.
df = df.loc[df['index'] != 'No Country Found'].copy()

In [None]:
# Look up each country's code in country_codes and store it in a new 'Code'
# column; names that are not in the dictionary become NaN.
df = df.assign(Code=df['index'].map(country_codes))

In [None]:
# First five rows of the per-country frame.
df.head(n=5)

In [None]:
# World map coloured by the number of channels per country.
# As built by the cells above, `df` has 'index' (country name), 'Country'
# (number of channels) and 'Code' (three-letter country code).
px.choropleth(
    df,
    locations="Code",
    color="Country",  # number of channels drives the colour scale
    hover_name="index",  # country name as hover title (was the count column)
    labels={'Country': 'Youtuber Count', 'Code': 'Country Code'},
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Youtuber Value Counts Map Distribution'
).update_layout(
            title={
            'x':0.5,
            'y':0.90,
            'xanchor':'center'})

In [None]:
#EDA 

In [None]:
# First five rows of the cleaned frame.
data.head(n=5)

In [None]:
# Total and average of a selected numeric variable per category.

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Variable Selection:'),
    dcc.Dropdown(
        id='dropdown',
        # Numeric columns only; select_dtypes does not depend on how pandas
        # stores text columns (object vs. a dedicated string dtype).
        options=[
            {'label': column.title().replace('_', ' '), 'value': column}
            for column in data.select_dtypes(include='number').columns
        ],
        value='uploads',
    ),
    dcc.Graph(id='visual1'),
    dcc.Graph(id='visual2'),
])


@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value'))
def plot(variable):
    """Build the per-category total and average bar charts for `variable`.

    Parameters
    ----------
    variable : str or None
        Numeric column chosen in the dropdown; None when it is cleared.

    Returns
    -------
    tuple of plotly.graph_objects.Figure
        (totals figure, averages figure), in the order of the two Outputs.

    Raises
    ------
    PreventUpdate
        When no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    label = variable.title().replace('_', ' ')
    # Aggregate only the selected column. Calling sum()/mean() on the whole
    # frame also processes the text columns, which mean() rejects in
    # pandas >= 2.0.
    per_category = data.groupby('category', as_index=False)[variable]
    totals = per_category.sum().sort_values(variable, ascending=False)
    averages = per_category.mean().sort_values(variable, ascending=False)

    fig1 = px.bar(
        totals,
        y='category',
        x=variable,
        color='category',
        labels={'category': 'Category', variable: label},
        title=f'Category with Total {label}',
    )
    fig1.update_traces(showlegend=False)
    fig1.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=600,
    )
    # Dashed reference line at the mean of the per-category totals.
    fig1.add_vline(
        x=totals[variable].mean(),
        line_dash='dash',
        line_color='grey',
        opacity=0.4,
    )

    fig2 = px.bar(
        averages,
        y='category',
        x=variable,
        color='category',
        labels={'category': 'Category', variable: label},
        title=f'Category with Avg {label}',
    )
    fig2.update_traces(showlegend=False)
    fig2.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=600,
    )

    return fig1, fig2


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8066)

In [None]:
# Total and average of a selected numeric variable per channel type.

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Variable Selection:'),
    dcc.Dropdown(
        id='dropdown',
        # Numeric columns only; select_dtypes does not depend on how pandas
        # stores text columns (object vs. a dedicated string dtype).
        options=[
            {'label': column.title().replace('_', ' '), 'value': column}
            for column in data.select_dtypes(include='number').columns
        ],
        value='subscribers',
    ),
    dcc.Graph(id='visual1'),
    dcc.Graph(id='visual2'),
])


@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value'))
def plot(variable):
    """Build the per-channel-type total and average bar charts.

    Parameters
    ----------
    variable : str or None
        Numeric column chosen in the dropdown; None when it is cleared.

    Returns
    -------
    tuple of plotly.graph_objects.Figure
        (totals figure, averages figure), in the order of the two Outputs.

    Raises
    ------
    PreventUpdate
        When no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    label = variable.title().replace('_', ' ')
    # Aggregate only the selected column. Calling sum()/mean() on the whole
    # frame also processes the text columns, which mean() rejects in
    # pandas >= 2.0.
    per_type = data.groupby('channel_type', as_index=False)[variable]
    totals = per_type.sum().sort_values(variable, ascending=False)
    averages = per_type.mean().sort_values(variable, ascending=False)

    fig1 = px.bar(
        totals,
        y='channel_type',
        x=variable,
        color='channel_type',
        labels={'channel_type': 'Channel Type', variable: label},
        title=f'Channel Type with Total {label}',
    )
    fig1.update_traces(showlegend=False)
    fig1.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=600,
    )
    # Dashed reference line at the mean of the per-type totals.
    fig1.add_vline(
        x=totals[variable].mean(),
        line_dash='dash',
        line_color='grey',
        opacity=0.4,
    )

    # Same "Channel Type" wording as the totals chart (was "Channel_Type").
    fig2 = px.bar(
        averages,
        y='channel_type',
        x=variable,
        color='channel_type',
        labels={'channel_type': 'Channel Type', variable: label},
        title=f'Channel Type with Avg {label}',
    )
    fig2.update_traces(showlegend=False)
    fig2.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=600,
    )

    return fig1, fig2


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8067)

In [None]:
# Top 10 YouTubers within each category / country / channel type, ranked by a chosen variable

In [None]:
# for x in data.category.unique():
#     df = data.loc[data['category'] == x].sort_values('subscribers', ascending=False)[:10]
#     plt.figure(figsize=(16,8))
#     sns.barplot(
#         data=df, 
#         y='subscribers',
#         x='Youtuber'
#     )
#     plt.title(f'Top 10 Youtubers with Subscribers Cnt in {x}')
#     plt.show()

In [None]:
# Ten most-subscribed and ten most-viewed channels of a selected category.
app = JupyterDash(__name__)

app.layout = html.Div(children=[
    html.H3('Category Selection:'),
    dcc.Dropdown(
        id='dropdown',
        options=data['category'].unique(),
        value='Music',
    ),
    dcc.Graph(id='visual1'),
    dcc.Graph(id='visual2'),
])


@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value'))
def plot(category):
    """Return two horizontal bar charts for the channels in `category`.

    The first chart ranks the ten channels with the most subscribers and
    colours them by video views; the second ranks the ten channels with the
    most video views and colours them by subscribers.
    """
    in_category = data[data['category'] == category]

    most_subscribed = in_category.sort_values('subscribers', ascending=False).head(10)
    fig1 = px.bar(
        most_subscribed,
        x='subscribers',
        y='Youtuber',
        color='video views',
        labels={'subscribers': 'Subscribers Cnt', 'video views': 'Video Views'},
        title=f"Top 10 Youtubers with Most Subscribers Cnt in {category}",
    )
    fig1.update_traces(showlegend=False)
    fig1.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'}, height=500)

    most_viewed = in_category.sort_values('video views', ascending=False).head(10)
    fig2 = px.bar(
        most_viewed,
        x='video views',
        y='Youtuber',
        color='subscribers',
        labels={'subscribers': 'Subscribers Cnt', 'video views': 'Video Views'},
        title=f"Top 10 Youtubers with Most Video Views in {category}",
    )
    fig2.update_traces(showlegend=False)
    fig2.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'}, height=500)

    return fig1, fig2


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8068)

In [None]:
# Ten most-subscribed and ten most-viewed channels of a selected country.

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Country Selection:'),
    dcc.Dropdown(
        id='dropdown',
        options=[
            {'label': country.title(), 'value': country}
            for country in data['Country'].unique()
        ],
        value='United States',
    ),
    dcc.Graph(id='visual1'),
    dcc.Graph(id='visual2'),
])


@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value'))
def plot(country):
    """Build the top-10 subscriber and video-view charts for `country`.

    Parameters
    ----------
    country : str or None
        Value of the 'Country' column chosen in the dropdown; None when the
        dropdown is cleared.

    Returns
    -------
    tuple of plotly.graph_objects.Figure
        (by subscribers, by video views), in the order of the two Outputs.

    Raises
    ------
    PreventUpdate
        When no country is selected, instead of drawing two empty charts
        titled "... in None".
    """
    if country is None:
        raise PreventUpdate

    in_country = data.loc[data['Country'] == country]

    fig1 = px.bar(
        in_country.sort_values('subscribers', ascending=False).head(10),
        x='subscribers',
        y='Youtuber',
        color='video views',
        labels={'subscribers': 'Subscribers Cnt',
                'video views': 'Video Views'},
        # Spelling fixed: the title used to read "Subsribers".
        title=f"Top 10 YouTuber with Most Subscribers in {country}",
    )
    fig1.update_traces(showlegend=False)
    fig1.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=500,
    )

    fig2 = px.bar(
        in_country.sort_values('video views', ascending=False).head(10),
        x='video views',
        y='Youtuber',
        color='subscribers',
        labels={'subscribers': 'Subscribers Cnt',
                'video views': 'Video Views'},
        title=f"Top 10 YouTuber with Most Video Views in {country}",
    )
    fig2.update_traces(showlegend=False)
    fig2.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=500,
    )

    return fig1, fig2


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8069)

In [None]:
# Top 10 channels by subscribers within a chosen category, channel type or
# country. The radio buttons pick the grouping column and the dropdown then
# lists that column's values.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Category Selection:'),
    dcc.RadioItems(id='items',
                  options=[{'label':'Category', 'value':'category'},
                           {'label':'Channel Type', 'value':'channel_type'},
                           {'label':'Country', 'value':'Country'}],
                  value='category'),
    dcc.Dropdown(id='dropdown'),
    dcc.Graph(id='visual')
])


def unique_value(column):
    """Return dropdown options for the distinct non-missing values of a column.

    The original comprehension reused the name `column` as its loop variable
    and iterated over `data.columns`, so it listed the frame's column names
    whatever argument was passed.
    """
    return [
        {'label': str(value), 'value': value}
        for value in data[column].dropna().unique()
    ]


@app.callback(Output('dropdown', 'options'),
              Output('dropdown', 'value'),
              Input('items', 'value'))
def update_dropdown(selected_item):
    """Fill the dropdown with the values of the selected grouping column.

    Returns the option list and its first value as the initial selection
    (None when the column has no values).
    """
    # The radio values are the column names themselves, so no branching on
    # the individual choices is needed.
    column_value = unique_value(selected_item)
    initial_value = column_value[0]['value'] if column_value else None
    return column_value, initial_value


@app.callback(Output('visual', 'figure'),
              Input('items', 'value'),
              Input('dropdown', 'value'))
def plot(selected_item, variable):
    """Build a bar chart of the ten most-subscribed channels in a group.

    Parameters
    ----------
    selected_item : str or None
        Grouping column chosen with the radio buttons.
    variable : str or None
        Value of that column chosen in the dropdown.

    Raises
    ------
    PreventUpdate
        When either selection is missing.
    """
    if selected_item is None or variable is None:
        raise PreventUpdate

    top_channels = (
        data.loc[data[selected_item] == variable]
        .sort_values('subscribers', ascending=False)
        .head(10)
    )

    # `variable` is a cell value (e.g. a category name), not a column, so the
    # bars are laid out by channel name.
    fig = px.bar(
        top_channels,
        x='subscribers',
        y='Youtuber',
        color='Youtuber',
        labels={'subscribers': 'Subscribers Cnt'},
        title=f'Top 10 Youtubers with Most Subscribers Cnt in {variable}',
    )

    return fig


if __name__ == '__main__':
    # Port 8068 is already used by the per-category app earlier in the
    # notebook; use a port no other cell uses so that app is not displaced.
    app.run_server(mode='inline', debug=True, port=8075)

In [None]:
# Average and total of the last-30-days metrics per category.

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Category Selection:'),
    dcc.Dropdown(
        id='dropdown',
        options=[
            {'label': metric.title().replace('_', ' '), 'value': metric}
            for metric in ('video_views_for_the_last_30_days',
                           'subscribers_for_last_30_days')
        ],
        value='video_views_for_the_last_30_days',
    ),
    dcc.Graph(id='visual1'),
    dcc.Graph(id='visual2'),
])


@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value'))
def plot(variable):
    """Build the per-category average and total bar charts for `variable`.

    Parameters
    ----------
    variable : str or None
        One of the two last-30-days columns; None when the dropdown is
        cleared.

    Returns
    -------
    tuple of plotly.graph_objects.Figure
        (averages figure, totals figure), in the order of the two Outputs.

    Raises
    ------
    PreventUpdate
        When no metric is selected.
    """
    if variable is None:
        raise PreventUpdate

    label = variable.title().replace('_', ' ')
    # Aggregate only the selected column. Calling mean()/sum() on the whole
    # frame also processes the text columns, which mean() rejects in
    # pandas >= 2.0.
    per_category = data.groupby('category', as_index=False)[variable]
    averages = per_category.mean().sort_values(variable, ascending=False)
    totals = per_category.sum().sort_values(variable, ascending=False)

    fig1 = px.bar(
        averages,
        x=variable,
        y='category',
        title=f"Avg {label}",
        color='category',
        labels={'category': 'Category', variable: label},
    )
    fig1.update_traces(showlegend=False)
    fig1.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=500,
    )

    fig2 = px.bar(
        totals,
        x=variable,
        y='category',
        title=f"Total {label}",
        color='category',
        labels={'category': 'Category', variable: label},
    )
    fig2.update_traces(showlegend=False)
    fig2.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=500,
    )

    return fig1, fig2


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8070)

In [None]:
# Average and total of the last-30-days metrics per channel type.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Channel Type Selection:'),
    dcc.Dropdown(
        id='dropdown',
        options=[
            {'label': metric.title().replace('_', ' '), 'value': metric}
            for metric in ('video_views_for_the_last_30_days',
                           'subscribers_for_last_30_days')
        ],
        value='video_views_for_the_last_30_days',
    ),
    dcc.Graph(id='visual1'),
    dcc.Graph(id='visual2'),
])


@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value'))
def plot(variable):
    """Build the per-channel-type average and total bar charts.

    Parameters
    ----------
    variable : str or None
        One of the two last-30-days columns; None when the dropdown is
        cleared.

    Returns
    -------
    tuple of plotly.graph_objects.Figure
        (averages figure, totals figure), in the order of the two Outputs.

    Raises
    ------
    PreventUpdate
        When no metric is selected.
    """
    if variable is None:
        raise PreventUpdate

    label = variable.title().replace('_', ' ')
    # Aggregate only the selected column. Calling mean()/sum() on the whole
    # frame also processes the text columns, which mean() rejects in
    # pandas >= 2.0.
    per_type = data.groupby('channel_type', as_index=False)[variable]
    averages = per_type.mean().sort_values(variable, ascending=False)
    totals = per_type.sum().sort_values(variable, ascending=False)

    fig1 = px.bar(
        averages,
        x=variable,
        y='channel_type',
        title=f"Avg {label}",
        color='channel_type',
        labels={'channel_type': 'Channel Type', variable: label},
    )
    fig1.update_traces(showlegend=False)
    fig1.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=500,
    )

    fig2 = px.bar(
        totals,
        x=variable,
        y='channel_type',
        title=f"Total {label}",
        color='channel_type',
        labels={'channel_type': 'Channel Type', variable: label},
    )
    fig2.update_traces(showlegend=False)
    fig2.update_layout(
        title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'},
        height=500,
    )

    return fig1, fig2


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8072)

In [None]:
# First five rows of the cleaned frame.
data.head(n=5)

In [None]:
#Histograms

In [None]:
# for x in data.columns:
#     if data[x].dtype != 'O':
#         sns.histplot(data[x])
#         plt.show()
    

In [None]:
# Histogram of a user-selected numeric column.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(
        id='Dropdown',
        # Numeric columns only; select_dtypes does not depend on how pandas
        # stores text columns (object vs. a dedicated string dtype).
        options=[
            {'label': column.title().replace('_', ' '), 'value': column}
            for column in data.select_dtypes(include='number').columns
        ],
        value='subscribers',
    ),
    dcc.Graph(id='visual'),
])


@app.callback(Output('visual', 'figure'), Input('Dropdown', 'value'))
def plot(variable):
    """Build a 30-bin histogram of `variable`.

    Parameters
    ----------
    variable : str or None
        Numeric column chosen in the dropdown; None when it is cleared.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    PreventUpdate
        When no column is selected, instead of failing on `None.title()`.
    """
    if variable is None:
        raise PreventUpdate

    label = variable.title().replace('_', ' ')

    fig = px.histogram(
        data,
        x=variable,
        title=f"{label}",
        nbins=30,
        labels={variable: label},
    )
    fig.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'})

    return fig


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8071)

In [None]:
#Correlation

In [None]:
# Pairwise correlation of the numeric columns. The text columns are removed
# first because DataFrame.corr() raises on them in pandas >= 2.0.
data.select_dtypes(include='number').corr()

In [None]:
# sns.lmplot(
#     data=data, 
#     x='subscribers', 
#     y='video views'
#     )

In [None]:
# Scatter plot of two user-selected numeric columns, with an optional
# trendline.
app = JupyterDash(__name__)

# Numeric columns are listed directly instead of computing a full correlation
# matrix twice just to read its column names (which also fails on the text
# columns in pandas >= 2.0).
numeric_options = [
    {'label': column.title().replace("_", " "), 'value': column}
    for column in data.select_dtypes(include='number').columns
]

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown1',
                 options=numeric_options,
                 value='subscribers'),
    dcc.Dropdown(id='dropdown2',
                 options=numeric_options,
                 value='video views'),
    html.Hr(),
    dcc.RadioItems(id='items',
                   options=['Trendline Off', 'Trendline On'],
                   value='Trendline Off'),
    dcc.Graph(id='visual'),
])


@app.callback(Output('visual', 'figure'),
              Input('dropdown1', 'value'),
              Input('dropdown2', 'value'),
              Input('items', 'value'))
def plot(x, y, switch):
    """Build a scatter plot of column `x` against column `y`.

    Parameters
    ----------
    x, y : str or None
        Numeric columns chosen in the two dropdowns; None when cleared.
    switch : str
        'Trendline On' adds an ordinary-least-squares trendline; any other
        value draws the points only.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    PreventUpdate
        When either column is missing, instead of failing on `None.title()`.
    """
    if x is None or y is None:
        raise PreventUpdate

    x_label = x.title().replace("_", " ")
    y_label = y.title().replace("_", " ")

    fig = px.scatter(
        data,
        x=x,
        y=y,
        # plotly express needs the statsmodels package for trendline='ols'.
        trendline=None if switch == 'Trendline Off' else 'ols',
        labels={x: x_label, y: y_label},
        title=f'{x_label} vs. {y_label}',
    )
    fig.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'})

    return fig


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8073)

In [None]:
# Annotated heatmap of the correlations between the numeric columns. The text
# columns are removed first because DataFrame.corr() raises on them in
# pandas >= 2.0.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Grid of pairwise plots for the columns of the cleaned frame.
sns.pairplot(data=data)