In [21]:
# This is an EDA made by me on Kaggle, using the "Most Subscribed 1000 Youtube Channels" dataset (https://www.kaggle.com/datasets/themrityunjaypathak/most-subscribed-1000-youtube-channels).

# You can navigate through the original here: https://www.kaggle.com/code/agostinacapponi/top-subscribed-yt-channels-eda/notebook

import numpy as np
import pandas as pd 
import plotly as py
import plotly.express as px

# Exploratory Data Analysis

In [22]:
# Load the channel ranking CSV into `df`, the frame the later cells operate on.
csv_path = 'data/topSubscribed.csv'
df = pd.read_csv(csv_path)

In [23]:
# Preview the first rows (head() defaults to five) to check the columns and the raw value formats.
df.head()

Unnamed: 0,Rank,Youtube Channel,Subscribers,Video Views,Video Count,Category,Started
0,1,T-Series,234000000,212900271553,18515,Music,2006
1,2,YouTube Movies,161000000,0,0,Film & Animation,2015
2,3,Cocomelon - Nursery Rhymes,152000000,149084178448,846,Education,2006
3,4,SET India,150000000,137828094104,103200,Shows,2006
4,5,MrBeast,128000000,21549128785,733,Entertainment,2012


In [24]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Rank             1000 non-null   int64 
 1   Youtube Channel  1000 non-null   object
 2   Subscribers      1000 non-null   object
 3   Video Views      1000 non-null   object
 4   Video Count      1000 non-null   object
 5   Category         1000 non-null   object
 6   Started          1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [25]:
# Convert the comma-formatted count columns (e.g. "234,000,000") from strings to numbers.
# `.astype(str)` makes the cell safe to re-run: once a column is already numeric,
# a plain `x.replace(',', '')` per value would raise AttributeError on the ints.
# `regex=False` strips the literal comma without going through the regex engine.
strings = ['Subscribers', 'Video Views', 'Video Count']
for s in strings:
    df[s] = pd.to_numeric(df[s].astype(str).str.replace(',', '', regex=False))

In [26]:
# Show the full row of the channel with the largest 'Video Count'.
most_videos_pos = df['Video Count'].argmax()  # positional index of the maximum
df.iloc[most_videos_pos]

Rank                           821
Youtube Channel           GMA News
Subscribers               12800000
Video Views            11220255933
Video Count                 342802
Category           News & Politics
Started                       2007
Name: 820, dtype: object

In [27]:
# Show the full row of the channel with the largest 'Video Views'.
most_views_pos = df['Video Views'].argmax()  # positional index of the maximum
df.iloc[most_views_pos]

Rank                          1
Youtube Channel        T-Series
Subscribers           234000000
Video Views        212900271553
Video Count               18515
Category                  Music
Started                    2006
Name: 0, dtype: object

In [28]:
# Number of channels per category, most frequent first.
df.loc[:, 'Category'].value_counts()

Entertainment                                                                   238
Music                                                                           217
People & Blogs                                                                  132
Gaming                                                                           94
Comedy                                                                           68
Film & Animation                                                                 50
Education                                                                        45
Howto & Style                                                                    43
https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels     30
News & Politics                                                                  27
Science & Technology                                                             18
Shows                                                                       

In [29]:
# Rename the URL "category" to N/D (no data).
# The result is assigned back to the column instead of calling
# `df['Category'].replace(..., inplace=True)`: that chained in-place call acts on a
# column selection, warns in pandas 2.x, and leaves `df` unchanged under Copy-on-Write.
# The URL is matched as an exact literal value (no regex), so its '.' characters
# are not treated as wildcards.
df['Category'] = df['Category'].replace(
    "https://us.youtubers.me/global/all/top-1000-most_subscribed-youtube-channels",
    "N/D",
)

# General Statistics

In [30]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rank,1000.0,500.5,288.8194,1.0,250.75,500.5,750.25,1000.0
Subscribers,1000.0,21581400.0,16625560.0,11400000.0,13400000.0,16600000.0,23200000.0,234000000.0
Video Views,1000.0,9994912000.0,13005460000.0,0.0,3871470000.0,6723360000.0,12230520000.0,212900300000.0
Video Count,1000.0,9416.228,32190.91,0.0,365.5,896.0,3277.25,342802.0
Started,1000.0,2012.594,4.110238,1970.0,2010.0,2013.0,2015.0,2021.0


In [31]:
# Horizontal box plot of the subscriber counts.
px.box(
    data_frame=df,
    x="Subscribers",
    title="Subscribers",
)

In [32]:
# Horizontal box plot of the total video views per channel.
px.box(
    data_frame=df,
    x="Video Views",
    title="Video Views",
)

In [33]:
# Horizontal box plot of the number of videos per channel.
px.box(
    data_frame=df,
    x="Video Count",
    title="Video Count",
)

# Channels by category

In [34]:
def _count_by_category(frame):
    """Return a (Category, Count) frame built from the value counts of frame['Category']."""
    counts = frame['Category'].value_counts()
    return counts.to_frame('Count').rename_axis('Category').reset_index()


# Category counts for every channel, and for channels above 20M subscribers only.
byCategory = _count_by_category(df)
byCategory20m = _count_by_category(df[df['Subscribers'] > 20000000])

In [35]:
# Display the per-category channel counts (columns: Category, Count).
byCategory

Unnamed: 0,Category,Count
0,Entertainment,238
1,Music,217
2,People & Blogs,132
3,Gaming,94
4,Comedy,68
5,Film & Animation,50
6,Education,45
7,Howto & Style,43
8,N/D,30
9,News & Politics,27


In [36]:
# Pie chart of the category share across all channels.
# n is derived from the plotted counts rather than hard-coded as 1000, so the
# title stays correct if the dataset changes or is filtered.
n_channels = int(byCategory["Count"].sum())
title = f"Category of all channels (n={n_channels})"
px.pie(byCategory, values="Count", names="Category", title=title)

In [37]:
# Pie chart of the category share among channels with more than 20M subscribers;
# n is the number of such channels and is shown in the title.
over_20m = df['Subscribers'] > 20000000
n = df.loc[over_20m, 'Rank'].count()
title = f"Category on channels with over 20M subscribers (n={n})"
px.pie(
    data_frame=byCategory20m,
    values="Count",
    names="Category",
    title=title,
)

# Channels by subscriber count

In [38]:
# Distribution of subscriber counts across all channels, using nbins=25.
px.histogram(
    data_frame=df,
    x="Subscribers",
    nbins=25,
    title="Total subscribers count",
)

In [39]:
# One box per category (each coloured by category) comparing subscriber counts.
px.box(
    data_frame=df,
    x="Category",
    y="Subscribers",
    color="Category",
    title="Subscribers by category",
)

In [40]:
# Animated histogram of subscribers coloured by category, one frame per 'Started' year.
# Rows with Started <= 1970 are excluded, and frames are ordered by ascending year.
started_after_1970 = df.query('Started > 1970').sort_values('Started', ascending=True)
# The x range spans the min/max of the full frame (not the filtered one); y is fixed at 0-70.
subscriber_span = [df['Subscribers'].min(), df['Subscribers'].max()]
px.histogram(
    started_after_1970,
    x="Subscribers",
    color="Category",
    range_x=subscriber_span,
    range_y=[0, 70],
    animation_frame="Started",
    title="Subscribers by channel creation date",
)