In [444]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


<img src="images/green-divider.png" style="width: 100%;" />


# Global YouTube Statistics 2023

This dataset can be openly accessed via [Kaggle.com](https://www.kaggle.com/datasets/nelgiriyewithana/global-youtube-statistics-2023/versions/1?resource=download)  
This datset is a collection of the most subscribed channels on youtube in 2023 it has comprehensive details on top creators' subscriber counts, video views, upload frequency, country of origin, earnings, and more.


In [445]:
df = pd.read_csv("Global YouTube Statistics.csv", encoding ="latin-1")

## Getting an overview

In [446]:
df.head().T

Unnamed: 0,0,1,2,3,4
rank,1,2,3,4,5
Youtuber,T-Series,YouTube Movies,MrBeast,Cocomelon - Nursery Rhymes,SET India
subscribers,245000000,170000000,166000000,162000000,159000000
video views,228000000000.0,0.0,28368841870.0,164000000000.0,148000000000.0
category,Music,Film & Animation,Entertainment,Education,Shows
Title,T-Series,youtubemovies,MrBeast,Cocomelon - Nursery Rhymes,SET India
uploads,20082,1,741,966,116536
Country,India,United States,United States,United States,India
Abbreviation,IN,US,US,US,IN
channel_type,Music,Games,Entertainment,Education,Entertainment


In [447]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rank,995.0,498.0,287.3761,1.0,249.5,498.0,746.5,995.0
subscribers,995.0,22982410.0,17526110.0,12300000.0,14500000.0,17700000.0,24600000.0,245000000.0
video views,995.0,11039540000.0,14110840000.0,0.0,4288145000.0,7760820000.0,13554700000.0,228000000000.0
uploads,995.0,9187.126,34151.35,0.0,194.5,729.0,2667.5,301308.0
video_views_rank,994.0,554248.9,1362782.0,1.0,323.0,915.5,3584.5,4057944.0
country_rank,879.0,386.0535,1232.245,1.0,11.0,51.0,123.0,7741.0
channel_type_rank,962.0,745.7193,1944.387,1.0,27.0,65.5,139.75,7741.0
video_views_for_the_last_30_days,939.0,175610300.0,416378200.0,1.0,20137500.0,64085000.0,168826500.0,6589000000.0
lowest_monthly_earnings,995.0,36886.15,71858.72,0.0,2700.0,13300.0,37900.0,850900.0
highest_monthly_earnings,995.0,589807.8,1148622.0,0.0,43500.0,212700.0,606800.0,13600000.0


In [448]:
df.dtypes

rank                                         int64
Youtuber                                    object
subscribers                                  int64
video views                                float64
category                                    object
Title                                       object
uploads                                      int64
Country                                     object
Abbreviation                                object
channel_type                                object
video_views_rank                           float64
country_rank                               float64
channel_type_rank                          float64
video_views_for_the_last_30_days           float64
lowest_monthly_earnings                    float64
highest_monthly_earnings                   float64
lowest_yearly_earnings                     float64
highest_yearly_earnings                    float64
subscribers_for_last_30_days               float64
created_year                   


<img src="images/green-divider.png" style="width: 100%;" />


### Subscribers count 

Since this database holds the most subscribed youtube channels the minimal value for subscribers is in the millions.  
For ease of readbility I tranform the Series to multiples of 1 million. 

In [449]:
# original data description
display(df["subscribers"].describe())

# transform subscribers Series
df["subscribers_mil"] = df["subscribers"]/1_000_000

# display newly calculated Series
display(df["subscribers_mil"].describe())

count    9.950000e+02
mean     2.298241e+07
std      1.752611e+07
min      1.230000e+07
25%      1.450000e+07
50%      1.770000e+07
75%      2.460000e+07
max      2.450000e+08
Name: subscribers, dtype: float64

count    995.000000
mean      22.982412
std       17.526105
min       12.300000
25%       14.500000
50%       17.700000
75%       24.600000
max      245.000000
Name: subscribers_mil, dtype: float64

In [450]:
# Define a custom formatting function
def format_scientific(x):
    return f"{x:.0e}"

# Apply the formatting function to the Series
df["subscribers"] = df["subscribers"].apply(format_scientific)

display(df["subscribers"].describe())

display(df["subscribers"].dtype)

count       995
unique       11
top       2e+07
freq        453
Name: subscribers, dtype: object

dtype('O')


<img src="images/green-divider.png" style="width: 100%;" />


### Video Views
Transform Video views count to int64.  
For ease of readbility I tranform the Series to multiples of 1 million. 

In [451]:
# original data description
display(df["video views"].describe())

# transform subscribers Series
df["video views mil"] = (df["video views"]/1_000_000).round().astype('int64')

# display newly calculated Series
display(df["video views mil"].describe())

# Notice the discrepancy of dtypes reported with .df.describe() and df.dtype
# The .describe() method provides summary statistics for a Series, and it reports the data type as part of the summary. However, this reporting is based on the original data type of the Series before any transformations. In this case, the "video views" column originally had a float64 data type, and .describe() retains this information in its summary.

display(df["video views mil"].dtype)

count    9.950000e+02
mean     1.103954e+10
std      1.411084e+10
min      0.000000e+00
25%      4.288145e+09
50%      7.760820e+09
75%      1.355470e+10
max      2.280000e+11
Name: video views, dtype: float64

count       995.000000
mean      11039.540704
std       14110.846997
min           0.000000
25%        4288.500000
50%        7761.000000
75%       13555.000000
max      228000.000000
Name: video views mil, dtype: float64

dtype('int64')

### Identify faulty entries (video views)

The minimal video views is 0, which seems not feasable for the most subscribed channels.  
By identifiying the channels with the lowest video views, we can get more information.  

The 10 channels with the lowest video views seem to be either youtube specific channels, that have different system of counting views or errors in the database.  
Youtube Movies for example is a service to buy and rent movies, which are not openly accessable like normal youtube videos.  
"Popular on YouTube" and "Minecraft - Topic" are not independet channels, but rather platforms that point to other channels within their topics scope.  

Other channels seem to be errors in the database, as the channels "Happy Lives" and "ýýýýýý" with rather low video views (below 1 million) can not be found on youtube as of September 2023.

In [452]:
# Identify channels with lowest video views count

display(df.nsmallest(15, 'video views mil'))

# Identify channels with NaN video views count
display(df[df['video views'].isna()])

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
1,2,YouTube Movies,200000000.0,0.0,Film & Animation,youtubemovies,1,United States,US,Games,...,Mar,5.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,170.0,0
5,6,Music,100000000.0,0.0,,Music,0,,,Music,...,Sep,24.0,,,,,,,119.0,0
12,13,Gaming,90000000.0,0.0,,Gaming,0,,,Games,...,Dec,15.0,,,,,,,93.6,0
18,19,Sports,80000000.0,0.0,,sports,3,United States,US,Entertainment,...,Jan,30.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,75.0,0
102,103,News,40000000.0,0.0,,News,0,,,,...,Sep,9.0,,,,,,,36.3,0
173,174,Popular on YouTube,30000000.0,0.0,,Popular on Youtube,3,,,Education,...,Aug,7.0,,,,,,,29.3,0
286,287,Happy Lives,20000000.0,2634.0,Science & Technology,Happy Lives,1,United States,US,Entertainment,...,Mar,15.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,23.2,0
360,361,Minecraft - Topic,20000000.0,0.0,,Minecraft - Topic,0,,,Games,...,Dec,20.0,,,,,,,20.9,0
592,593,Live,20000000.0,0.0,,Live,0,,,,...,Jan,13.0,,,,,,,16.1,0
700,701,ýýýýýý,10000000.0,439098.0,People & Blogs,ýýýýýý,1,Russia,RU,News,...,Aug,11.0,81.9,144373500.0,4.59,107683889.0,61.52401,105.318756,14.9,0


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil


### Remove faulty entries (video views)  
As the two channels mentioned above are not accessable on Youtube I'll remove them from the dataframe.

In [453]:
# Drop faulty channels by their index
df.drop([286, 700], inplace=True)

# Show that the specific indices have been dropped from the dataframe
display(df.iloc[285: 287])
display(df.iloc[698: 700])

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
285,286,Sesame Street,20000000.0,22471360000.0,Entertainment,Sesame Street,3657,United States,US,Entertainment,...,Jan,16.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,23.3,22471
287,288,Lady Gaga,20000000.0,15751660000.0,People & Blogs,Lady Gaga,172,United States,US,Music,...,May,15.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,23.2,15752


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
699,700,ZutiGang,10000000.0,15913320000.0,People & Blogs,ZutiGang,1363,,,People,...,Mar,5.0,,,,,,,14.9,15913
701,702,TazerCraft,10000000.0,5956194000.0,Gaming,TazerCraft,4175,Brazil,BR,Games,...,Jul,13.0,51.3,212559417.0,12.08,183241641.0,-14.235004,-51.92528,14.9,5956



<img src="images/green-divider.png" style="width: 100%;" />


### Category

In [454]:
# How many channels fall under each category?
display(df["category"].value_counts())

# How many channels don't list a category?
display(df["category"].isna().sum())

category
Entertainment            241
Music                    202
People & Blogs           131
Gaming                    94
Comedy                    69
Film & Animation          46
Education                 45
Howto & Style             40
News & Politics           26
Science & Technology      16
Shows                     13
Sports                    11
Pets & Animals             4
Trailers                   2
Nonprofits & Activism      2
Movies                     2
Autos & Vehicles           2
Travel & Events            1
Name: count, dtype: int64

46

If you the reader, want to explore the dataset, you can change "Sports" below to any of the categories listed above and explore.  
If instead you are interested in the channels without a proper category, remove the # symbol preceeding the last line.

In [455]:
# Display channels by specific category
df.loc[df["category"] =="Sports"]

# Display channels without a category
#df.loc[df["category"].isna()]

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
11,12,WWE,100000000.0,77428470000.0,Sports,WWE,70127,United States,US,Sports,...,May,11.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,96.0,77428
28,29,Dude Perfect,60000000.0,16241550000.0,Sports,Dude Perfect,389,United States,US,Sports,...,Mar,17.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,59.5,16242
368,369,NBA,20000000.0,12624880000.0,Sports,NBA,47926,United States,US,Sports,...,Nov,21.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,20.7,12625
423,424,FIFA,20000000.0,5529132000.0,Sports,FIFA,10728,Switzerland,CH,Sports,...,Sep,6.0,59.6,8574832.0,4.58,6332428.0,46.818188,8.227512,19.4,5529
478,479,How Ridiculous,20000000.0,9601137000.0,Sports,How Ridiculous,650,Australia,AU,Sports,...,Sep,9.0,113.1,25766605.0,5.27,21844756.0,-25.274398,133.775136,18.0,9601
567,568,UFC - Ultimate Fighting Championship,20000000.0,7135821000.0,Sports,UFC - Ultimate Fighting Championship,14662,United States,US,Sports,...,Mar,4.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,16.4,7136
646,647,FC Barcelona,20000000.0,2656528000.0,Sports,FC Barcelona,10988,Spain,ES,Sports,...,Feb,6.0,88.9,47076781.0,13.96,37927409.0,40.463667,-3.74922,15.3,2657
790,791,F2Freestylers - Ultimate Soccer Skills Channel,10000000.0,3280482000.0,Sports,F2Freestylers - Ultimate Soccer Skills Channï¿½,777,United Kingdom,GB,Sports,...,Apr,9.0,60.0,66834405.0,3.85,55908316.0,55.378051,-3.435973,14.1,3280
833,834,DALLMYD,10000000.0,1948926000.0,Sports,DALLMYD,412,United States,US,Sports,...,Feb,1.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,13.6,1949
913,914,gymvirtual,10000000.0,2509753000.0,Sports,gymvirtual,1572,Spain,ES,Sports,...,Nov,1.0,88.9,47076781.0,13.96,37927409.0,40.463667,-3.74922,12.9,2510



<img src="images/green-divider.png" style="width: 100%;" />


### Uploads


In [456]:
# Display the channels with the most uploads
display(df.nlargest(10, 'uploads'))

# It seems that news & politics channels upload the most videos

# Display the channels with the fewest uploads
display(df.nsmallest(10, 'uploads'))

# There are a few channels listed with 0 uploads, but non-zero video views. This doesn't add up.

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
95,96,ABP NEWS,40000000.0,13102610000.0,People & Blogs,ABP NEWS,301308,India,IN,News,...,Jun,1.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,37.0,13103
857,858,GMA Integrated News,10000000.0,9569815000.0,News & Politics,GMA Integrated News,296272,Philippines,PH,News,...,Oct,29.0,35.5,108116600.0,2.15,50975903.0,12.879721,121.774017,13.4,9570
747,748,TV9 Bharatvarsh,10000000.0,10303520000.0,People & Blogs,TV9 Bharatvarsh,293516,India,IN,News,...,Nov,19.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,14.5,10304
33,34,Aaj Tak,60000000.0,25307750000.0,News & Politics,Aaj Tak,283775,India,IN,News,...,Aug,27.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,57.6,25308
107,108,IndiaTV,40000000.0,16105020000.0,News & Politics,IndiaTV,273255,India,IN,News,...,Aug,26.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,35.5,16105
689,690,KOMPASTV,20000000.0,11827310000.0,News & Politics,KOMPASTV,269050,Indonesia,ID,News,...,Aug,23.0,36.3,270203900.0,4.69,151509724.0,-0.789275,113.921327,15.0,11827
586,587,Thairath Online,20000000.0,14563840000.0,News & Politics,Thairath Online,244899,Thailand,TH,News,...,Sep,27.0,49.3,69625580.0,0.75,35294600.0,15.870032,100.992541,16.2,14564
502,503,News 24,20000000.0,8396876000.0,News & Politics,24 ï¿½ï¿½ï¿½ï,211620,Ukraine,UA,News,...,Feb,5.0,82.7,44385160.0,8.88,30835699.0,48.379433,31.16558,17.7,8397
673,674,ABS-CBN News,20000000.0,10489370000.0,News & Politics,ABS-CBN News,209520,Philippines,PH,News,...,Oct,22.0,35.5,108116600.0,2.15,50975903.0,12.879721,121.774017,15.1,10489
84,85,TEDx Talks,40000000.0,7339333000.0,Nonprofits & Activism,TEDx Talks,200933,United States,US,Nonprofit,...,Jun,23.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,38.6,7339


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
5,6,Music,100000000.0,0.0,,Music,0,,,Music,...,Sep,24.0,,,,,,,119.0,0
12,13,Gaming,90000000.0,0.0,,Gaming,0,,,Games,...,Dec,15.0,,,,,,,93.6,0
57,58,BRIGHT SIDE,40000000.0,10708530000.0,Howto & Style,brightside,0,,,,...,Nov,18.0,,,,,,,44.5,10709
73,74,Luisito Comunica,40000000.0,8670474000.0,Comedy,Luis Arturo Villar Sudek,0,Mexico,MX,Comedy,...,Jun,18.0,40.2,126014024.0,3.42,102626859.0,23.634501,-102.552784,40.6,8670
102,103,News,40000000.0,0.0,,News,0,,,,...,Sep,9.0,,,,,,,36.3,0
113,114,T-Series Apna Punjab,30000000.0,21306320000.0,Music,T- Series Apna Punjab,0,,,News,...,Jul,9.0,,,,,,,34.6,21306
149,150,Luis Fonsi,30000000.0,15176760000.0,Entertainment,luisfonsi,0,,,,...,Mar,31.0,,,,,,,31.4,15177
166,167,Frost Diamond,30000000.0,7277494000.0,Gaming,frostdiamond,0,,,,...,Aug,10.0,,,,,,,30.1,7277
180,181,Aditya Music India,30000000.0,25857990000.0,Music,Aditya Music,0,,,Music,...,Dec,16.0,,,,,,,28.5,25858
190,191,Sandeep Maheshwari,30000000.0,2303069000.0,People & Blogs,Sandeepmaheshwari,0,Singapore,SG,,...,Jan,2.0,84.8,5703569.0,4.11,5703569.0,1.352083,103.819836,27.8,2303


There seems to be an error in the database with the uploads Series, that can't be easily resolved by calculation of other Series.  
As such, I'll change the uploads column for channels with non-zero video views to -1 to indicate the discrepancy, while keeping the int64 datatype of the Series.

In [457]:
# Original datatype of "uploads" Series
display(df["uploads"].dtype)

# Change "uploads" from 0 to -1
df.loc[df["uploads"] == 0, "uploads"] = -1

# Descriptive statistics of "uploads" Series
display(df["uploads"].describe())

# Display the channels with exactly -1 uploads
display(df.loc[df["uploads"] == -1])

dtype('int64')

count       993.000000
mean       9205.584089
std       34183.279928
min          -1.000000
25%         196.000000
50%         733.000000
75%        2717.000000
max      301308.000000
Name: uploads, dtype: float64

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
5,6,Music,100000000.0,0.0,,Music,-1,,,Music,...,Sep,24.0,,,,,,,119.0,0
12,13,Gaming,90000000.0,0.0,,Gaming,-1,,,Games,...,Dec,15.0,,,,,,,93.6,0
57,58,BRIGHT SIDE,40000000.0,10708530000.0,Howto & Style,brightside,-1,,,,...,Nov,18.0,,,,,,,44.5,10709
73,74,Luisito Comunica,40000000.0,8670474000.0,Comedy,Luis Arturo Villar Sudek,-1,Mexico,MX,Comedy,...,Jun,18.0,40.2,126014000.0,3.42,102626859.0,23.634501,-102.552784,40.6,8670
102,103,News,40000000.0,0.0,,News,-1,,,,...,Sep,9.0,,,,,,,36.3,0
113,114,T-Series Apna Punjab,30000000.0,21306320000.0,Music,T- Series Apna Punjab,-1,,,News,...,Jul,9.0,,,,,,,34.6,21306
149,150,Luis Fonsi,30000000.0,15176760000.0,Entertainment,luisfonsi,-1,,,,...,Mar,31.0,,,,,,,31.4,15177
166,167,Frost Diamond,30000000.0,7277494000.0,Gaming,frostdiamond,-1,,,,...,Aug,10.0,,,,,,,30.1,7277
180,181,Aditya Music India,30000000.0,25857990000.0,Music,Aditya Music,-1,,,Music,...,Dec,16.0,,,,,,,28.5,25858
190,191,Sandeep Maheshwari,30000000.0,2303069000.0,People & Blogs,Sandeepmaheshwari,-1,Singapore,SG,,...,Jan,2.0,84.8,5703569.0,4.11,5703569.0,1.352083,103.819836,27.8,2303



<img src="images/green-divider.png" style="width: 100%;" />


### Country

In [458]:
# Display the top 5 countries with the most registered Youtube channels
display(df["Country"].value_counts().head())

# How many channels don't have an entry in the "Country" column?
display(df["Country"].isna().sum())

Country
United States     312
India             168
Brazil             62
United Kingdom     43
Mexico             33
Name: count, dtype: int64

122


<img src="images/green-divider.png" style="width: 100%;" />


### channel_type

In [459]:
# Display the different types of channels listed
display(df["channel_type"].value_counts())

channel_type
Entertainment    303
Music            216
People           101
Games             98
Comedy            51
Education         49
Film              42
Howto             36
News              29
Tech              17
Sports            13
Autos              3
Animals            3
Nonprofit          2
Name: count, dtype: int64


<img src="images/green-divider.png" style="width: 100%;" />


### Video views rank

In [460]:
# Are there NaN values in video views rank?
display(df["video_views_rank"].isna().sum())

# Identify the channel without a ranking
display(df.loc[df["video_views_rank"].isna(), ["Title", "video_views_rank"]])

# Drop the LegendaFUNK channel
df.drop(735, inplace = True)

# Show that the specific index 735 has been dropped from the dataframe
df.iloc[732:734]

1

Unnamed: 0,Title,video_views_rank
735,LegendaFUNK,


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
734,735,Noman Official,10000000.0,5525774000.0,Comedy,Noman Official,560,India,IN,Film,...,Oct,28.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,14.6,5526
736,737,Like Nastya Stories,10000000.0,6944968000.0,Entertainment,Like Nastya Stories,479,United States,US,Entertainment,...,Sep,9.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,14.5,6945



<img src="images/green-divider.png" style="width: 100%;" />


### Country rank

In [461]:
# Are there NaN values in country ranks?
display(df["country_rank"].isna().sum())

# Identify the channel without a ranking
display(df.loc[df["country_rank"].isna(), ["Title", "Country", "country_rank"]])

# Are there entries with a country but without a ranking?
display(df.loc[(df["Country"].notna()) &
               (df["country_rank"].isna())])

# No, the missing "country_rank" stems solely from the missing entry in the "Country" Series.

115

Unnamed: 0,Title,Country,country_rank
5,Music,,
12,Gaming,,
14,goldmines,,
38,LooLoo Kids - Nursery Rhymes and Children's ï¿½,,
48,badabun,,
...,...,...,...
958,Troom Troom PT,,
967,TROOM TROOM INDONESIA,,
972,Hero Movies 2023,,
985,TKoR,,


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil



<img src="images/green-divider.png" style="width: 100%;" />


### channel_type_rank

In [462]:
# Statistical description of the channel_type_rank column
display(df["channel_type_rank"].describe())

# Are there NaN entries in the channel_type_rank column?
display(df["channel_type_rank"].isna().sum())

# Are there entries with a channel type but without a channel_type_rank?
display(df.loc[(df["channel_type"].notna()) &
               (df["channel_type_rank"].isna()),
               ["Title", "channel_type", "channel_type_rank"]])

# Yes, there are 2 channels from the Music and Comedy type, that don't have a corresponding channel_type_rank.
# Let's drop these 2 channels

df.drop([5, 73], inplace=True)

# Show that the specific indices have been dropped from the dataframe
display(df.iloc[4:6])
display(df.iloc[71:73])


count     960.000000
mean      747.170833
std      1946.151925
min         1.000000
25%        27.000000
50%        65.500000
75%       140.000000
max      7741.000000
Name: channel_type_rank, dtype: float64

32

Unnamed: 0,Title,channel_type,channel_type_rank
5,Music,Music,
73,Luis Arturo Villar Sudek,Comedy,


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
4,5,SET India,200000000.0,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,159.0,148000
6,7,ýýý Kids Diana Show,100000000.0,93247040000.0,People & Blogs,ýýý Kids Diana Show,1111,United States,US,Entertainment,...,May,12.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,112.0,93247


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,subscribers_mil,video views mil
72,73,Little Baby Bum - Nursery Rhymes & Kids Songs,40000000.0,39450820000.0,Education,Little Baby Bum - Nursery Rhymes & Kids Songs,2423,United States,US,Education,...,Jun,22.0,88.2,328239523.0,14.7,270663028.0,37.09024,-95.712891,40.9,39451
74,75,elrubiusOMG,40000000.0,7410537000.0,Gaming,elrubiusOMG,703,,,Entertainment,...,Dec,20.0,,,,,,,40.4,7411
