# YouTube Top 1000 Channels Analysis

In this project, I scraped us.youtubers.me to get the latest data on the top 1,000 YouTube channels, then used Tableau to build a dashboard and analyze the data.

In [32]:
# Imports
import datetime
import smtplib
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [33]:
# Connect to the website and pull the ranking table into a DataFrame

URL = 'https://us.youtubers.me/global/all/top-1000-youtube-channels'

# Browser-like request headers sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# The timeout keeps the cell from hanging indefinitely if the site stops
# responding; raise_for_status() stops here on an HTTP error response instead
# of letting an error page be parsed as if it were the ranking.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

soup1 = BeautifulSoup(page.content, "html.parser")

# NOTE(review): the prettify() round-trip is kept from the original because it
# can change the whitespace between inline elements, and therefore the cell
# text pandas extracts. Confirm whether it is actually needed.
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

table = soup2.find_all('table')
if not table:
    raise ValueError(f"No <table> element found at {URL}; the page layout may have changed")

# Newer pandas versions deprecate passing literal HTML to read_html, so the
# markup is wrapped in a file-like StringIO. The first parsed table is used.
df = pd.read_html(StringIO(str(table)))[0]

## Data Cleaning

In [34]:
# Overview of the scraped table: size, dtypes and missing values
n_rows, n_cols = df.shape
print("There are", n_rows, "rows, and", n_cols, "columns in total")

# df.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

# Count nulls once and reuse the result for both the per-column and total figures.
null_counts = df.isnull().sum()
print(null_counts)
total_nulls = null_counts.sum()

# Name the affected columns from the data itself: the table is scraped live,
# so which columns contain gaps can change between runs.
cols_with_nulls = [str(col) for col in null_counts[null_counts > 0].index]
print("There are", total_nulls, "empty values, in column(s):", ", ".join(cols_with_nulls) or "none")

# Preview the first rows whose category is missing
display(df.loc[df["category"].isnull()].head())

There are 1000 rows, and 7 columns in total
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Youtuber     1000 non-null   object
 2   subscribers  1000 non-null   int64 
 3   video views  1000 non-null   int64 
 4   video count  1000 non-null   int64 
 5   category     973 non-null    object
 6   started      1000 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 54.8+ KB
None
rank            0
Youtuber        0
subscribers     0
video views     0
video count     0
category       27
started         0
dtype: int64
There are 27 empty values all in category column


Unnamed: 0,rank,Youtuber,subscribers,video views,video count,category,started
36,37,D Billions,25100000,28985817530,622,,2019
96,97,Vlad và Niki,34400000,21356095039,466,,2018
136,137,व्लाद और निकिता,29200000,17323157149,486,,2018
258,259,BETER BÖCÜK,22000000,12848513801,1753,,2012
333,334,Diana and Roma ARA,23400000,10637524293,552,,2019


In [35]:
# Impute the missing categories with the most common category
most_common_category = df['category'].mode()[0]

# Assign the filled column back instead of calling fillna(inplace=True) on
# df['category']: the in-place form operates on an intermediate Series, which
# emits a chained-assignment warning in pandas 2.x and leaves df unchanged
# when Copy-on-Write is enabled.
df['category'] = df['category'].fillna(most_common_category)

# Per-column null counts after imputation (displayed as the cell output)
df.isnull().sum()

rank           0
Youtuber       0
subscribers    0
video views    0
video count    0
category       0
started        0
dtype: int64

In [36]:
# Keep only categories with at least MIN_CHANNELS_PER_CATEGORY channels so the
# analysis is easier to read
MIN_CHANNELS_PER_CATEGORY = 10

# Number of channels per category (missing categories are not counted).
category_counts = df['category'].value_counts()
kept_categories = category_counts[category_counts >= MIN_CHANNELS_PER_CATEGORY].index.tolist()

# .copy() makes the filtered result an independent frame, so later in-place
# changes to df are not made on a slice of the original table.
df = df.loc[df['category'].isin(kept_categories)].copy()

In [37]:
# Final clean dataframe
# Reassign rather than drop in place, and ignore an already-missing column:
# the cell can then be re-run without a KeyError, and it never mutates the
# filtered frame it received.
df = df.drop(columns='rank', errors='ignore')
display(df)

Unnamed: 0,Youtuber,subscribers,video views,video count,category,started
0,T-Series,236000000,216280733169,18811,Music,2006
1,Cocomelon - Nursery Rhymes,154000000,152336550693,860,Education,2006
2,SET India,152000000,139952828932,105431,Shows,2006
3,Sony SAB,77400000,92784532407,64915,Shows,2007
4,✿ Kids Diana Show,108000000,88333687979,1068,People & Blogs,2015
...,...,...,...,...,...,...
995,Hola Amigos,10200000,5525837683,1204,People & Blogs,2019
996,MundoBitaVEVO,2430000,5523958544,37,Music,2013
997,Tyga,11300000,5522272543,160,Music,2009
998,Nick Jr. en Español,7960000,5519080616,994,Music,2021


In [38]:
# Export the cleaned data to a CSV file for use in Tableau Public.
# The file is written with pandas' default settings.
OUTPUT_CSV = 'Youtube_Data.csv'
df.to_csv(OUTPUT_CSV)