# YouTube Top Trending Video Analysis

Analysis performed by Harleen Kaur and Amelia Meyer

## Packages

In [8]:
# load packages
import pandas as pd
import numpy as np
import altair as alt
import random

In [9]:
# Uncomment and run once if the wordcloud package is not installed in this kernel's environment:
# %pip install wordcloud



In [10]:
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

## Read in & Tidy Data

In [11]:
# Load the raw US trending-videos table from the course project directory.
# Alternative Windows locations previously listed in this cell:
#   C:\Users\ameyer\Documents\PSTAT100FinalProject\US_youtube_trending_data.csv
#   C:\Users\candy\Documents\PSTAT100FinalProject\US_youtube_trending_data.csv
US_youtube_trending_data = pd.read_csv(
    filepath_or_buffer='/home/jovyan/pstat100-w22-content/project/US_youtube_trending_data.csv'
)

In [12]:
# Load the JSON file that carries the categoryId information.
# Alternative Windows locations previously listed in this cell:
#   C:\Users\ameyer\Documents\PSTAT100FinalProject\US_category_id.json
#   C:\Users\candy\Documents\PSTAT100FinalProject\US_category_id.json
US_category_id = pd.read_json(
    path_or_buf='/home/jovyan/pstat100-w22-content/project/US_category_id.json'
)

In [13]:
# Save the category frame (loaded from JSON) as a CSV in the working directory.
US_category_id.to_csv(path_or_buf='US_category_id.csv')

In [14]:
# Preview: first four rows of the raw trending data.
US_youtube_trending_data.head(n=4)

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...
3,kXLn3HkpjaA,XXL 2020 Freshman Class Revealed - Official An...,2020-08-11T16:38:55Z,UCbg_UMjlHJg_19SZckaKajg,XXL,10,2020-08-12T00:00:00Z,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,https://i.ytimg.com/vi/kXLn3HkpjaA/default.jpg,False,False,Subscribe to XXL → http://bit.ly/subscribe-xxl...


In [15]:
## DEFINE GENRES
# Lookup table from numeric YouTube categoryId to a human-readable genre name
# (ids and names gathered from the documentation, per the authors' note).
# Names carry no leading/trailing whitespace so they compare equal to the
# plain label when filtering, grouping, or labelling charts.
genres = {
    1: 'Film & Animation',
    2: 'Autos & Vehicles',
    10: 'Music',
    15: 'Pets & Animals',
    17: 'Sports',
    18: 'Short Movies',
    19: 'Travel & Events',
    20: 'Gaming',
    21: 'Videoblogging',
    22: 'People & Blogs',
    23: 'Comedy',
    24: 'Entertainment',
    25: 'News & Politics',
    26: 'Howto & Style',
    27: 'Education',
    28: 'Science & Technology',
    29: 'Nonprofits & Activism',
    30: 'Movies',
    31: 'Anime/Animation',
}

In [16]:
# Swap each numeric categoryId for its genre name using the `genres` lookup;
# `replace` leaves any value that is not a key of the dict untouched.
US_youtube_trending_data['categoryId'] = (
    US_youtube_trending_data['categoryId'].replace(to_replace=genres)
)

In [17]:
# Remove the identifier, thumbnail link and free-text description columns.
US_youtube_trending_data = US_youtube_trending_data.drop(
    ['video_id', 'channelId', 'thumbnail_link', 'description'],
    axis=1,
)

In [18]:
# Normalise the two date columns and derive year / month columns from them.
#
# For each source column the timestamps are parsed once; the column itself is
# then stored as a 'YYYY-MM-DD' string, and the numeric year and month are
# taken from the parsed values directly (no re-parsing of the formatted
# strings). New columns are created in the same order as before:
#   publishedAt   -> PublishedYear, PublishedMonth
#   trending_date -> TrendingYear,  TrendingMonth
for date_col, prefix in (('publishedAt', 'Published'), ('trending_date', 'Trending')):
    parsed = pd.to_datetime(US_youtube_trending_data[date_col])

    US_youtube_trending_data[date_col] = parsed.dt.strftime('%Y-%m-%d')
    US_youtube_trending_data[prefix + 'Year'] = parsed.dt.year
    US_youtube_trending_data[prefix + 'Month'] = parsed.dt.month

In [19]:
# Give every column a snake_case name so the table uses one naming style.
# The frame is renamed in place (same object, new labels).
US_youtube_trending_data.rename(
    columns={
        "publishedAt": "date_published",
        "channelTitle": "channel_name",
        "categoryId": "category",
        "PublishedYear": "year_published",
        "PublishedMonth": "month_published",
        "TrendingYear": "year_trending",
        "TrendingMonth": "month_trending",
    },
    inplace=True,
)

# Preview: first three rows after renaming.
US_youtube_trending_data.head(n=3)

Unnamed: 0,title,date_published,channel_name,category,trending_date,tags,view_count,likes,dislikes,comment_count,comments_disabled,ratings_disabled,year_published,month_published,year_trending,month_trending
0,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11,Brawadis,People & Blogs,2020-08-12,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,False,False,2020,8,2020,8
1,Apex Legends | Stories from the Outlands – “Th...,2020-08-11,Apex Legends,Gaming,2020-08-12,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,False,False,2020,8,2020,8
2,I left youtube for a month and THIS is what ha...,2020-08-11,jacksepticeye,Entertainment,2020-08-12,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,False,False,2020,8,2020,8


In [20]:
# Write the tidy table to 'trending.csv' in the current working directory.
# The row index is written as well (pandas default), as an unnamed first column.
US_youtube_trending_data.to_csv(path_or_buf='trending.csv')

In [21]:
# Reload the tidy data from the file written by the export cell.
#
# The export writes to the relative path 'trending.csv', so the same relative
# path is read back here. This guarantees the reload sees the file produced by
# this run, whichever machine or working directory the notebook is run from
# (a hard-coded absolute path only matched when the notebook happened to run
# inside that exact folder).
trending = (
    pd.read_csv('trending.csv')
    # the export stored the row index as an unnamed first column; discard it
    .drop(columns=['Unnamed: 0'])
)

# print a few example rows of dataset in tidy format
trending.head(4)

Unnamed: 0,title,date_published,channel_name,category,trending_date,tags,view_count,likes,dislikes,comment_count,comments_disabled,ratings_disabled,year_published,month_published,year_trending,month_trending
0,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11,Brawadis,People & Blogs,2020-08-12,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,False,False,2020,8,2020,8
1,Apex Legends | Stories from the Outlands – “Th...,2020-08-11,Apex Legends,Gaming,2020-08-12,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,False,False,2020,8,2020,8
2,I left youtube for a month and THIS is what ha...,2020-08-11,jacksepticeye,Entertainment,2020-08-12,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,False,False,2020,8,2020,8
3,XXL 2020 Freshman Class Revealed - Official An...,2020-08-11,XXL,Music,2020-08-12,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,False,False,2020,8,2020,8


In [22]:
# Count the missing (NaN) entries in each column of the tidy table.
trending.isna().sum(axis=0)

title                0
date_published       0
channel_name         0
category             0
trending_date        0
tags                 0
view_count           0
likes                0
dislikes             0
comment_count        0
comments_disabled    0
ratings_disabled     0
year_published       0
month_published      0
year_trending        0
month_trending       0
dtype: int64