In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
youtube_path = r'F:\DataSpell\Pandas training\data_sets\youtube.csv'
youtube = pd.read_csv(youtube_path, parse_dates=['publish_date'], date_format='%d/%m/%Y').dropna(how='all')
youtube.drop('index', axis=1, inplace=True)

for col in youtube.select_dtypes(include=['int', 'float']).columns:
  if youtube[col].dtype.kind == 'i':
    youtube[col] = pd.to_numeric(youtube[col], downcast='integer')
  else:
    youtube[col] = pd.to_numeric(youtube[col], downcast='float')

In [3]:
youtube.dtypes

video_id                          object
trending_date                     object
title                             object
channel_title                     object
category_id                         int8
publish_date              datetime64[ns]
time_frame                        object
published_day_of_week             object
publish_country                   object
tags                              object
views                              int32
likes                              int32
dislikes                           int32
comment_count                      int32
comments_disabled                   bool
ratings_disabled                    bool
video_error_or_removed              bool
dtype: object

# Calculate the mean, median, and standard deviation of views, likes, dislikes, and comment_count for each category_id.

In [4]:
category_id = youtube.groupby('category_id')
category_id['views'].agg(['mean', 'median', 'std'])
category_id['likes'].agg(['mean', 'median', 'std'])
category_id['dislikes'].agg(['mean', 'median', 'std'])
category_id['comment_count'].agg(['mean', 'median', 'std'])

Unnamed: 0_level_0,mean,median,std
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5077.721742,1272.0,9402.829082
2,1617.563063,451.5,2614.072126
10,17782.714296,3968.0,56791.69089
15,2949.77233,1121.0,5148.801581
17,2987.077698,540.0,9385.933782
19,1725.999009,1179.0,2098.161999
20,6216.750555,1297.0,14067.46937
22,4035.977444,562.0,16945.791456
23,5077.232147,1524.0,11911.465746
24,6173.731007,992.0,40740.255837


# 2. Find the video with the highest number of views per category.

In [5]:
# youtube.loc[youtube.groupby('category_id')['views'].idxmax()]
youtube.groupby('category_id')['views'].idxmax()
youtube.loc[youtube.groupby('category_id')['views'].idxmax()]

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_date,time_frame,published_day_of_week,publish_country,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed
39948,ulNswX3If6U,18.09.06,Selena Gomez - Back To You (Lyric Video),Selena Gomez,1,2018-05-10,16:00 to 16:59,Thursday,US,"selena gomez|""13 reasons why""""|""""back to you""""...",54863912,922355,18037,41774,False,False,False
17962,cqg5oc20nxk,18.13.02,Official Ram Trucks Super Bowl Commercial | Ic...,Ram Trucks,2,2018-02-05,0:00 to 0:59,Monday,US,"Ram Trucks Commercial|""Ram Super Bowl Commerci...",25244097,6971,1301,875,False,False,False
69361,_I_D_8Z4sJE,18.07.04,Nicky Jam x J. Balvin - X (EQUIS) | Video Ofic...,NickyJamTV,10,2018-03-02,5:00 to 5:59,Friday,GB,"Bad Bunny|""Amorfoda""|""Dura""|""Daddy Yankee""|""Oz...",424538912,2818771,149275,99380,False,False,False
108621,VJB3N4O_6MA,18.15.04,é‡Žç”Ÿå‹•ç‰©ã®é©šãã¹ãçž¬é–“2018ï¼ãƒ©ã‚¤...,kien hoang,15,2018-04-13,5:00 to 5:59,Friday,FRANCE,[none],7220717,14954,9334,2083,False,False,False
15192,VRJmcxCrAOA,18.30.01,Real Life Trick Shots 2 | Dude Perfect,Dude Perfect,17,2018-01-22,22:00 to 22:59,Monday,US,"dude perfect|""dude perfect stereotypes""""|""""dud...",29090799,1017919,15066,51282,False,False,False
17573,yvicqMrAHvQ,18.11.02,Turkish Airlines - 5 Senses with Dr. Oz,Turkish Airlines,19,2018-02-04,23:00 to 23:59,Sunday,US,[none],23932421,4666,374,465,False,False,False
71110,HwkLbeEYz6E,18.23.04,Yodeling Walmart Kid EDM Remix (OFFICIAL AUDIO...,GalacticFusion,20,2018-04-03,1:00 to 1:59,Tuesday,GB,"walmart|""EDM""|""TRAP""|""REMIX""|""YODELING WALMART...",18158133,394974,7123,17171,False,False,False
64717,BhIEIO0vaBE,18.14.03,To Our Daughter,Kylie Jenner,22,2018-02-04,20:00 to 20:59,Sunday,GB,"Kylie Jenner|""Kylie""|""Travis Scott""|""Baby""|""An...",62338362,0,0,0,True,True,False
50278,wzKL-bQKcgA,17.30.12,Anitta & J Balvin - Downtown (Official Lyric V...,Lele Pons,23,2017-12-14,18:00 to 18:59,Thursday,GB,anitta j balvin downtown official lyric video ...,43460605,1253154,38693,58235,False,False,False
49138,FlsCjmMhFmw,17.24.12,YouTube Rewind: The Shape of 2017 | #YouTubeRe...,YouTube Spotlight,24,2017-12-06,17:00 to 17:59,Wednesday,GB,"Rewind|""Rewind 2017""|""youtube rewind 2017""|""#Y...",169884583,3312868,1753274,845233,False,False,False


In [6]:
youtube['category_id'].unique().size

18

# 3. Determine the average number of views for videos published on each day of the week.

In [7]:
# youtube.groupby('published_day_of_week')['views'].mean()
week_day = youtube['publish_date'].dt.day_name()
youtube.groupby(week_day)['views'].mean()

publish_date
Friday       3.755648e+06
Monday       1.887318e+06
Saturday     1.444060e+06
Sunday       2.110650e+06
Thursday     2.778910e+06
Tuesday      2.079608e+06
Wednesday    2.202413e+06
Name: views, dtype: float64

# 4. Extract the year and month from the trending_date and publish_date columns, then analyze the trend of videos getting into trending by month.

In [8]:
# year = youtube['publish_date'].dt.year
# month = youtube['publish_date'].dt.month
month = youtube['publish_date'].dt.to_period('M')
month
trending_videos = youtube.groupby(month).size()
trending_videos

publish_date
2006-07        1
2007-03        9
2007-06       12
2007-12       16
2008-01       12
           ...  
2018-02    23432
2018-03    24044
2018-04    18935
2018-05    23533
2018-06     6231
Freq: M, Length: 98, dtype: int64

In [9]:
# # Convert 'trending_date' to datetime if it's not
# youtube['trending_date'] = pd.to_datetime(youtube['trending_date'])
# 
# # Extract year and month from 'trending_date' and 'publish_date'
# youtube['trending_year_month'] = youtube['trending_date'].dt.to_period('M')
# youtube['publish_year_month'] = youtube['publish_date'].dt.to_period('M')
# 
# # Count the number of videos trending each month
# trending_videos = youtube.groupby('trending_year_month').size()
# 
# # Count the number of videos published each month
# published_videos = youtube.groupby('publish_year_month').size()
# 
# print("Number of videos trending each month:")
# print(trending_videos)
# 
# print("\nNumber of videos published each month:")
# print(published_videos)

# 5. Identify the top 10 channels with the highest average number of views.

In [10]:
youtube

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_date,time_frame,published_day_of_week,publish_country,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13,17:00 to 17:59,Monday,US,SHANtell martin,748374,57527,2966,15954,False,False,False
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13,7:00 to 7:59,Monday,US,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,False,False,False
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12,19:00 to 19:59,Sunday,US,"racist superman|""rudy""""|""""mancuso""""|""""king""""|""...",3191434,146033,5339,8181,False,False,False
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13,11:00 to 11:59,Monday,US,"rhett and link|""gmm""""|""""good mythical morning""...",343168,10172,666,2146,False,False,False
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12,18:00 to 18:59,Sunday,US,"ryan|""higa""""|""""higatv""""|""""nigahiga""""|""""i dare ...",2095731,132235,1989,17518,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161465,sGolxsMSGfQ,18.14.06,HOW2: How to Solve a Mystery,Annoying Orange,24,2018-06-13,18:00 to 18:59,Wednesday,CANADA,"annoying orange|""funny""|""fruit""|""talking""|""ani...",80685,1701,99,1312,False,False,False
161466,8HNuRNi8t70,18.14.06,Eli Lik Lik Episode 13 Partie 01,Elhiwar Ettounsi,24,2018-06-13,19:00 to 19:59,Wednesday,CANADA,"hkayet tounsia|""elhiwar ettounsi""|""denya okhra...",103339,460,66,51,False,False,False
161467,GWlKEM3m2EE,18.14.06,KINGDOM HEARTS III â€“ SQUARE ENIX E3 SHOWCASE...,Kingdom Hearts,20,2018-06-11,17:00 to 17:59,Monday,CANADA,"Kingdom Hearts|""KH3""|""Kingdom Hearts 3""|""Froze...",773347,25900,224,3881,False,False,False
161468,lbMKLzQ4cNQ,18.14.06,Trump Advisor Grovels To Trudeau,The Young Turks,25,2018-06-13,4:00 to 4:59,Wednesday,CANADA,"180612__TB02SorryExcuse|""News""|""Politics""|""The...",115225,2115,182,1672,False,False,False


In [11]:
youtube.groupby('channel_title')['views'].mean().sort_values(ascending=False).head(10)

channel_title
Bad Bunny              1.624488e+08
Flow La Movie          1.520128e+08
ChildishGambinoVEVO    1.428129e+08
NickyJamTV             9.400607e+07
BeckyGVEVO             9.024716e+07
DrakeVEVO              8.645280e+07
Ozuna                  8.550741e+07
ArianaGrandeVevo       6.727837e+07
SebastianYatraVEVO     6.629097e+07
YouTube Spotlight      6.497292e+07
Name: views, dtype: float64

# 6. For each category_id, find out the ratio of likes to dislikes and sort them in descending order.

In [12]:
like_dislike_ratio = youtube['likes'] / youtube['dislikes']
# youtube.insert(13, 'like dislike ratio', like_dislike_ratio)
youtube[['category_id', 'like dislike ratio']].sort_values(by='like dislike ratio', ascending=False)

KeyError: "['like dislike ratio'] not in index"

# 7. Create a new column that categorizes videos into 'High Engagement' and 'Low Engagement' based on the median value of the comment_count column.
of the comment_count column.

In [13]:
median = youtube['comment_count'].median() # 1144.0

def engagement_categorizing(row):
  comment = row.loc['comment_count']
  if median >= comment:
    return  'High Engagement'
  else:
    return  'Low Engagement'
  
youtube_engagement = youtube.apply(engagement_categorizing, axis=1) # -4 loc

youtube.insert(15, 'engagement', youtube_engagement)

In [14]:
# youtube.drop('engagement', axis=1, inplace=True)
youtube

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_date,time_frame,published_day_of_week,publish_country,tags,views,likes,dislikes,comment_count,comments_disabled,engagement,ratings_disabled,video_error_or_removed
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13,17:00 to 17:59,Monday,US,SHANtell martin,748374,57527,2966,15954,False,Low Engagement,False,False
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13,7:00 to 7:59,Monday,US,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,False,Low Engagement,False,False
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12,19:00 to 19:59,Sunday,US,"racist superman|""rudy""""|""""mancuso""""|""""king""""|""...",3191434,146033,5339,8181,False,Low Engagement,False,False
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13,11:00 to 11:59,Monday,US,"rhett and link|""gmm""""|""""good mythical morning""...",343168,10172,666,2146,False,Low Engagement,False,False
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12,18:00 to 18:59,Sunday,US,"ryan|""higa""""|""""higatv""""|""""nigahiga""""|""""i dare ...",2095731,132235,1989,17518,False,Low Engagement,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161465,sGolxsMSGfQ,18.14.06,HOW2: How to Solve a Mystery,Annoying Orange,24,2018-06-13,18:00 to 18:59,Wednesday,CANADA,"annoying orange|""funny""|""fruit""|""talking""|""ani...",80685,1701,99,1312,False,Low Engagement,False,False
161466,8HNuRNi8t70,18.14.06,Eli Lik Lik Episode 13 Partie 01,Elhiwar Ettounsi,24,2018-06-13,19:00 to 19:59,Wednesday,CANADA,"hkayet tounsia|""elhiwar ettounsi""|""denya okhra...",103339,460,66,51,False,High Engagement,False,False
161467,GWlKEM3m2EE,18.14.06,KINGDOM HEARTS III â€“ SQUARE ENIX E3 SHOWCASE...,Kingdom Hearts,20,2018-06-11,17:00 to 17:59,Monday,CANADA,"Kingdom Hearts|""KH3""|""Kingdom Hearts 3""|""Froze...",773347,25900,224,3881,False,Low Engagement,False,False
161468,lbMKLzQ4cNQ,18.14.06,Trump Advisor Grovels To Trudeau,The Young Turks,25,2018-06-13,4:00 to 4:59,Wednesday,CANADA,"180612__TB02SorryExcuse|""News""|""Politics""|""The...",115225,2115,182,1672,False,Low Engagement,False,False


# 8. Calculate the total views, likes, dislikes, and comments for each channel_title.

In [15]:
youtube.groupby('channel_title')[['likes', 'dislikes', 'comment_count']].sum()

Unnamed: 0_level_0,likes,dislikes,comment_count
channel_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
#AndresSTyle,21039,598,2364
#DÃ©bloquÃ©s,1933,111,18
#Mind Warehouse,536411,94787,336485
#NAME?,9049,647,1480
#SeekingTheTruth,3277,176,1066
...,...,...,...
í¬í¬í¬í¬,6058,207,1208
íŠ¹ì´í•œë™ë¬¼ì±„ë„,261140,3382,22741
íƒ€ìš°TV,4160,222,1000
í™œë ¥ì†ŒTV,1157,511,735


# 9. Determine which hour of the day (from publish_date) has the highest average number of views.

In [16]:
hour = youtube['publish_date'].dt.hour
max_avg_views_hour = youtube.groupby(hour)['views'].mean().idxmax()
max_avg_views_hour

0

In [17]:
youtube

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_date,time_frame,published_day_of_week,publish_country,tags,views,likes,dislikes,comment_count,comments_disabled,engagement,ratings_disabled,video_error_or_removed
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13,17:00 to 17:59,Monday,US,SHANtell martin,748374,57527,2966,15954,False,Low Engagement,False,False
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13,7:00 to 7:59,Monday,US,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,False,Low Engagement,False,False
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12,19:00 to 19:59,Sunday,US,"racist superman|""rudy""""|""""mancuso""""|""""king""""|""...",3191434,146033,5339,8181,False,Low Engagement,False,False
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13,11:00 to 11:59,Monday,US,"rhett and link|""gmm""""|""""good mythical morning""...",343168,10172,666,2146,False,Low Engagement,False,False
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12,18:00 to 18:59,Sunday,US,"ryan|""higa""""|""""higatv""""|""""nigahiga""""|""""i dare ...",2095731,132235,1989,17518,False,Low Engagement,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161465,sGolxsMSGfQ,18.14.06,HOW2: How to Solve a Mystery,Annoying Orange,24,2018-06-13,18:00 to 18:59,Wednesday,CANADA,"annoying orange|""funny""|""fruit""|""talking""|""ani...",80685,1701,99,1312,False,Low Engagement,False,False
161466,8HNuRNi8t70,18.14.06,Eli Lik Lik Episode 13 Partie 01,Elhiwar Ettounsi,24,2018-06-13,19:00 to 19:59,Wednesday,CANADA,"hkayet tounsia|""elhiwar ettounsi""|""denya okhra...",103339,460,66,51,False,High Engagement,False,False
161467,GWlKEM3m2EE,18.14.06,KINGDOM HEARTS III â€“ SQUARE ENIX E3 SHOWCASE...,Kingdom Hearts,20,2018-06-11,17:00 to 17:59,Monday,CANADA,"Kingdom Hearts|""KH3""|""Kingdom Hearts 3""|""Froze...",773347,25900,224,3881,False,Low Engagement,False,False
161468,lbMKLzQ4cNQ,18.14.06,Trump Advisor Grovels To Trudeau,The Young Turks,25,2018-06-13,4:00 to 4:59,Wednesday,CANADA,"180612__TB02SorryExcuse|""News""|""Politics""|""The...",115225,2115,182,1672,False,Low Engagement,False,False


# 10. Identify the top 5 tags associated with the highest average number of views.

In [26]:
tags = youtube['tags'].str.split('|').explode()
tags.reset_index(drop=True, inplace=True)
tags

0                             SHANtell martin
1          last week tonight trump presidency
2           "last week tonight donald trump""
3                       ""john oliver trump""
4                           ""donald trump"""
                          ...                
2907697                              "ç”·å‹"
2907698                              "å¥³å‹"
2907699                              "ç›¸è™•"
2907700                        "ææ€–æƒ…äºº"
2907701                              "æƒ…æ®º"
Name: tags, Length: 2907702, dtype: object

In [29]:
# Explode the tags
exploded_tags = youtube['tags'].str.split('|').explode()

# Join with original dataframe to get corresponding views
tag_views = youtube[['views']].join(exploded_tags)

# Group by tags and calculate mean views
# tag_mean_views = tag_views.groupby('tags')['views'].mean()
tag_mean_views = tag_views.groupby('tags')['views'].sum()

# Sort and take top 5
top_5_tags = tag_mean_views.sort_values(ascending=False).head(5)

top_5_tags

tags
"Pop"         28936747962
"Rap"         18729605556
"Records"     17073688470
"Ozuna"       16474937149
"Amorfoda"    16324177353
Name: views, dtype: int64

In [30]:
youtube

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_date,time_frame,published_day_of_week,publish_country,tags,views,likes,dislikes,comment_count,comments_disabled,engagement,ratings_disabled,video_error_or_removed
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13,17:00 to 17:59,Monday,US,SHANtell martin,748374,57527,2966,15954,False,Low Engagement,False,False
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13,7:00 to 7:59,Monday,US,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,False,Low Engagement,False,False
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12,19:00 to 19:59,Sunday,US,"racist superman|""rudy""""|""""mancuso""""|""""king""""|""...",3191434,146033,5339,8181,False,Low Engagement,False,False
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13,11:00 to 11:59,Monday,US,"rhett and link|""gmm""""|""""good mythical morning""...",343168,10172,666,2146,False,Low Engagement,False,False
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12,18:00 to 18:59,Sunday,US,"ryan|""higa""""|""""higatv""""|""""nigahiga""""|""""i dare ...",2095731,132235,1989,17518,False,Low Engagement,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161465,sGolxsMSGfQ,18.14.06,HOW2: How to Solve a Mystery,Annoying Orange,24,2018-06-13,18:00 to 18:59,Wednesday,CANADA,"annoying orange|""funny""|""fruit""|""talking""|""ani...",80685,1701,99,1312,False,Low Engagement,False,False
161466,8HNuRNi8t70,18.14.06,Eli Lik Lik Episode 13 Partie 01,Elhiwar Ettounsi,24,2018-06-13,19:00 to 19:59,Wednesday,CANADA,"hkayet tounsia|""elhiwar ettounsi""|""denya okhra...",103339,460,66,51,False,High Engagement,False,False
161467,GWlKEM3m2EE,18.14.06,KINGDOM HEARTS III â€“ SQUARE ENIX E3 SHOWCASE...,Kingdom Hearts,20,2018-06-11,17:00 to 17:59,Monday,CANADA,"Kingdom Hearts|""KH3""|""Kingdom Hearts 3""|""Froze...",773347,25900,224,3881,False,Low Engagement,False,False
161468,lbMKLzQ4cNQ,18.14.06,Trump Advisor Grovels To Trudeau,The Young Turks,25,2018-06-13,4:00 to 4:59,Wednesday,CANADA,"180612__TB02SorryExcuse|""News""|""Politics""|""The...",115225,2115,182,1672,False,Low Engagement,False,False
