## Question 6 : Cluster the publish time into 10-minute intervals (e.g. from 02:20 to 02:30)

In [72]:
# Question 6 builds on the result of Question 1, and Question 10 builds on the
# result of Question 5, so this notebook starts from 'final_df.csv', a file
# received from a teammate. The file is larger than 100 MB, so it could not be
# uploaded to GitHub and has to be read from a local copy.
import os
import re
import pandas as pd

# Path of the local copy. Set the FINAL_DF_PATH environment variable to read
# the file from another location; the default is the original hard-coded path,
# so the notebook keeps working unchanged on the original machine.
FINAL_DF_PATH = os.environ.get('FINAL_DF_PATH', r'C:\Users\Utente\final_df.csv')

final_df = pd.read_csv(FINAL_DF_PATH)
print(final_df.head())

      video_id trending_date  \
0  n1WpP7iowLc      17.14.11   
1  0dBIkQ4Mz1M      17.14.11   
2  5qpjK5DgCt4      17.14.11   
3  d380meD0W0M      17.14.11   
4  2Vv-BfVoq4g      17.14.11   

                                               title channel_title  \
0         Eminem - Walk On Water (Audio) ft. BeyoncÃ©    EminemVEVO   
1                      PLUSH - Bad Unboxing Fan Mail     iDubbbzTV   
2  Racist Superman | Rudy Mancuso, King Bach & Le...  Rudy Mancuso   
3                           I Dare You: GOING BALD!?      nigahiga   
4        Ed Sheeran - Perfect (Official Music Video)    Ed Sheeran   

   category_id              publish_time  \
0           10  2017-11-10T17:00:03.000Z   
1           23  2017-11-13T17:00:00.000Z   
2           23  2017-11-12T19:05:24.000Z   
3           24  2017-11-12T18:01:41.000Z   
4           10  2017-11-09T11:04:14.000Z   

                                                tags     views    likes  \
0  Eminem|"Walk"|"On"|"Water"|"Aftermath/Shad

In [73]:
# Parse the publish timestamps and drop the timezone in a single chain.
# (The timezone was checked in the file and is the same for all countries.)
final_df['publish_time'] = (
    pd.to_datetime(final_df['publish_time'])
    .dt.tz_localize(None)
)

# Start of the 10-minute bucket each timestamp falls in,
# e.g. 02:23:00 -> 02:20:00.
interval_start = final_df['publish_time'].dt.floor('10min')

# Label every bucket as "start-end" using HH:MM for both sides;
# the end of a bucket is its start plus 10 minutes.
interval_end = interval_start + pd.Timedelta(minutes=10)
final_df['interval_10min'] = interval_start.dt.strftime('%H:%M').str.cat(
    interval_end.dt.strftime('%H:%M'), sep="-"
)

print(final_df[['publish_time', 'interval_10min']].head())

         publish_time interval_10min
0 2017-11-10 17:00:03    17:00-17:10
1 2017-11-13 17:00:00    17:00-17:10
2 2017-11-12 19:05:24    19:00-19:10
3 2017-11-12 18:01:41    18:00-18:10
4 2017-11-09 11:04:14    11:00-11:10


## Question 7 : For each interval, determine the number of videos, average number of likes and of dislikes.

In [74]:
# Per-interval summary as a flat table: as_index=False keeps 'interval_10min'
# as an ordinary column instead of letting groupby turn it into the index.
interval_stats = final_df.groupby('interval_10min', as_index=False).agg(
    # 'count' counts the non-null video_id values in each interval
    num_videos=pd.NamedAgg(column='video_id', aggfunc='count'),
    avg_likes=pd.NamedAgg(column='likes', aggfunc='mean'),
    avg_dislikes=pd.NamedAgg(column='dislikes', aggfunc='mean'),
)
print(interval_stats.head())

  interval_10min  num_videos     avg_likes  avg_dislikes
0    00:00-00:10        2913  60951.483350   3787.232750
1    00:10-00:20        1522  22553.870565   1437.457293
2    00:20-00:30        1248  21258.370192   1066.330128
3    00:30-00:40        1625  36604.352000    949.439385
4    00:40-00:50        1283  41770.614186   1889.012471


## Question 8 : For each tag, determine the number of videos. Note that the `tags` column holds a single string containing several tags.

In [75]:
# Split the pipe-separated tag string of every row and explode it so that
# each tag gets a row of its own.
tags_series = final_df['tags'].str.split('|').explode()

# Normalise first: trim surrounding whitespace, strip the enclosing quotation
# marks and lowercase. Doing this before filtering means tags that only become
# empty after stripping (for example a bare pair of quotes) are removed too.
tags_series = tags_series.str.strip().str.strip('"').str.lower()

# Drop missing and empty tags with a boolean mask. The mask is used instead of
# replace('', None) because the meaning of an explicit value=None in
# Series.replace differs between pandas versions.
tags_series = tags_series[tags_series.notna() & (tags_series != '')]

# Number of occurrences of each tag, kept in order of first appearance
# (sort=False) rather than sorted by count.
tag_counts = tags_series.value_counts(sort=False)
print("\n first 20 tags with the number of videos they repeated in:\n")
print(tag_counts.head(20))


 first 20 tags with the number of videos they repeated in:

tags
eminem                         675
walk                           227
on                             496
water                          573
aftermath/shady/interscope     127
rap                           5476
plush                           19
bad unboxing                    20
unboxing                      1385
fan mail                        39
idubbbztv                       42
idubbbztv2                      33
things                         112
best                          2520
packages                        13
plushies                        22
chontent chop                    9
racist superman                122
rudy                           261
mancuso                        210
Name: count, dtype: int64


## Question 9 : Find the tags with the largest number of videos.

In [76]:
# Exclude the literal '[none]' tag from the ranking (presumably the marker
# for rows without tags - confirm against the raw data).
# A boolean mask is used instead of replace('[none]', None) because the
# meaning of an explicit value=None in Series.replace differs between pandas
# versions; dropna() is kept so missing values are still removed.
tags_series_new = tags_series[tags_series != '[none]'].dropna()

# value_counts() sorts by count in descending order by default, so the most
# frequent tags come first.
tag_counts_sorted = tags_series_new.value_counts()

print(tag_counts_sorted.head())

tags
funny     17344
comedy    15701
2018      11402
news       9199
music      8262
Name: count, dtype: int64


## Question 10 : For each (tag, country) pair, compute the average likes/dislikes ratio.

In [77]:
# Build one row per (original row, tag): keep only the needed columns, split
# the pipe-separated tag string into a list, explode the list, then normalise
# each tag (trim whitespace, strip enclosing quotes, lowercase).
df_pairs = (
    final_df[['tags', 'country', 'like_ratio']]
    .assign(tags=lambda frame: frame['tags'].str.split('|'))
    .explode('tags')
    .assign(tags=lambda frame: frame['tags'].str.strip().str.strip('"').str.lower())
)

# Keep only real tags: drop missing values, empty strings and "[none]".
is_real_tag = df_pairs['tags'].notna() & ~df_pairs['tags'].isin(['', '[none]'])
df_pairs = df_pairs[is_real_tag]

# Mean like_ratio for every (tag, country) pair, returned as a flat table with
# explicit column names for a clear output.
# NOTE(review): like_ratio is used exactly as stored in final_df; if it can
# hold inf (e.g. rows with zero dislikes) the group mean will be inf as well -
# confirm how the column was computed upstream.
result = (
    df_pairs
    .groupby(['tags', 'country'], as_index=False)['like_ratio']
    .mean()
    .rename(columns={'tags': 'tag', 'like_ratio': 'average_ratio'})
)

print(result.head(20))

                                                  tag      country  \
0                           ! banii i-au luat mintile      Germany   
1                                                  !!       France   
2                                                  !!       Mexico   
3                                     !! *me bloquea*       Mexico   
4                                                 !!!       Mexico   
5                                                !!!!       France   
6                             !00% sketch comedy show       Russia   
7                                             !t live  South Korea   
8                                   # carlosvideostar       Mexico   
9   # einen schÃ¶nen tag wÃ¼nschen# einen schÃ¶nen so...      Germany   
10                                    # ssc scam 2018        India   
11                            # ssc scam protest 2018        India   
12                                          # ã‚³ãƒªã‚¢ãƒ³ã‚¿ã‚¦ãƒ³        Japan   
13 