**This is the final pre-processing code for the video features in terms of foundational cleaning and merging**

In [1]:
import pandas as pd
import os
import re
from datetime import datetime as dt
import scipy.stats as st
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import requests
from api_keys import (gkey, gkey2, gkey3)
import time
import random
import pickle
import json

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Merging the Data to a single DF

In [2]:
def find_all_files(path, ty='csv', Name='Merged_DF.csv'):
    #Iteratively appends all files with ty extention to list_of_files
    for root,dirs,files in os.walk(path):
        [list_of_files.append(file) for file in files if (file.endswith(f".{ty}") and (file!=Name))]

In [3]:
Name = 'Merged_DF.csv' #Name of Final DF
list_of_files = []
data_path = os.path.join('..', 'Data')
find_all_files(data_path, Name=Name)

Total_DF = pd.DataFrame()
for file in list_of_files:
    try:
        DF = pd.read_csv(os.path.join('..', 'Data', file), encoding='utf-8')
    except:
        DF = pd.read_csv(os.path.join('..', 'Data', file), encoding='latin1')
    DF['country'] = file[:2]
    Total_DF = (DF if Total_DF.empty else pd.concat([Total_DF, DF]))

# Total DF Cleaning

In [None]:
#Total_DF['video_id'].map(lambda x: type(x)!=str).sum() #All values are string
#Total_DF['trending_date'].map(lambda x: type(x)!=str).sum() #All values are string
#Convert the 'trending_date' to date format
Total_DF['trending_date'] = Total_DF['trending_date'].map(lambda x: dt.strptime(x, "%y.%d.%m"))

#Total_DF['title'].map(lambda x: type(x)!=str).sum() #All values are string
#Total_DF['channel_title'].map(lambda x: type(x)!=str).sum() #All values are string
#Total_DF['category_id'].map(lambda x: type(x)!=int).sum() #All values are int
#Total_DF['publish_time'].map(lambda x: type(x)!=str).sum() #All values are string
#Convert the 'publish_time' to date format
Total_DF['publish_time'] = pd.to_datetime(Total_DF['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ') #%f means microsecond which means 6 digits. This works here as it is always 0 microseconds
#Total_DF['tags'].map(lambda x: type(x)!=str).sum() #All values are string
#Total_DF['views'].map(lambda x: type(x)!=int).sum() #All values are int64
#Total_DF['likes'].map(lambda x: type(x)!=int).sum() #All values are int64
#Total_DF['dislikes'].map(lambda x: type(x)!=int).sum() #All values are int64
#Total_DF['comment_count'].map(lambda x: type(x)!=int).sum() #All values are int64
#Total_DF['likes'].isnull().sum()
#Total_DF['dislikes'].isnull().sum()
#Total_DF['comment_count'].isnull().sum()
#Total_DF['thumbnail_link'].isnull().sum()
#Total_DF['comments_disabled'].map(lambda x: type(x)!=bool).sum() #All values are boolean
#Total_DF['ratings_disabled'].map(lambda x: type(x)!=bool).sum() #All values are boolean
#Total_DF['video_error_or_removed'].map(lambda x: type(x)!=bool).sum() #All values are boolean
#Total_DF['comments_disabled'].isnull().sum()
#Total_DF['ratings_disabled'].isnull().sum()
#Total_DF['video_error_or_removed'].isnull().sum()

#Convert NaN values in 'description' to ''
Total_DF['description'].fillna(value='', inplace=True)
#Total_DF['description'].isna().sum()
#Total_DF[Total_DF['description'] == ''].shape

## Removing for duplicate rows

In [4]:
#Drop the duplicate rows
Total_DF.drop_duplicates(subset=['video_id', 'trending_date', 'country'], keep='last', inplace=True)#NEED TO CHANGE
Total_DF.reset_index(drop=True, inplace=True)
to_drop = Total_DF[(Total_DF['video_id']=='#NAME?') | (Total_DF['video_id']=='#VALUE!')].index
Total_DF.drop(to_drop, inplace=True)
Total_DF.reset_index(drop=True, inplace=True)
Total_DF.shape
#375942 - 14518 - 846  = 360578

(360578, 17)

## Removing Videos with multiple publish Times and videos with 'video_error_or_removed'

In [5]:
#Publish time is supposed to be unique. Remove the videos with More than 1 publish time 
Temp_TF = Total_DF.groupby('video_id').aggregate(Publish_Time_Unique_Count = ('publish_time', lambda x : len(set(x.to_list()))))           
#Drop these 29 videos, total 146 corresponding rows
Total_DF.drop(Total_DF[Total_DF['video_id'].isin(Temp_TF[Temp_TF['Publish_Time_Unique_Count']>1].index)].index, inplace=True)
Total_DF.reset_index(drop=True, inplace=True)
Total_DF.shape
#360578 - 146 = 360432

#Some Videos are removed after some time; Let's exclude these videos from the analysis as there is a manual intervention or environment issue 
#50 videos had error (atleast once); Total 215 rows
#Both the below codes give exactly same results; MEANS 'video_error_or_removed' really means that atleast once 'video_error_or_removed'
#Total_DF[Total_DF['video_id'].isin(Total_DF[Total_DF['video_error_or_removed']]['video_id'].unique())]
#Total_DF[Total_DF['video_error_or_removed']]
Total_DF.drop(Total_DF[Total_DF['video_id'].isin(Total_DF[Total_DF['video_error_or_removed']]['video_id'].unique())].index, inplace=True)
Total_DF.reset_index(drop=True, inplace=True)
Total_DF.shape
#360432-215 = 360217

#Remove 'video_error_or_removed' as it doesn't carry any relevant info now
Total_DF.drop('video_error_or_removed', inplace=True, axis=1)
Total_DF.shape

(360432, 17)

(360217, 17)

(360217, 16)

# Save the DataFrame as pickle 

In [None]:
#Total_DF.to_pickle("../Data/VideoDF.pkl") #Commenting to avoid overwritting the existing file

# Read from pickle

In [6]:
Video_DF = pd.read_pickle("../Data/VideoDF.pkl")

In [7]:
#Checking Sanity
Video_DF.dtypes
Video_DF.duplicated(subset=['video_id', 'trending_date', 'country'], keep='last').sum()

video_id                     object
trending_date        datetime64[ns]
title                        object
channel_title                object
category_id                   int64
publish_time         datetime64[ns]
tags                         object
views                         int64
likes                         int64
dislikes                      int64
comment_count                 int64
thumbnail_link               object
comments_disabled              bool
ratings_disabled               bool
description                  object
country                      object
dtype: object

0

##  Introducing the Notion of Popularity. What makes trending videos popular ?

**Popularity score (Longevity): Total days of trending for a video**

**Populrity score of a trending video depends on what ?**

**Useful metric to consider from the dataset**
- Views of Trend Day 1
- Likes of Trend Day 1
- Dislikes of Trend Day 1
- Comment_Count of Trend Day 1
- Words in Title
- Channel Title
- Category ID
- Tags
- comments_disabled
- ratings_disabled
- video_error_or_removed
- description

**Extract the info outside dataset**
- Publish time of the day based on the timezone of the channel (Need outside info)
- Country
- Language

**Extract info outside news**
- News effect on popularity (Can be speific to a category)
- Google trending effect on popularity


# New Video Features via API call

### Sample API call

base = 'https://www.googleapis.com/youtube/v3/videos'


params = {
            'part' : 'snippet,contentDetails',
            #'part' :'localizations',
            #'part' : 'player',
            #'part' : 'recordingDetails',
            #'part' : 'statistics',
            #'part' : 'status',
            'id': <List of Videos>,
            'key':gkey
}

response = requests.get(base, params)


In [None]:
start_time = time.time()
#Video_ID_List = list(set(Video_DF['video_id'])) #COMMENTING NOT TO ACCEDENTALLY RUN THE ENTIRE BATCH
#Video_ID_List = pickle.load(open( "To_scrape.p", "rb" )) #THIS IS USED TO SCRAPE MISSED VIDEOS

Total = []
len(Video_ID_List)
Video_features = dict()
base = 'https://www.googleapis.com/youtube/v3/videos'
Total_Videos = len(Video_ID_List)
count = 0
for iteration in range(((len(Video_ID_List)//50 + 1) if len(Video_ID_List)%50 else len(Video_ID_List)//50)):
    list_of_videos = (Video_ID_List[iteration*50:iteration*50+50] if (iteration+1)*50<=Total_Videos else Video_ID_List[iteration*50:])
    Total = Total + list_of_videos
    params = {
            'part' : 'snippet,contentDetails',
            'id': list_of_videos,
            'key':gkey
    }
    
    response = requests.get(base, params)
    
    try:
        content = response.json()
        
        for item in content['items']:
            count+=1
            ID = item['id']

            Video_features[ID] = {'ChannelID' : (item['snippet']['channelId'] if 'channelId' in item['snippet'] else None)}

            Video_features[ID].update({'ChannelTitle':(item['snippet']['channelTitle'] if 'channelTitle' in item['snippet'] else None)})

            Video_features[ID].update({'DefaultLanguage' : (item['snippet']['defaultLanguage'] if 'defaultLanguage' in item['snippet'] else None)})

            Video_features[ID].update({'DefaultAudioLanguage' : (item['snippet']['defaultAudioLanguage'] if 'defaultAudioLanguage' in item['snippet'] else None)})

            Video_features[ID].update({'Duration' : (item['contentDetails']['duration'] if 'duration'in item['contentDetails'] else None)})

            Video_features[ID].update({'Caption' : (item['contentDetails']['caption'] if 'caption'in item['contentDetails'] else None)})

            Video_features[ID].update({'RegionRestriction_Blocked' : (((item['contentDetails']['regionRestriction']['blocked']  if  ('blocked' in item['contentDetails']['regionRestriction']) else None)) if ('regionRestriction' in item['contentDetails']) else None)})   

            Video_features[ID].update({'RegionRestriction_Allowed' : (((item['contentDetails']['regionRestriction']['allowed']  if  ('allowed' in item['contentDetails']['regionRestriction']) else None)) if ('regionRestriction' in item['contentDetails']) else None)})   
            
    except:
        print(response.url)
        print(response)
        print(f"Didn't get response for iteration {iteration}")
        
    time.sleep(2)
    print(count)
time_taken_in_min=(time.time()-start_time)//60 
#Before I start,
#YouTube Data API v3	74	
    

## Saving the scraped Data

In [40]:
#Scraped_DF = pd.DataFrame(Video_features).transpose()
#Remove duplicates if any
#Scraped_DF=Scraped_DF[~Scraped_DF.duplicated(['video_id'], keep='first')]
#Scraped_DF.to_pickle("../Data/API_RETRIEVED_DATA.pkl")
Scraped_DF = pd.read_pickle('../Data/API_RETRIEVED_DATA.pkl')
#Scraped_DF.columns = ['video_id'] + Scraped_DF.columns[1:].to_list()
Scraped_DF.shape

(124848, 9)

In [None]:
#Scraped_DF.to_csv('../Output/API_RETRIEVED_DATA.csv', index=False, encoding='utf-8')

## Merge the Video Data with Scraped Data (Inner)

In [9]:
Video_DF.shape
Scraped_DF.shape

Video_DF.columns
Scraped_DF.columns

New_Video_DF = Video_DF.merge(Scraped_DF, how='inner', on='video_id')
New_Video_DF.columns
New_Video_DF.shape

(360217, 16)

(124848, 9)

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'description', 'country'],
      dtype='object')

Index(['video_id', 'ChannelID', 'ChannelTitle', 'DefaultLanguage',
       'DefaultAudioLanguage', 'Duration', 'Caption',
       'RegionRestriction_Blocked', 'RegionRestriction_Allowed'],
      dtype='object')

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'description', 'country', 'ChannelID', 'ChannelTitle',
       'DefaultLanguage', 'DefaultAudioLanguage', 'Duration', 'Caption',
       'RegionRestriction_Blocked', 'RegionRestriction_Allowed'],
      dtype='object')

(266934, 24)

# Channel Information through API call

### Sample API call

base = 'https://www.googleapis.com/youtube/v3/channels'

params = {
            'part' : 'snippet,brandingSettings,topicDetails',
            #'part' :'localizations',
            #'part' : 'player',
            #'part' : 'recordingDetails',
            #'part' : 'statistics',
            #'part' : 'status',
            'id': Channel_ID_list[1000:1050],
            'key':gkey3
}

response = requests.get(base, params)


In [None]:
start_time = time.time()
#Channel_ID_list = list(set(New_Video_DF['ChannelID'])) #COMMENTING NOT TO ACCEDENTALLY RUN THE ENTIRE BATCH
#len(Channel_ID_list) = 23159
#Channel_ID_list = Remaining #THIS IS USED TO SCRAPE MISSED VIDEOS

Total = []
len(Channel_ID_list)
Channel_features = dict()
base = 'https://www.googleapis.com/youtube/v3/channels'
Total_Channels = len(Channel_ID_list)
count = 0
for iteration in range(((len(Channel_ID_list)//50 + 1) if len(Channel_ID_list)%50 else len(Channel_ID_list)//50)):
    list_of_channels = (Channel_ID_list[iteration*50:iteration*50+50] if (iteration+1)*50<=Total_Channels else Channel_ID_list[iteration*50:])
    Total = Total + list_of_channels
    params = {
            'part' : 'snippet,brandingSettings,topicDetails',
            'id': list_of_channels,
            'key':gkey
            }
    
    response = requests.get(base, params)
 


    try:
        content = response.json()

        for item in content['items']:
            count+=1
            ID = item['id']

            Channel_features[ID] = {'Channel_Title' : (item['snippet']['title'] if 'title' in item['snippet'] else None)}

            Channel_features[ID].update({'Channel_Description' : (item['snippet']['description'] if 'description' in item['snippet'] else None)})

            Channel_features[ID].update({'Channel_PublishedAt' : (item['snippet']['publishedAt'] if 'publishedAt' in item['snippet'] else None)})

            Channel_features[ID].update({'country' : (item['snippet']['country'] if 'country' in item['snippet'] else None)})

            Channel_features[ID].update({'defaultLanguage' : (item['snippet']['defaultLanguage'] if 'defaultLanguage' in item['snippet'] else None)})

            Channel_features[ID].update({'keywords' : (((item['brandingSettings']['channel']['keywords']  if  ('keywords' in item['brandingSettings']['channel']) else None)) if ('channel' in item['brandingSettings']) else None)})

            Channel_features[ID].update({'GoogleAnalytics' : (((True  if  ('trackingAnalyticsAccountId' in item['brandingSettings']['channel']) else False)) if ('channel' in item['brandingSettings']) else False)})

            Channel_features[ID].update({'moderateComments' : (((item['brandingSettings']['channel']['moderateComments']  if  ('moderateComments' in item['brandingSettings']['channel']) else False)) if ('channel' in item['brandingSettings']) else False)})

            Channel_features[ID].update({'topicCategories' : ((' | '.join([category.split('/')[-1] for category in item['topicDetails']['topicCategories']]) if 'topicCategories' in item['topicDetails'] else None)) if 'topicDetails' in item else None})

    except:
        print(response.url)
        print(response)
        print(f"Didn't get response for iteration {iteration}")
        
    time.sleep(2)
    print(count)
time_taken_in_min=(time.time()-start_time)//60 
#Before I start,
#YouTube Data API v3	5,210	   
            

## Saving the scraped Data

In [39]:
#Channel_features_Scraped_DF = pd.DataFrame(Channel_features).transpose()
#Channel_features_Scraped_DF.reset_index(inplace=True)
#Channel_features_Scraped_DF.columns = ['ChannelID'] + Channel_features_DF.columns[1:].to_list()
#Channel_features_Scraped_DF.to_pickle("../Data/Channel_API_RETRIEVED_DATA.pkl")

Channel_features_DF = pd.read_pickle("../Data/Channel_API_RETRIEVED_DATA.pkl")

In [11]:
Channel_features_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23155 entries, 0 to 23154
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ChannelID            23155 non-null  object
 1   Channel_Title        23155 non-null  object
 2   Channel_Description  23155 non-null  object
 3   Channel_PublishedAt  23154 non-null  object
 4   country              16525 non-null  object
 5   defaultLanguage      2084 non-null   object
 6   keywords             17390 non-null  object
 7   GoogleAnalytics      23155 non-null  object
 8   moderateComments     23155 non-null  object
 9   topicCategories      23099 non-null  object
dtypes: object(10)
memory usage: 1.8+ MB


In [None]:
#Channel_features_DF.to_csv("../Output/Channel_API_RETRIEVED_DATA.csv", encoding='utf-8', index=False)

In [12]:
Channel_features_DF.head()
New_Video_DF.head()
Channel_features_DF.shape
New_Video_DF.shape

Unnamed: 0,ChannelID,Channel_Title,Channel_Description,Channel_PublishedAt,country,defaultLanguage,keywords,GoogleAnalytics,moderateComments,topicCategories
0,UCcADqTjMyMol8B8mWm9n6rA,SECHSKIES,SECHSKIES Official YouTube Channel\n젝스키스 공식 유튜...,2016-08-12T04:43:25Z,,,"""YG Entertainment"" YG 와이지 K-pop 젝스키스 젝키 SECHSK...",True,False,Music | Entertainment
1,UClmXPfaYhXOYsNn_QUyheWQ,Ed Sheeran - Topic,"Edward Christopher ""Ed"" Sheeran is an English ...",2013-07-03T16:09:35Z,,,,False,False,Pop_music | Music | Electronic_music | Hip_hop...
2,UCknYpLMv_eQQJn5u8zQ8IJQ,RadioEvropaLire,Misioni i Radios Evropa e Lirë është plasimi i...,2011-02-15T11:30:09Z,,,,True,False,Society
3,UCBhtXZI7_FxRw-6V19oloBw,Olga Astrology,Канал посвящен Астрологии - делам земным и неб...,2011-03-02T14:05:32Z,BE,,астрология,False,False,Hobby | Lifestyle_(sociology) | Society
4,UCvMu4ihpZCShKdbjjMkSxQA,La Jefa Ingrid Ramos,"¡Hola, soy La Jefa Ingrid Ramos! En este canal...",2017-12-10T18:52:47Z,MX,,"cocina recetas tips ""cocineros mexicanos"" ""la ...",False,False,Food | Lifestyle_(sociology)


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,...,description,country,ChannelID,ChannelTitle,DefaultLanguage,DefaultAudioLanguage,Duration,Caption,RegionRestriction_Blocked,RegionRestriction_Allowed
0,SbOwzAl9ZfQ,2017-11-14,CapÃ­tulo 12 | MasterChef 2017,MasterChef 2017,24,2017-11-13 06:06:22,"MasterChef Junior 2017|""TV Azteca""|""recetas""|""...",310130,4182,361,...,Disfruta la presencia del Chef Torreblanca en ...,MX,UCDYetMc6gOLkhIiNzFyrJPA,MasterChef México,,,PT1H48M8S,False,,[MX]
1,SbOwzAl9ZfQ,2017-11-15,CapÃ­tulo 12 | MasterChef 2017,MasterChef 2017,24,2017-11-13 06:06:22,"MasterChef Junior 2017|""TV Azteca""|""recetas""|""...",684302,5891,553,...,Disfruta la presencia del Chef Torreblanca en ...,MX,UCDYetMc6gOLkhIiNzFyrJPA,MasterChef México,,,PT1H48M8S,False,,[MX]
2,klOV6Xh-DnI,2017-11-14,ALEXA EX-INTEGRANTE DEL GRUPO TIMBIRICHE RENUN...,Micky Contreras Martinez,22,2017-11-13 05:11:58,La Voz Mexico 7,104972,271,174,...,ALEXA EX-INTEGRANTE DEL GRUPO TIMBIRICHE RENUN...,MX,UCZYbxoZhCltabKhvgwLHnig,Micky Contreras Martinez,,,PT9M13S,False,"[SD, UA, TL, NZ, US, NP, FI, FJ, FK, UY, FM, F...",
3,6L2ZF7Qzsbk,2017-11-14,LOUIS CKAGÃ - EL PULSO DE LA REPÃBLICA,El Pulso De La RepÃºblica,25,2017-11-13 17:00:02,"Chumel Torres|""El Pulso de la Republica""|""noti...",136064,10105,266,...,La canciÃ³n del principio se llama âEste esp...,MX,UCK0_zBeybLuyXbOcHp7wmJA,El Pulso De La República,es-419,es,PT22M34S,False,,
4,6L2ZF7Qzsbk,2017-11-15,LOUIS CKAGÃ - EL PULSO DE LA REPÃBLICA,El Pulso De La RepÃºblica,25,2017-11-13 17:00:02,"Chumel Torres|""El Pulso de la Republica""|""noti...",449540,18377,586,...,La canciÃ³n del principio se llama âEste esp...,MX,UCK0_zBeybLuyXbOcHp7wmJA,El Pulso De La República,es-419,es,PT22M34S,False,,


(23155, 10)

(266934, 24)

## Merge the channel features to the video features (Inner)

In [13]:
New_Video_Channel_DF = New_Video_DF.merge(Channel_features_DF, how='inner', on='ChannelID', suffixes=('', '_CH'))
New_Video_Channel_DF.shape

(266928, 33)

In [14]:
New_Video_Channel_DF.iloc[10000,:]

video_id                                                           xL_qpDkF5A8
trending_date                                              2017-11-18 00:00:00
title                        American Crime Story Season 2: The Assassinati...
channel_title                                                        TV Promos
category_id                                                                 24
publish_time                                               2017-11-15 01:25:18
tags                         Donatella Versace|"Antonio D' Amico"|"Penelope...
views                                                                   408821
likes                                                                     3912
dislikes                                                                   152
comment_count                                                              466
thumbnail_link                  https://i.ytimg.com/vi/xL_qpDkF5A8/default.jpg
comments_disabled                                   

# Process and  find what is missing 

**Remove**
- Remove Channel_Title as channel_title has the same content
- Remove thumbnail_link as we do not cover any computer vision techniques here

**Rename**
- Rename country as viewing_country
- Rename country_CH as origin_country
- Rename Caption as Caption_Enabled
- Rename GoogleAnalytics as GoogleAnalyticsUsed
- Rename moderateComments as IsmoderatingComments
- Rename topicCategories as channelTopicCategories

**Mapping**
- Map category_id based on category mapping in each countries in the json file
- Combine defaultLanguage with DefaultLanguage and DefaultAudioLanguage. If None of them are present, then google 
- Combine DefaultAudioLanguage(pref 1)  with DefaultLanguage(pref 2) and defaultLanguage(pref 3 - channel) - Lang
- Map Lang to human readable

**Type Conversion**
- Convert Duration to seconds
- Convert Channel_PublishedAt to Datetime

**Derived Variable**
- Derive a new variable :- PublishedAfter(Days) - > in months (publish_time - Channel_PublishedAt approximated to nearest number of days)

**translate(future)**
- description to identify the language


In [15]:
Cleaned_Video_Channel_DF = New_Video_Channel_DF.copy()
Cleaned_Video_Channel_DF.drop(['Channel_Title','ChannelTitle','thumbnail_link'], axis=1, inplace=True)
#Cleaned_Video_Channel_DF.columns
Cleaned_Video_Channel_DF.rename(columns={
   'country': 'viewing_country',
   'country_CH': 'origin_country',
   'Caption' : 'Caption_Enabled',
   'GoogleAnalytics' : 'GoogleAnalyticsUsed',
   'moderateComments' : 'IsmoderatingComments',
   'topicCategories' : 'channelTopicCategories'
}, inplace=True)
#Cleaned_Video_Channel_DF.columns


**All the category JSON files contain the same info with an exception that, in US, there is an additional category called '29': 'Nonprofits & Activism'. Hence, taking US json file for categories for all countries**



List_of_JSON = [(file, file.split('_')[0]) for file in list(os.walk(os.path.join('..', 'Data')))[0][-1] if (file.endswith("category_id.json"))]   
Dict_Country_Catogory = dict()
for file, country in List_of_JSON:
    with open(os.path.join('..', 'Data', file)) as f:
        data = json.load(f)
        Dict_Country_Catogory.update({country:{item['id']: item['snippet']['title'] for item in data['items']}})






In [16]:
List_of_JSON = [(file, file.split('_')[0]) for file in list(os.walk(os.path.join('..', 'Data')))[0][-1] if (file.endswith("category_id.json"))]   
Dict_Catogory = dict()
file = 'US_category_id.json'
with open(os.path.join('..', 'Data', file)) as f:
    data = json.load(f)
    Dict_Catogory.update({int(item['id']): item['snippet']['title'] for item in data['items']})
#Dict_Catogory

Cleaned_Video_Channel_DF['category'] = Cleaned_Video_Channel_DF['category_id'].map(Dict_Catogory)
Cleaned_Video_Channel_DF.drop('category_id', axis=1, inplace=True)

In [17]:
Cleaned_Video_Channel_DF['defaultLanguage'].isnull().sum() #Channel
Cleaned_Video_Channel_DF['DefaultAudioLanguage'].isnull().sum() #Video
Cleaned_Video_Channel_DF['DefaultLanguage'].isnull().sum() #Video

#API call to get Language Mapping
params = {'part':'snippet', 'key':gkey}
response = requests.get('https://www.googleapis.com/youtube/v3/i18nLanguages', params)
content = response.json()
code_language_dict = dict()
for item in content['items']:
    code_language_dict.update({item['snippet']['hl']:item['snippet']['name']})
    
#Create Lang as a combination all 3 columns related to language    
Cleaned_Video_Channel_DF['Lang'] = Cleaned_Video_Channel_DF.apply(lambda x: (x['DefaultAudioLanguage'] if ['DefaultAudioLanguage'] else (x['DefaultLanguage'] if x['DefaultLanguage'] else x['defaultLanguage'])), axis=1)                           
Cleaned_Video_Channel_DF['Lang'] = Cleaned_Video_Channel_DF['Lang'].map(lambda x: code_language_dict.get(x,None) if x else x)

#Drop redundant language columns
Cleaned_Video_Channel_DF.drop(['DefaultAudioLanguage', 'DefaultLanguage', 'defaultLanguage'], axis=1, inplace=True)

234913

95787

199903

In [18]:
def time_to_seconds(x):
    hours = (int(re.findall('\d+(?=H)', x)[0]) if re.findall('\d+(?=H)', x) else 0)
    minutes = (int(re.findall('\d+(?=M)', x)[0]) if re.findall('\d+(?=M)', x) else 0)
    seconds = (int(re.findall('\d+(?=S)', x)[0]) if re.findall('\d+(?=S)', x) else 0)
    return hours*60*60 + minutes*60 + seconds
Cleaned_Video_Channel_DF['Duration'] = Cleaned_Video_Channel_DF['Duration'].map(lambda x: time_to_seconds(x))

In [19]:
Cleaned_Video_Channel_DF['Channel_PublishedAt'] = pd.to_datetime(Cleaned_Video_Channel_DF['Channel_PublishedAt'], format='%Y-%m-%dT%H:%M:%SZ')  



In [20]:
#Remove Data without channel published At is NaN (only 89)
Cleaned_Video_Channel_DF.drop(Cleaned_Video_Channel_DF[Cleaned_Video_Channel_DF['Channel_PublishedAt'].isna()].index, inplace=True)  

#Remove Data with publish_time < Channel_PublishedAt (Some channels get public after publishing some content); Only 293
Cleaned_Video_Channel_DF.drop(Cleaned_Video_Channel_DF[Cleaned_Video_Channel_DF['publish_time']<=Cleaned_Video_Channel_DF['Channel_PublishedAt']].index,inplace=True)    
                              
                              

In [21]:
def indays(x,y):
    z=x-y
    return int(z.days + (1 if z.seconds/3600 >= 12 else 0))
Cleaned_Video_Channel_DF['PublishedAfter(Days)'] = Cleaned_Video_Channel_DF.apply(lambda x: indays(x['publish_time'],x['Channel_PublishedAt']), axis=1)


In [22]:
Cleaned_Video_Channel_DF.shape

(266546, 29)

In [23]:
Cleaned_Video_Channel_DF.columns

Index(['video_id', 'trending_date', 'title', 'channel_title', 'publish_time',
       'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'comments_disabled', 'ratings_disabled', 'description',
       'viewing_country', 'ChannelID', 'Duration', 'Caption_Enabled',
       'RegionRestriction_Blocked', 'RegionRestriction_Allowed',
       'Channel_Description', 'Channel_PublishedAt', 'origin_country',
       'keywords', 'GoogleAnalyticsUsed', 'IsmoderatingComments',
       'channelTopicCategories', 'category', 'Lang', 'PublishedAfter(Days)'],
      dtype='object')

In [24]:
Sorted_Cleaned_Vid_Ch_DF = Cleaned_Video_Channel_DF.sort_values(by=['video_id', 'Lang']) #To make Na position last
Description_DF = Sorted_Cleaned_Vid_Ch_DF.groupby('video_id')[['description','Lang']].first()
Need_To_Translate = Description_DF[Description_DF['Lang'].isna()].copy()
Need_To_Translate.reset_index(inplace=True)

# Language Detection through API Call

**Out of 124737 unique videos, 51935 don't have Lang. Hence, we use description to understand the Lang**

### Sample API call

base = 'https://translation.googleapis.com/language/translate/v2/detect'

params = {
            'q': first_sentence(Need_To_Translate['description'][2378]),
            'key':gkey
}

response = requests.get(base, params)

In [25]:
def first_sentences(x, sent=2):
    split = re.split('(?<=[.!?]) ', x)
    return ' '.join(split[:sent]) if len(split)>sent else ' '.join(split)

In [26]:
def first_100_letters(x, letters=100):
    return (x[:100] if len(x)>100 else x)

In [None]:
#PLEASE DON'T RUN ACCEDENTALLY
start_time = time.time()


base =  'https://translation.googleapis.com/language/translate/v2/detect'


BS = 5

for iteration in range(((Need_To_Translate.shape[0]//BS + 1) if Need_To_Translate.shape[0]%BS else Need_To_Translate.shape[0]//BS)):
    Batch = (Need_To_Translate['description'][iteration*BS:iteration*BS+BS] if (iteration+1)*BS<=Need_To_Translate.shape[0] else Need_To_Translate['description'][iteration*BS:])
    #Batch = Batch.map(lambda x: first_sentences(x)).to_list()
    Batch = Batch.map(lambda x: first_100_letters(x)).to_list()
    
    params = {
            'q': Batch,
            'key':gkey
            }
    
    response = requests.get(base, params)
 
    count=0
    try:
        content = response.json()
        for item in content['data']['detections']:
            Need_To_Translate.loc[iteration*BS + count, 'Lang']=item[0]['language']
            count+=1
    except:
        print(response.url)
        print(response)
        print(f"Didn't get response for iteration {iteration}")
        
    time.sleep(1)
    print(iteration*BS)
    print(Need_To_Translate.loc[iteration*BS:iteration*BS+count, 'Lang'])
time_taken_in_min=(time.time()-start_time)//60 
#Before I start,
  
            


## Saving the scraped data

In [27]:
#Need_To_Translate.to_pickle("../Data/Lang_Detect_TILL_NOW.pkl")
Translated = pd.read_pickle("../Data/Lang_Detect_TILL_NOW.pkl")
Dict1 = Description_DF[~Description_DF['Lang'].isnull()]['Lang'].to_dict()
Translated.set_index('video_id',inplace=True)
Dict2 = Translated[~Translated['Lang'].map(code_language_dict).isnull()]['Lang'].map(code_language_dict).to_dict() #Some languages are undetermined, not in the list etc
Dict1.update(Dict2)#VideoID - Language Mapping

In [None]:
#Translated.to_csv("../Output/Lang_Detect_TILL_NOW.csv", encoding='utf-8')

## Add the language to the Content

In [28]:
Cleaned_Video_Channel_DF.loc[:,'Lang'] = Cleaned_Video_Channel_DF['video_id'].map(Dict1)

In [29]:
Cleaned_Video_Channel_DF.reset_index(drop=True, inplace=True)
Cleaned_Video_Channel_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266546 entries, 0 to 266545
Data columns (total 29 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   video_id                   266546 non-null  object        
 1   trending_date              266546 non-null  datetime64[ns]
 2   title                      266546 non-null  object        
 3   channel_title              266546 non-null  object        
 4   publish_time               266546 non-null  datetime64[ns]
 5   tags                       266546 non-null  object        
 6   views                      266546 non-null  int64         
 7   likes                      266546 non-null  int64         
 8   dislikes                   266546 non-null  int64         
 9   comment_count              266546 non-null  int64         
 10  comments_disabled          266546 non-null  bool          
 11  ratings_disabled           266546 non-null  bool    

In [None]:
#Cleaned_Video_Channel_DF.to_pickle("../Data/Cleaned_Data(No_Agg).pkl")

# Create a popularity DF (by Aggregating the Cleaned_Video_Channel_DF)

In [30]:
Cleaned_Video_Channel_DF = Cleaned_Video_Channel_DF.sort_values(by=['video_id', 'viewing_country', 'trending_date'])
Cleaned_Video_Channel_DF.reset_index(drop=True, inplace=True)
Cleaned_Video_Channel_DF.shape

(266546, 29)

In [None]:
#Cleaned_Video_Channel_DF.to_pickle("../Data/Cleaned_Data(No_Agg).pkl")

In [31]:
Popularity_DF = Cleaned_Video_Channel_DF.groupby(['video_id', 'viewing_country']).aggregate(Total_Trend_Days=('trending_date', 'count'), Video_Title=('title', 'first'),
                                                                                            First_Trending=('trending_date', 'min'),
                                                                                            Publish_Time=('publish_time', 'first'), 
                                                                                            PublishedAfter_in_Days = ('PublishedAfter(Days)', 'first'),
                                                                                            Origin_Country = ('origin_country', 'first'),
                                                                                            Category = ('category', 'first'),
                                                                                            Tags = ('tags', 'first'),
                                                                                            Duration = ('Duration', 'first'), Language = ('Lang', 'first'),
                                                                                            Views = ('views', 'first'), Likes=('likes', 'first'),
                                                                                            Dislikes = ('dislikes', 'first'), Comment_Count = ('comment_count', 'first'),
                                                                                            Comments_Disabled=('comments_disabled', 'first'),
                                                                                            Ratings_Disabled=('ratings_disabled', 'first'),
                                                                                            Caption_Enabled = ('Caption_Enabled', 'first'),
                                                                                            Video_Description = ('description', 'first'),
                                                                                            Blocked = ('RegionRestriction_Blocked', 'first'),
                                                                                            Allowed = ('RegionRestriction_Allowed', 'first'),
                                                                                            Channel_Title = ('channel_title', 'first'),
                                                                                            Channel_Description = ('Channel_Description', 'first'),
                                                                                            Channel_PublishedAt = ('Channel_PublishedAt', 'first'),
                                                                                            Channel_Keywords = ('keywords', 'first'),
                                                                                            ChannelTopicCategories = ('channelTopicCategories', 'first'),
                                                                                            GoogleAnalyticsUsed = ('GoogleAnalyticsUsed', 'first'), #Should not use in prediction as it is the current info
                                                                                            IsmoderatingComments = ('IsmoderatingComments', 'first'),) #Should not use in prediction as it is the current info


In [32]:
Popularity_DF.reset_index(inplace=True)

In [33]:
Popularity_DF.shape

(142846, 29)

# Save the PopularityDF

In [None]:
#Popularity_DF.to_pickle("../Data/Popularity_DF.pkl")

In [34]:
DF = pd.read_pickle("../Data/Popularity_DF.pkl")

In [35]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142846 entries, 0 to 142845
Data columns (total 29 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   video_id                142846 non-null  object        
 1   viewing_country         142846 non-null  object        
 2   Total_Trend_Days        142846 non-null  int64         
 3   Video_Title             142846 non-null  object        
 4   First_Trending          142846 non-null  datetime64[ns]
 5   Publish_Time            142846 non-null  datetime64[ns]
 6   PublishedAfter_in_Days  142846 non-null  int64         
 7   Origin_Country          115154 non-null  object        
 8   Category                142846 non-null  object        
 9   Tags                    142846 non-null  object        
 10  Duration                142846 non-null  int64         
 11  Language                134847 non-null  object        
 12  Views                   142846

# Create Popularity_FS_DF which includes final states of views-likes-dislikes-comments

In [36]:
Popularity_FS_DF = Cleaned_Video_Channel_DF.groupby(['video_id', 'viewing_country']).aggregate(Total_Trend_Days=('trending_date', 'count'), Video_Title=('title', 'first'),
                                                                                            First_Trending=('trending_date', 'min'),
                                                                                            Publish_Time=('publish_time', 'first'), 
                                                                                            PublishedAfter_in_Days = ('PublishedAfter(Days)', 'first'),
                                                                                            Origin_Country = ('origin_country', 'first'),
                                                                                            Category = ('category', 'first'),
                                                                                            Tags = ('tags', 'first'),
                                                                                            Duration = ('Duration', 'first'), Language = ('Lang', 'first'),
                                                                                            Views_I = ('views', 'first'), Views_F = ('views', 'last'),
                                                                                            Likes_I=('likes', 'first'), Likes_F=('likes', 'last'),
                                                                                            Dislikes_I = ('dislikes', 'first'), Dislikes_F = ('dislikes', 'last'),
                                                                                            Comment_Count_I = ('comment_count', 'first'), Comment_Count_F = ('comment_count', 'last'),
                                                                                            Comments_Disabled=('comments_disabled', 'first'),
                                                                                            Ratings_Disabled=('ratings_disabled', 'first'),
                                                                                            Caption_Enabled = ('Caption_Enabled', 'first'),
                                                                                            Video_Description = ('description', 'first'),
                                                                                            Blocked = ('RegionRestriction_Blocked', 'first'),
                                                                                            Allowed = ('RegionRestriction_Allowed', 'first'),
                                                                                            Channel_Title = ('channel_title', 'first'),
                                                                                            Channel_Description = ('Channel_Description', 'first'),
                                                                                            Channel_PublishedAt = ('Channel_PublishedAt', 'first'),
                                                                                            Channel_Keywords = ('keywords', 'first'),
                                                                                            ChannelTopicCategories = ('channelTopicCategories', 'first'),
                                                                                            GoogleAnalyticsUsed = ('GoogleAnalyticsUsed', 'first'), #Should not use in prediction as it is the current info
                                                                                            IsmoderatingComments = ('IsmoderatingComments', 'first'),) #Should not use in prediction as it is the current info


In [37]:
#Popularity_FS_DF.to_pickle("../Data/Popularity_FS_DF.pkl")

# Dataframe to be used for regionwise insights

**For region wise analysis, we can use the full version of data as the viewing country and trending information doesn't have any NaNs**

In [38]:
## Merge the Video Data with Scraped Data (left)
New_Video_DF = Video_DF.merge(Scraped_DF, how='left', on='video_id')
New_Video_DF.columns
New_Video_DF.shape

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'description', 'country', 'ChannelID', 'ChannelTitle',
       'DefaultLanguage', 'DefaultAudioLanguage', 'Duration', 'Caption',
       'RegionRestriction_Blocked', 'RegionRestriction_Allowed'],
      dtype='object')

(360217, 24)

In [None]:
#New_Video_DF.to_pickle("../Data/New_Video_DF_360217.pkl")