#### Data Extraction
extract essential information from JSON for the analysis
"title","views", "likes","comments", 'duration','tag','description', 'channel', 'category', 'published'

In [43]:
#extracting json to df
import os, json
import pandas as pd
import numpy as np
import glob
from datetime import datetime
pd.set_option('display.max_columns', None)

dir = 'data'
path = os.path.join(dir, '**/*.json')
file_list = glob.glob(path)
print('Total number of channels ' + str(len(file_list)))
# print(file_list)
dfs = list()

for file in file_list:
    with open(file, 'r') as f:
        data = json.load(f)
    channel_id, stats = data.popitem()
    pchannel_stats = stats["channel_statistics"]
    video_stats = stats["video_data"]
    # sorted_vids = sorted(video_stats.items(), key=lambda item: int(item[1]["viewCount"]),reverse=True)
    vids = video_stats.items()
    stats = []
    for vid in vids:
        video_id = vid[0]
        title = vid[1]["title"]
        try:
            views = vid[1]["viewCount"]
            likes = vid[1]["likeCount"]
            duration = vid[1]['duration']
            tags = vid[1]['tags']
            description = vid[1]['description']
            comments = vid[1]["commentCount"]
            channel = vid[1]['channelTitle']
            published = vid[1]['publishedAt'].split('T')[0]
        except:
            pass
        cat = os.path.dirname(file).split('\\')[1]
        stats.append([title,views, published, likes, comments, duration, tags, description, channel, cat])
    vid_df = pd.DataFrame(stats, columns=["title","views", 'published',"likes","comments", 'duration','tag','description', 'channel', 'category'])
    dfs.append(vid_df)
df = pd.concat(dfs, ignore_index=True)
print('Total number of videos ' + str(df.shape[0]))


Total number of channels 112
Total number of videos 43209


Data Cleaning

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43209 entries, 0 to 43208
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        43209 non-null  object
 1   views        43209 non-null  object
 2   published    43209 non-null  object
 3   likes        43209 non-null  object
 4   comments     43209 non-null  object
 5   duration     43209 non-null  object
 6   tag          43209 non-null  object
 7   description  43209 non-null  object
 8   channel      43209 non-null  object
 9   category     43209 non-null  object
dtypes: object(10)
memory usage: 3.3+ MB


In [45]:
# convert published date into datetime
df.published = pd.to_datetime(df.published, format='%Y-%m-%d')

# convert duration into minute with regex

df.sample(5)

Unnamed: 0,title,views,published,likes,comments,duration,tag,description,channel,category
39177,"Lake Como, Italy: Bellagio and Varenna",525512,2012-05-10,2135,77,PT3M1S,"[Rick Steves, Milano, rick steves lake como, l...",More info about travel to the Italian Lakes: h...,Rick Steves' Europe,Travel
26787,By the Numbers: Thousands of students ‘missing...,6538,2021-06-12,100,31,PT1M35S,"[abcnl, by, from, many, missing, numbers, of, ...",A look at the thousands of students who fell o...,ABC News,News
4728,Cool Guys Don't Look At Explosions,50205179,2009-06-05,363087,39897,PT2M31S,"[Andy, Samberg, 2009, MTV, Movie, Awards, SNL,...",Here's the song we made for the 2009 MTV Movie...,thelonelyisland,Comedy
28058,Sean Penn helps Ukrainian fighter pilots lobby...,68014,2022-06-25,2166,1620,PT3M24S,"[latest News, Happening Now, CNN, Moonfish, Ju...",CNN's Jim Acosta speaks to actor Sean Penn and...,CNN,News
15316,What's the Difference Between Baking Powder an...,1868105,2016-11-14,34250,954,PT5M8S,"[SciShow, science, Hank, Green, education, lea...",Powder vs. Soda: an important distinction!\n\n...,SciShow,Educational


convert text into Word2Vec

In [46]:
import pandas as pd
import gensim.downloader as api
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# import nltk; nltk.download('popular')

We use gensim pretrained model called 'glove-wiki-gigaword-50' with the model trained on wikipedia dataset

In [47]:
# importing pretrained gensim model to convert text tokens into vector
model_wv = api.load('glove-wiki-gigaword-50')

In [48]:
# function to convert token to word vector with word2vec finding average vector for the document
# tokenize function with df series as input to give a list of list result of each title
def df_token(df_text_series):
    # set of english stopwords
    stopset = set(stopwords.words('english'))
    token_item = []
    for t in df_text_series:
        if type(t) == list:
            t = ' '.join(t)
        else:
            pass
        tokens = [word for word in word_tokenize(t.lower()) if word not in stopset]
        token_item.append(tokens)
    return token_item

def gen_word_vec(df_text_series, wordvec):
    word_vector = []
    for token in df_token(df_text_series):
        token_considered = [t for t in token if t.isalpha]
        token_vocab = [i for i in token_considered if i in wordvec.key_to_index]
        if len(token_vocab) > 0:
            word_vector.append(np.mean(wordvec[token_vocab], axis=0))
        else:
            word_vector.append(np.zeros(wordvec.vector_size))
    word_vector = np.array(word_vector)
    return word_vector

In [49]:
title_word_vec = gen_word_vec(df.title, model_wv)
tag_word_vec = gen_word_vec(df.tag, model_wv)
desc_word_vec = gen_word_vec(df.description, model_wv)

In [50]:
# add this vector to the original df and see the similarity within the channel
df['title_vector'] = title_word_vec.tolist()
df['tag_vector'] = tag_word_vec.tolist()
df['desc_vector'] = desc_word_vec.tolist()

In [51]:
df.sample(3)

Unnamed: 0,title,views,published,likes,comments,duration,tag,description,channel,category,title_vector,tag_vector,desc_vector
15344,The myth of Zeus' test - Iseult Gillespie,438199,2022-09-13,19485,696,PT6M9S,"[zeus, hermes, baucis, philemon, baucis and ph...","Dig into the myth of Baucis and Philemon, a co...",TED-Ed,Educational,"[0.1946544349193573, 0.38024282455444336, -0.1...","[0.11282431334257126, 0.6269444227218628, -0.3...","[0.2323508858680725, 0.5799580216407776, 0.295..."
12240,Introduction to improper integrals | AP Calcul...,526746,2014-06-06,1151,91,PT3M52S,[dotsub],"Learn more about ATP: how it stores energy, an...",Khan Academy,Educational,"[-0.3846859931945801, 0.6127420663833618, 0.14...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.03771638125181198, 0.39429447054862976, -0...."
33397,How sea sponges are harvested #shorts,16776,2022-02-09,537,51,PT1M1S,"[science, tech, science insider]",#sponges #seasponge #scienceinsider\n\nHow sea...,Science Insider,Science,"[0.04434001445770264, 0.05265200138092041, -0....","[-0.6982625722885132, 0.4484702944755554, 0.15...","[-0.08555710315704346, 0.2091914266347885, 0.3..."
