#### Data Extraction
extract essential information from JSON for the analysis
"title","views", "likes","comments", 'duration','tag','description', 'channel', 'category', 'published'

In [90]:
#extracting json to df
import os, json
import pandas as pd
import numpy as np
import glob
from datetime import datetime
pd.set_option('display.max_columns', None)

dir = 'data'
path = os.path.join(dir, '**/*.json')
file_list = glob.glob(path)
print('Total number of channels ' + str(len(file_list)))
# print(file_list)
dfs = list()

for file in file_list:
    with open(file, 'r') as f:
        data = json.load(f)
    channel_id, stats = data.popitem()
    pchannel_stats = stats["channel_statistics"]
    video_stats = stats["video_data"]
    # sorted_vids = sorted(video_stats.items(), key=lambda item: int(item[1]["viewCount"]),reverse=True)
    vids = video_stats.items()
    stats = []
    for vid in vids:
        video_id = vid[0]
        title = vid[1]["title"]
        try:
            views = vid[1]["viewCount"]
            likes = vid[1]["likeCount"]
            duration = vid[1]['duration']
            tags = vid[1]['tags']
            description = vid[1]['description']
            comments = vid[1]["commentCount"]
            channel = vid[1]['channelTitle']
            published = vid[1]['publishedAt'].split('T')[0]
        except:
            pass
        cat = os.path.dirname(file).split('\\')[1]
        stats.append([title,views, published, likes, comments, duration, tags, description, channel, cat])
    vid_df = pd.DataFrame(stats, columns=["title","views", 'published',"likes","comments", 'duration','tag','description', 'channel', 'category'])
    dfs.append(vid_df)
df = pd.concat(dfs, ignore_index=True)
print('Total number of videos ' + str(df.shape[0]))


Total number of channels 105
Total number of videos 40308


Data Cleaning

In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40308 entries, 0 to 40307
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   title         40308 non-null  object        
 1   views         40308 non-null  object        
 2   published     40308 non-null  datetime64[ns]
 3   likes         40308 non-null  object        
 4   comments      40308 non-null  object        
 5   duration      40308 non-null  object        
 6   tag           40308 non-null  object        
 7   description   40308 non-null  object        
 8   channel       40308 non-null  object        
 9   category      40308 non-null  object        
 10  title_vector  40308 non-null  object        
dtypes: datetime64[ns](1), object(10)
memory usage: 3.4+ MB


In [91]:
# convert published date into datetime
df.published = pd.to_datetime(df.published, format='%Y-%m-%d')

# convert duration into minute with regex

df.sample(5)

Unnamed: 0,title,views,published,likes,comments,duration,tag,description,channel,category
30397,The Mysterious and Powerful Force of Gravity,244713,2015-07-28,2375,300,PT3M55S,"[science documentary, how the universe works, ...",Experts explain how gravity has the ability to...,Science Channel,Science
39804,Beach Bums and Thighs Yoga Workout,79783,2012-11-16,344,452,PT4M47S,"[tara stiles, beach, body, weight loss, butt, ...",Check out http://www.tarastilesdvd.com for my ...,Tara Stiles : Yoga that Feels Like You,Yoga
1907,FAKE BOOTY PRANK,9587355,2013-05-19,164514,9291,PT2M47S,"[prankvsprank, prank vs prank, pvp, prank, pra...",Another New Prank - http://bit.ly/19RV9mA\nSho...,Jesse,Comedy
22141,Spencer Tarring - Come On Now (Official Music ...,1090835,2013-12-07,10704,313,PT3M53S,"[Spencer, Tarring, Spinnin' Records (Record La...",Spencer Tarring presents Come On Now on Oxygen...,Spinnin' Records,Music
10771,1. Simple Harmonic Motion & Problem Solving In...,392048,2013-10-25,4537,217,PT1H16M17S,"[simple harmonic motion, spring, springs, osci...",View the complete OCW resource: http://ocw.mit...,MIT OpenCourseWare,Educational


convert text into Word2Vec

In [92]:
import pandas as pd
import gensim.downloader as api
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# import nltk; nltk.download('popular')

In [93]:
# tokenize function with df series as input to give a list of list result of each title
def df_token(df_text_series):
    # set of english stopwords
    stopset = set(stopwords.words('english'))
    token_item = []
    for t in df_text_series:
        tokens = [word for word in word_tokenize(t.lower()) if word not in stopset]
        token_item.append(tokens)
    return token_item

In [94]:
# grab the titles test function
titles = df.title
tokens = df_token(titles)
print(tokens[:5])

[['bart', 'baker', '-', '``', 'polydoge', "''", '(', 'official', 'music', 'video', ')'], ['wtf', 'happened', '?', '(', 'disapeared', ')'], ['6ix9ine', ',', 'nicki', 'minaj', ',', 'murda', 'beatz', '-', '“', 'fefe', '”', 'parody'], ['destroying', 'gold', 'youtube', 'play', 'button'], ['childish', 'gambino', '-', '``', 'america', "''", 'parody']]


We use gensim pretrained model called 'glove-wiki-gigaword-50' with the model trained on wikipedia dataset

In [95]:
# importing pretrained gensim model
model_wv = api.load('glove-wiki-gigaword-50')

In [96]:
# function to convert token to word vector with word2vec finding average vector for the document
def gen_word_vec(token_list, wordvec):
    word_vector = []
    for token in token_list:
        token_considered = [t for t in token if t.isalpha]
        token_vocab = [i for i in token_considered if i in wordvec.key_to_index]
        if len(token_vocab) > 0:
            word_vector.append(np.mean(wordvec[token_vocab], axis=0))
        else:
            word_vector.append(np.zeros(wordvec.vector_size))
    word_vector = np.array(word_vector)
    return word_vector

In [98]:
title_word_vec = gen_word_vec(tokens, model_wv)
title_word_vec[:20]


array([[-1.95005745e-01,  5.75011432e-01,  7.11959079e-02,
         1.67576894e-01,  1.80390567e-01,  4.42986563e-02,
        -4.35588032e-01, -6.15372002e-01, -1.20653987e-01,
         7.48924986e-02, -6.01987131e-02,  3.03997815e-01,
        -5.17714858e-01,  1.68738395e-01,  5.86653829e-01,
        -9.74132940e-02,  6.32433444e-02,  5.35430089e-02,
        -2.42917091e-01, -7.51620084e-02, -2.25419998e-02,
         3.20320994e-01,  2.62795210e-01,  2.63738811e-01,
         1.15462961e-02, -1.36440492e+00, -5.56028008e-01,
        -1.11142017e-01, -1.25832081e-01, -5.06733656e-01,
         2.88085389e+00, -2.06901878e-01, -2.90320516e-01,
         2.98981480e-02, -2.01695591e-01, -1.10577606e-01,
         3.54725093e-01, -3.09377968e-01,  3.14899027e-01,
         1.17930025e-02,  3.03121328e-01,  5.81887007e-01,
        -1.25411302e-01, -4.84923899e-01,  5.93050048e-02,
         5.14921062e-02, -2.15581115e-02, -2.47536093e-01,
        -1.12842999e-01,  6.92350030e-01],
       [-2.20

In [99]:
# add this vector to the original df and see the similarity within the channel
df['title_vector'] = title_word_vec.tolist()
df.sample(3)

Unnamed: 0,title,views,published,likes,comments,duration,tag,description,channel,category,title_vector
6083,Christina Makes Nasi Lemak at Kopitiam | From ...,575383,2019-10-11,14336,1037,PT15M36S,"[from the test kitchen, christina chaey, chris...",In this very special edition of From OUTSIDE t...,Bon Appétit,Cooking,"[-0.0498967207968235, 0.10870454460382462, -0...."
39948,Day 2 - Arrive | BREATH - A 30 Day Yoga Journey,3227291,2021-01-03,60357,5645,PT37M36S,"[home yoga, home yoga practice, yoga at home, ...",With each breath we have the opportunity to be...,Yoga With Adriene,Yoga,"[0.03898446261882782, 0.6734285950660706, 0.05..."
28152,North Korean Film Madness (Documentary | Part ...,1974498,2012-10-05,21247,1629,PT9M6S,"[x2, film, journalism, North Korea, spike jonz...",We went to North Korea to try and penetrate th...,VICE,News,"[-0.2264936864376068, 0.6458989977836609, -0.1..."
