### Data Extraction
Extract the essential fields from the JSON files for the analysis:
"title","views", "likes","comments", 'duration','tag','description', 'channel', 'category', 'published'

In [1]:
#extracting json to df
import os, json, re
import enum
import pandas as pd
import numpy as np
import glob
from datetime import datetime, timedelta, timezone
from itertools import groupby
import warnings

import pandas as pd
import gensim.downloader as api
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize ,word_tokenize
from nltk.corpus import stopwords
# import nltk; nltk.download('popular')

import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

warnings.simplefilter(action='ignore')
pd.set_option('display.max_columns', None)


KeyboardInterrupt: 

In [None]:
# Load every JSON file found at ./<rootdir>/<topic>/<file> into two tables:
#   overall_stats_raw_df - one row per channel (channel-level statistics)
#   raw_df               - one row per video, restricted to feature_list
rootdir = 'data'
feature_list = ['publishedAt', 'title', 'channelId', 'description',
                'channelTitle', 'tags', 'categoryId', 'viewCount',
                'likeCount', 'favoriteCount', 'commentCount', 'duration',
                'definition', 'contentRating', 'topicCategories', 'topicLabel']

ov_dict_list = []  # channel-level statistics, one dict per channel
dict_list = []     # video-level metadata, one dict per video

for path in glob.glob(f'./{rootdir}/*/*'):
  # The name of the folder containing the file is used as the topic label.
  # os.path handles both "/" and "\\" separators, so this works on any OS.
  topic_label = os.path.basename(os.path.dirname(path))
  try:
    with open(path, "r", encoding="utf-8") as read_file:
      data = json.load(read_file)
    for channel_id, channel_data in data.items():
      ov_temp_dict = channel_data["channel_statistics"]
      video_data = channel_data["video_data"]
      # The channel name is read from the first video of the channel.
      first_video = next(iter(video_data.values()))
      ov_temp_dict["channelName"] = first_video["channelTitle"]
      ov_dict_list.append(ov_temp_dict)

      for temp_dict in video_data.values():
        temp_dict["topicLabel"] = topic_label
        dict_list.append(temp_dict)
  except (OSError, ValueError, KeyError, TypeError, AttributeError, StopIteration) as err:
    # Best effort: an unreadable or malformed file is skipped, but reported
    # instead of being silently ignored.
    print(f"Skipping {path}: {type(err).__name__}: {err}")

overall_stats_raw_df = pd.DataFrame(ov_dict_list).drop(
    columns="hiddenSubscriberCount", errors="ignore")

raw_df = pd.DataFrame(dict_list)
raw_df = raw_df[feature_list]

In [None]:
# (number of videos, number of selected features)
raw_df.shape

In [None]:
# Number of distinct channel titles (a missing value, if any, counts as one entry)
len(raw_df.channelTitle.unique())

### Data Cleaning

In [None]:
def duration_split(duration):
  """Yield the alternating alphabetic / non-alphabetic runs of ``duration``.

  For example ``"PT51M12S"`` yields ``"PT", "51", "M", "12", "S"``.
  When ``duration`` is not iterable (e.g. NaN or None) a single ``np.nan``
  is yielded instead.
  """
  try:
    for _, run in groupby(duration, str.isalpha):
      yield ''.join(run)
  except TypeError:
    yield np.nan


# Seconds represented by one unit of each supported duration designator.
_SECONDS_PER_UNIT = {'W': 7*24*60*60, 'D': 24*60*60, 'H': 60*60, 'M': 60, 'S': 1}


def duration_2_secs(duration, duration_split=duration_split):
  """Convert an ISO-8601 style duration such as ``"PT1H2M3S"`` to seconds.

  Every number is multiplied by the unit letter that follows it, so the
  result no longer depends on the length of the string (``"PT45S"`` is 45
  seconds, ``"PT1H2M3S"`` is 3723 seconds).

  Parameters
  ----------
  duration : str
      Duration string; weeks, days, hours, minutes and seconds are supported.
  duration_split : callable
      Tokenizer yielding alternating unit / number strings.

  Returns
  -------
  float or int
      Total seconds; 0 when nothing could be parsed (e.g. a missing value).
  """
  total = 0
  pending = None     # last number seen, waiting for its unit
  in_time = False    # True once the "T" (time part) designator was seen

  for token in duration_split(duration):
    if not isinstance(token, str):
      # np.nan sentinel from duration_split: nothing to parse
      break

    if token.isalpha():
      unit = token[0].upper()
      # "M" before the "T" designator means months, which have no fixed length
      is_month = unit == 'M' and not in_time
      if pending is not None and not is_month:
        total += pending * _SECONDS_PER_UNIT.get(unit, 0)
      pending = None
      if 'T' in token.upper():
        in_time = True
    else:
      try:
        pending = float(token)
      except ValueError:
        pending = None

  return(total)


# duration_2_secs("HR1PT51M12S")


In [None]:
def topic_extract(links_list):
  """Return the lower-cased last path segment of every link in ``links_list``.

  Example: ``["https://en.wikipedia.org/wiki/Music"]`` -> ``["music"]``.

  Returns ``None`` when ``links_list`` is not an iterable of strings
  (e.g. NaN or None), which marks the value as missing.
  """
  try:
    return [link.split("/")[-1].lower() for link in links_list]
  except (TypeError, AttributeError):
    # not iterable, or an element without string methods
    return None

In [None]:
def text_prep(val):
  '''Normalise a text field and return its non-stop-word tokens.

  Hyperlinks are replaced by the placeholder word "url", because
  vectorizing them could be misleading and could also leak data.
  Everything that is not a letter is then turned into a single space,
  the text is tokenized and English stop words are removed.
  '''
  text = str(val).lower()
  # Replace only the link itself (up to the next whitespace); a greedy ".*"
  # would also swallow the remainder of the line after the link.
  text = re.sub(r'https?://\S+', 'url', text)
  text = re.sub(r'[^a-z]+', ' ', text)

  # Load the stop-word list once instead of once per token.
  stop_words = set(stopwords.words('english'))

  return [word for word in nltk.word_tokenize(text) if word not in stop_words]



In [None]:
def create_dataset(df, featureset, primary=True):
  """Derive typed / engineered columns and return the requested subset.

  Parameters
  ----------
  df : pandas.DataFrame
      Video-level table when ``primary`` is True, channel-level table
      otherwise. The input frame is not modified.
  featureset : list of str
      Columns to return.
  primary : bool, default True
      Selects which set of derived columns is computed.

  Returns
  -------
  pandas.DataFrame
      ``featureset`` columns of the processed copy.
  """
  # Work on a copy so the caller's frame is untouched and the cell calling
  # this function can be re-run safely.
  df = df.copy()

  if primary:
    # utc=True parses full timestamps and yields timezone-aware values, which
    # is required for the subtraction from the timezone-aware "now" below.
    df["publishedAt"] = pd.to_datetime(df["publishedAt"], utc=True)
    df["publishedDayDelta"] = (datetime.now(timezone.utc) - df["publishedAt"]).dt.days

    for col in ("categoryId", "viewCount", "likeCount", "favoriteCount", "commentCount"):
      df[col] = df[col].astype(float)

    df["duration_secs"] = df["duration"].apply(duration_2_secs)
    df["topicCategories"] = df["topicCategories"].apply(topic_extract)
    df["channelTitle"] = df["channelTitle"].str.lower()
    df["topicLabel"] = df["topicLabel"].str.lower()
    df["log_duration_secs"] = np.log1p(df["duration_secs"])

  else:
    df["ov_viewCount"] = df["viewCount"].astype(float)
    df["ov_subscriberCount"] = df["subscriberCount"].astype(float)
    df["ov_videoCount"] = df["videoCount"].astype(float)
    df["channelName"] = df["channelName"].str.lower()

  return df[featureset]

In [None]:
# Columns kept for the analysis, in the order they appear in prep_df.
features = [
  'title',
  'description',
  'channelTitle',
  'categoryId',
  'viewCount',
  'log_duration_secs',
  'topicCategories',
  'publishedDayDelta',
  'topicLabel',
]

# Build the video-level working table (primary=True is the default).
prep_df = create_dataset(df=raw_df, featureset=features)

In [None]:
# Percentage of missing values per column
pct_na = prep_df.isna().sum()/len(prep_df)*100
pct_na

In [None]:
# Percentage of rows that have no missing value in any column
pct_dropna = len(prep_df.dropna())/len(prep_df)*100
pct_dropna

In [None]:
# Drop incomplete rows and renumber the index 0..n-1, so that later
# index-based assignments line up with positionally built arrays/frames.
prep_df = prep_df.dropna().reset_index(drop=True)

In [None]:
# Pairwise scatter plots / distributions of the numeric features
sns.pairplot(prep_df[['categoryId', 'viewCount', 'log_duration_secs', 'publishedDayDelta']])

In [None]:
# Correlation heatmap of the numeric columns only: prep_df also holds text and
# list columns, on which DataFrame.corr() raises in recent pandas versions.
sns.heatmap(prep_df.select_dtypes(include='number').corr(), cmap='rocket_r', annot=True)

### Data Mining
Convert the text fields into word vectors.

We use the pretrained `glove-wiki-gigaword-50` model, loaded through gensim's downloader API, to map each token to a vector.

In [None]:
# Load the pretrained 'glove-wiki-gigaword-50' vectors through gensim's
# downloader API; they are used to convert text tokens into vectors.
model_wv = api.load('glove-wiki-gigaword-50')

In [None]:
# function to convert token to word vector with word2vec finding average vector for the document
# tokenize function with df series as input to give a list of list result of each title
def df_token(df_text_series):
    """Tokenize every entry of ``df_text_series`` and drop English stop words.

    Entries that are lists are first joined into one space-separated string.
    Entries that cannot be tokenized (e.g. NaN, None) produce an empty token
    list, so the result always has one list per input entry.

    Returns
    -------
    list of list of str
    """
    # set of english stopwords
    stopset = set(stopwords.words('english'))
    token_item = []
    for t in df_text_series:
        try:
            if isinstance(t, list):
                t = ' '.join(t)
            tokens = [word for word in word_tokenize(t.lower()) if word not in stopset]
        except (AttributeError, TypeError):
            # the original "tokens == None" was a no-op comparison that re-used
            # the previous entry's tokens; an empty list is the explicit fallback
            tokens = []
        token_item.append(tokens)
    return token_item

def gen_word_vec(df_text_series, wordvec):
    """Return one averaged word vector per entry of ``df_text_series``.

    Each entry is tokenized with ``df_token``; purely alphabetic tokens that
    exist in ``wordvec.key_to_index`` are looked up and averaged. Entries
    without any such token get a zero vector of length ``wordvec.vector_size``.

    Returns
    -------
    numpy.ndarray
        Array with one row per entry.
    """
    word_vector = []
    for tokens in df_token(df_text_series):
        # str.isalpha must be *called*; the bare attribute is always truthy
        token_vocab = [tok for tok in tokens
                       if tok.isalpha() and tok in wordvec.key_to_index]
        if token_vocab:
            word_vector.append(np.mean(wordvec[token_vocab], axis=0))
        else:
            word_vector.append(np.zeros(wordvec.vector_size))
    return np.array(word_vector)

features to work with

In [None]:
# Averaged word vector for each text field of every video
title_word_vec = gen_word_vec(prep_df.title, model_wv)
desc_word_vec = gen_word_vec(prep_df.description, model_wv)
topic_word_vec = gen_word_vec(prep_df.topicCategories, model_wv)
label_word_vec = gen_word_vec(prep_df.topicLabel, model_wv)

In [None]:
# Check the shape of the array of averaged title word vectors
title_word_vec.shape

extracting cosine similarity

In [None]:
# Average cosine similarity of every title vector to all title vectors
# (itself included).
# With u_i the L2-normalised rows, mean_j cos(i, j) = u_i . mean_j(u_j), so the
# full N x N similarity matrix never has to be built. All-zero rows stay zero,
# i.e. their similarity to everything is 0.
A = title_word_vec
row_norms = np.linalg.norm(A, axis=1, keepdims=True)
unit_rows = np.divide(A, row_norms, out=np.zeros(A.shape, dtype=float), where=row_norms > 0)
avg_cosine_dask = list(unit_rows @ unit_rows.mean(axis=0))

Reduce dimensionality using PCA

In [None]:
# Compress every averaged word-vector matrix to its first principal component.
# The same estimator is refitted for each matrix, in the order listed.
pca = PCA(n_components=1)
title_pc, desc_pc, topic_pc, label_pc = (
    pca.fit_transform(vectors)
    for vectors in (title_word_vec, desc_word_vec, topic_word_vec, label_word_vec)
)

In [None]:
# Working copy of prep_df in which the text columns are represented by the
# first principal component of their word vectors.
vec_df = prep_df.assign(
    titleVec=title_pc,
    label=label_pc,
    description=desc_pc,
    topicCategories=topic_pc,
    avgTitleCosine=avg_cosine_dask,
)

In [None]:
# Three random rows as a sanity check
vec_df.sample(3)

In [None]:
# Numeric feature table: remove the remaining raw text columns
view_vec = vec_df.drop(columns=['channelTitle', 'title', 'topicLabel'])

In [None]:
# Three random rows as a sanity check
view_vec.sample(3)

In [None]:
norm_view_vec = vec_df.copy()
# Views per day since publication; the day count is floored at 1 so that a
# video published less than a day ago does not cause a division by zero.
norm_view_vec['averageView'] = norm_view_vec.viewCount / norm_view_vec.publishedDayDelta.clip(lower=1)
# Views per day relative to the median of the video's topic label.
# transform() returns a result aligned with the original rows, whereas
# groupby.apply may prepend the group key to the index.
topic_median = norm_view_vec.groupby('topicLabel').averageView.transform('median')
norm_view_vec['normalizedScore'] = norm_view_vec.averageView / topic_median
norm_view_vec = norm_view_vec.drop(
    columns=['channelTitle', 'viewCount', 'averageView', 'title', 'topicLabel'])

In [None]:
# Three random rows as a sanity check
norm_view_vec.sample(3)

### Unsupervised Machine Learning

In [None]:
# Standardise both feature tables before PCA / clustering.
# The single scaler is fitted twice, so it ends up fitted on norm_view_vec.
scaler = StandardScaler()
scale_view_vec = scaler.fit(view_vec).transform(view_vec)
scale_norm_view_vec = scaler.fit(norm_view_vec).transform(norm_view_vec)

In [None]:
# Scaled array wrapped in a DataFrame that carries the original column names
scale_norm_view_df = pd.DataFrame(scale_norm_view_vec, columns=norm_view_vec.columns)

In [None]:
# Pairwise scatter plots / distributions of the scaled features
sns.pairplot(scale_norm_view_df)

In [None]:
# Correlation heatmap of the scaled features
fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(ax=ax, data=scale_norm_view_df.corr(), cmap='coolwarm_r', annot=True)

In [None]:
# create biplot source from https://sukhbinder.wordpress.com/2015/08/05/biplot-with-python/ + unsupervise learning class
def biplot(score, coeff, maxdim, pcax=1, pcay=2, labels=None):
    '''
    Draw a PCA biplot: the observations as points, the variable loadings as arrows.

    score: pca fit_transform output (observations x components)
    coeff: components.transpose (variables x components)
    maxdim: maximum number of variables (arrows) to draw
    pcax, pcay: 1-based numbers of the components shown on the x and y axis
    labels: optional names for the arrows; "Var<i>" is used when omitted
    '''
    pca1 = pcax - 1
    pca2 = pcay - 1
    xs = score[:, pca1]
    ys = score[:, pca2]
    n = min(coeff.shape[0], maxdim)
    # rescale the scores so that each axis spans a range of 2
    scalex = 2.0 / (xs.max() - xs.min())
    scaley = 2.0 / (ys.max() - ys.min())
    # arrow labels are placed slightly beyond the arrow tips
    text_scale_factor = 1.5
    plt.figure(figsize=(12, 8))
    plt.scatter(xs * scalex, ys * scaley, s=1)
    for i in range(n):
        plt.arrow(0, 0, coeff[i, pca1], coeff[i, pca2], color='r', alpha=0.5)
        # both coordinates use the same factor so the label stays on the
        # arrow's direction (the y coordinate was previously left unscaled
        # when labels were given)
        text_x = coeff[i, pca1] * text_scale_factor
        text_y = coeff[i, pca2] * text_scale_factor
        if labels is None:
            plt.text(text_x, text_y, "Var" + str(i + 1), color='k', ha='center', va='center')
        else:
            plt.text(text_x, text_y, labels[i], color='m', ha='center', va='center')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(pcax))
    plt.ylabel("PC{}".format(pcay))
    plt.grid()

In [None]:
# Biplot of the first two principal components of the scaled table that
# still contains viewCount.
score_pca = PCA(n_components=2)
score = score_pca.fit_transform(scale_view_vec)
coeff = score_pca.components_.T
labels = view_vec.columns
biplot(score, coeff, maxdim=12, labels=labels)

In [None]:
# Biplot for the table that uses normalizedScore: fit an 8-component PCA and
# show its first two components.
score_pca = PCA(n_components=8)
score_pca.fit(scale_norm_view_vec)
score = score_pca.transform(scale_norm_view_vec)
coeff = score_pca.components_.T
labels = scale_norm_view_df.columns
biplot(score, coeff, maxdim=12, labels=labels);

In [None]:
# Scree plot: share of variance explained by each principal component
PC_values = np.arange(1, score_pca.n_components_ + 1)
plt.figure(figsize=(8, 6))
plt.plot(
    PC_values,
    score_pca.explained_variance_ratio_,
    'o-',
    linewidth=2,
    color='blue',
)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

In [None]:
# Total share of variance captured by the fitted components
score_pca.explained_variance_ratio_.sum()

In [None]:
# Explore K-Means clustering of the scaled feature matrix.
# First, look for a suitable number of clusters with the elbow method.

# inertia (within-cluster sum of squared distances) for every candidate k
sse=[] # sum of square error
list_k = list(range(1, 50))

X = scale_norm_view_vec
# NOTE: the loop variable `k` stays defined after the loop (last value 49)
for k in list_k:
    km = MiniBatchKMeans(n_clusters=k, init='k-means++', max_iter=300, random_state=0, batch_size=2048)
    km.fit(X)
    sse.append(km.inertia_)

plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');

In [None]:
# Elbow analysis with yellowbrick's KElbowVisualizer (imported at the top of
# the notebook).
X = scale_norm_view_vec
# n_clusters is deliberately not passed: the visualizer sets it for every k it
# evaluates, and this cell must not depend on a loop variable left over from
# another cell.
model = MiniBatchKMeans(init='k-means++', max_iter=300, random_state=0, batch_size=2048)
visualizer = KElbowVisualizer(model, k=(3,13))
visualizer.fit(X)
visualizer.show()

The elbow method suggests that the optimal number of K-Means clusters is k = 9, so let's explore that further.

In [None]:
# Silhouette plot for k = 9 with yellowbrick.
# Uses the feature matrix X assigned in an earlier cell.
from yellowbrick.cluster import SilhouetteVisualizer
model_k = MiniBatchKMeans(n_clusters=9, init='k-means++', max_iter=300, random_state=0, batch_size=2048)
visualizer_silhouette = SilhouetteVisualizer(model_k, colors='yellowbrick')
visualizer_silhouette.fit(X)
visualizer_silhouette.show()

In [None]:
# Silhouette analysis for several k, based on the sklearn documentation page
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
# set the dataset to be working with
X = scale_norm_view_vec

# The 2-D PCA projection used for the scatter plot does not depend on k,
# so it is computed once instead of once per iteration.
pca_projection = pd.DataFrame(PCA(n_components=2).fit_transform(X),
                              columns=['component1', 'component2'])

for k in [5, 6, 7, 8, 9, 10, 11]:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Run the Kmeans algorithm
    kmeans = MiniBatchKMeans(n_clusters=k, init='k-means++', max_iter=300, random_state=0, batch_size=2048)
    labels = kmeans.fit_predict(X)

    # Get silhouette samples
    silhouette_vals = silhouette_samples(X, labels)

    # Silhouette plot: one horizontal band of sorted values per cluster
    y_lower = 0
    for cluster_pos, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = np.sort(silhouette_vals[labels == cluster])
        y_upper = y_lower + len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(cluster_pos + 1))
        y_lower = y_upper

    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02);

    # Scatter plot of the PCA projection colored with the cluster labels
    ax2.scatter(pca_projection['component1'], pca_projection['component2'], c=labels)
    ax2.set_xlabel('Component 1')
    ax2.set_ylabel('Component 2')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')
    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis using k = {k}',
                 fontsize=16, fontweight='semibold', y=1.05);

In [None]:
# sample data to plot with text and for clustering
norm_df = pd.DataFrame(scale_norm_view_vec)
# Titles are attached by position: norm_df has a fresh 0..n-1 index, so an
# index-aligned assignment from vec_df could pair rows with the wrong title.
norm_df['title'] = vec_df.title.to_numpy()
# NOTE(review): ':-2' removes 'title' AND the last scaled feature column, so
# that feature is not used for clustering - confirm this is intended.
cluster_X = norm_df.iloc[:,:-2]
kmeans = MiniBatchKMeans(n_clusters=9, init='k-means++', max_iter=300, random_state=0, batch_size=2048)
labels = kmeans.fit_predict(cluster_X)
norm_df['label'] = labels
# sample up to 1000 rows; seeded so the figure is reproducible
target_df = norm_df.sample(min(1000, len(norm_df)), random_state=0)
# same feature columns as cluster_X (drops the last feature, 'title', 'label')
X = target_df.iloc[:,:-3]
labels = target_df.iloc[:,-1]


In [None]:
# PCA showed promising clusters; try a 2-D t-SNE embedding with title labels
# Initialize t-SNE
tsne = TSNE(n_components = 2, init = 'random', random_state = 0, perplexity = 75)

# embed the sampled points with t-SNE and color them by cluster label
tsne_df = pd.DataFrame(tsne.fit_transform(X), columns = ['component1','component2'])
fig, ax = plt.subplots(figsize = (14, 10))
ax.scatter(tsne_df['component1'], tsne_df['component2'], c=labels)

# use adjustText to help position the text
from adjustText import adjust_text

texts = []
# annotate every 50th sampled point with its title
titles_to_plot = list(np.arange(0, len(tsne_df), 50))

# Append words to list
for row in titles_to_plot:
    texts.append(plt.text(tsne_df.iloc[row, 0], tsne_df.iloc[row, 1], target_df.title.iloc[row], fontsize = 9))

# Plot text using adjust_text
adjust_text(texts, force_points = 0.4, force_text = 0.5,
            expand_points = (2,1), expand_text = (1,2),
            arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))

plt.show()


In [None]:
# Rebuild the clustered table and draw the sample used by the 2-D PCA plot
norm_df = pd.DataFrame(scale_norm_view_vec)
# Titles are attached by position: norm_df has a fresh 0..n-1 index, so an
# index-aligned assignment from vec_df could pair rows with the wrong title.
norm_df['title'] = vec_df.title.to_numpy()
# NOTE(review): ':-2' removes 'title' AND the last scaled feature column, so
# that feature is not used for clustering - confirm this is intended.
cluster_X = norm_df.iloc[:,:-2]
kmeans = MiniBatchKMeans(n_clusters=9, init='k-means++', max_iter=300, random_state=0, batch_size=2048)
labels = kmeans.fit_predict(cluster_X)
norm_df['label'] = labels
# sample up to 1000 rows; seeded so the figure is reproducible
target_df = norm_df.sample(min(1000, len(norm_df)), random_state=0)
# same feature columns as cluster_X (drops the last feature, 'title', 'label')
X = target_df.iloc[:,:-3]
labels = target_df.iloc[:,-1]

In [None]:
# 2-D PCA projection of the sampled points with title labels
# Initialize PCA
pca = PCA(n_components = 2)

# project the sampled points with PCA and color them by cluster label
pca_df = pd.DataFrame(pca.fit_transform(X), columns = ['component1','component2'])
fig, ax = plt.subplots(figsize = (14, 10))
ax.scatter(pca_df['component1'], pca_df['component2'], c=labels)

# use adjustText to help position the text
from adjustText import adjust_text

texts = []
# annotate every 50th sampled point with its title
titles_to_plot = list(np.arange(0, len(pca_df), 50))

# Append words to list
for row in titles_to_plot:
    texts.append(plt.text(pca_df.iloc[row, 0], pca_df.iloc[row, 1], target_df.title.iloc[row], fontsize = 9))

# Plot text using adjust_text
adjust_text(texts, force_points = 0.4, force_text = 0.5,
            expand_points = (2,1), expand_text = (1,2),
            arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))
plt.title('PCA clustering a 1,000 sampled data points with its text title')
plt.show()


In [None]:
# try 3-D representation with t_SNE
from wsgiref.headers import tspecials
from mpl_toolkits.mplot3d import Axes3D
# to create interactive 3D plot
%matplotlib widget

# Initialize t-SNE with 3 components for 3-dimensional plot
tsne = TSNE(n_components = 3, init = 'random', random_state = 0, perplexity = 75)

# flatten the data and clustering then plot the cluster with t-SNE
tsne_df = pd.DataFrame(tsne.fit_transform(X), columns = ['component1','component2', 'component3'])
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(tsne_df['component1'], tsne_df['component2'], tsne_df['component3'], c=labels)

plt.show()

In [None]:
# Rebuild the clustered table and draw the sample used by the 3-D PCA plot
norm_df = pd.DataFrame(scale_norm_view_vec)
# Titles are attached by position: norm_df has a fresh 0..n-1 index, so an
# index-aligned assignment from vec_df could pair rows with the wrong title.
norm_df['title'] = vec_df.title.to_numpy()
# NOTE(review): ':-2' removes 'title' AND the last scaled feature column, so
# that feature is not used for clustering - confirm this is intended.
cluster_X = norm_df.iloc[:,:-2]
kmeans = MiniBatchKMeans(n_clusters=9, init='k-means++', max_iter=300, random_state=0, batch_size=2048)
labels = kmeans.fit_predict(cluster_X)
norm_df['label'] = labels
# sample up to 1000 rows; seeded so the figure is reproducible
target_df = norm_df.sample(min(1000, len(norm_df)), random_state=0)
# same feature columns as cluster_X (drops the last feature, 'title', 'label')
X = target_df.iloc[:,:-3]
labels = target_df.iloc[:,-1]

In [None]:
# try 3-D representation with PCA

# Initialize PCA with 3 components for 3-dimensional plot
pca = PCA(n_components = 3)

# project the sampled points with PCA and color them by cluster label
pca_df = pd.DataFrame(pca.fit_transform(X), columns = ['component1','component2', 'component3'])
fig = plt.figure()
# add_subplot attaches the 3-D axes to the figure; constructing Axes3D(fig)
# directly does not add the axes on recent matplotlib versions.
ax = fig.add_subplot(projection='3d')
ax.scatter(pca_df['component1'], pca_df['component2'], pca_df['component3'], c=labels)

plt.show()