In [11]:
import pandas as pd
import json
import numpy as np
import spacy
import nltk
import re
import gensim

In [12]:
def splitTags(tag_list):
    tag_list = tag_list.split('|')
    output = ''
    for tag in tag_list:
        output += tag
    return output

In [13]:
#get rid of the punctuations and set all characters to lowercase
RE_PREPROCESS = r'\W+|\d+' #the regular expressions that matches all non-characters

#get rid of punctuation and make everything lowercase
#the code belows works by looping through the array of text
#for a given piece of text we invoke the `re.sub` command where we pass in the regular expression, a space ' ' to
#subsitute all the matching characters with
#we then invoke the `lower()` method on the output of the re.sub command
#to make all the remaining characters
#the cleaned document is then stored in a list
#once this list has been filed it is then stored in a numpy array

In [14]:
def processFeatures(desc):
    try:
        return re.sub(RE_PREPROCESS, ' ', desc)
    except:
        return " "

In [15]:
def processDataFrame(data_frame, country_code='US'):
    data_frame.sort_values(by=['video_id', 'trending_date'], ascending=True, inplace=True)
    grouped_videos = data_frame.groupby(['video_id']).last().reset_index()
    
    #Reading categories from the json file depending on country_code
    json_location = './data/' + country_code +'_category_id.json'
    with open(json_location) as data_file:
        data = json.load(data_file)    
    categories = []
    for item in data['items']:
        category = {}
        category['category_id'] = int(item['id'])
        category['title'] = item['snippet']['title']
        categories.append(category)

    categories_df = pd.DataFrame(categories)
    # Merging videos data with category data
    final_df = grouped_videos.merge(categories_df, on = ['category_id'])
    final_df.rename(columns={'title_y': 'category', 'title_x': 'video_name'}, inplace=True)
    
    # Splitting the tags by pipe (|) character
    final_df['tags'] = final_df['tags'].apply(splitTags)
    
    # Creating a features column that consists all features used for prediction.
    final_df['video_features'] = final_df['tags'].astype(str) + final_df['video_name'].astype(str) \
                        + final_df['channel_title'].astype(str) + final_df['description'] + final_df['category']
        
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    return final_df

In [21]:
def processCorpus(feature_corpus):
    feature_corpus = [comment.lower() for comment in feature_corpus]
    parser = spacy.load('en')
    processed_feature_corpus = [parser(feature) for feature in feature_corpus]
    token_corpus = [nltk.word_tokenize(str(feature)) for feature in processed_feature_corpus]
    return token_corpus

In [105]:
def trainModel(token_corpus):
    model = gensim.models.Word2Vec(sentences=token_corpus, min_count=1, size = 32)
    model.train(token_corpus, total_examples=model.corpus_count, epochs=model.iter)
    model.save('word2vec_model.w2v')
    return model

In [116]:
def recommendTags(token_corpus, input_words = ['trump', 'president']):
    word2vec_model = gensim.models.Word2Vec.load('word2vec_model.w2v')
    tags = []
    try:
        tags = word2vec_model.most_similar(positive=input_words)
    except:
        token_corpus.append(input_words)
        word2vec_model.build_vocab(token_corpus, update=True)
        word2vec_model.train(token_corpus, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.iter)
        word2vec_model.save('word2vec_model.w2v')
        tags = word2vec_model.most_similar(positive=input_words)
    
    return token_corpus, tags

Running the algorithm for US videos

In [85]:
us_videos_df = pd.read_csv('./data/USvideos.csv')
us_final_df = processDataFrame(us_videos_df, country_code='US')

In [86]:
ca_videos_df = pd.read_csv('./data/CAvideos.csv')
ca_final_df = processDataFrame(ca_videos_df, country_code='CA')

In [87]:
gb_videos_df = pd.read_csv('./data/GBvideos.csv')
gb_final_df = processDataFrame(gb_videos_df, country_code='GB')

In [75]:
us_ca_gb_final_df = pd.concat([us_final_df, ca_final_df, gb_final_df])

In [88]:
us_ca_gb_token_corpus = processCorpus(us_ca_gb_final_df['video_features'].values)



    Only loading the 'en' tokenizer.



In [89]:
len(us_ca_gb_token_corpus)

3029

In [108]:
trainModel(us_ca_gb_token_corpus)

<gensim.models.word2vec.Word2Vec at 0x2004aa02d68>

In [117]:
us_ca_gb_token_corpus, tags = recommendTags(us_ca_gb_token_corpus, input_words=['trump', 'president','karan'])

In [118]:
tags

[('putin', 0.8798644542694092),
 ('leann', 0.8748385906219482),
 ('donald', 0.8641480803489685),
 ('administration', 0.8589312434196472),
 ('cumbre', 0.8583245873451233),
 ('verifies', 0.856847882270813),
 ('meddling', 0.8502160906791687),
 ('roa', 0.84947669506073),
 ('llegó', 0.8474310636520386),
 ('firestorm', 0.8454605340957642)]