In [25]:
import pandas as pd
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import gensim
import math

In [26]:
global AVG_TAGS_PER_VIDEO, US_CA_GB_TOKEN_CORPUS, US_VIDEOS_DF, US_FINAL_DF
global CA_VIDEOS_DF, CA_FINAL_DF, GB_VIDEOS_DF, GB_FINAL_DF, US_CA_GB_FINAL_DF

In [27]:
#get rid of the punctuations and set all characters to lowercase
RE_PREPROCESS = r'\W+|\d+' #the regular expressions that matches all non-characters

#get rid of punctuation and make everything lowercase
#the code belows works by looping through the array of text
#for a given piece of text we invoke the `re.sub` command where we pass in the regular expression, a space ' ' to
#subsitute all the matching characters with
#we then invoke the `lower()` method on the output of the re.sub command
#to make all the remaining characters
#the cleaned document is then stored in a list
#once this list has been filed it is then stored in a numpy array

In [28]:
RE_REMOVE_URLS = r'http\S+'

In [29]:
def processFeatures(desc):
    try:
        desc = re.sub(RE_REMOVE_URLS, ' ', desc)
        return re.sub(RE_PREPROCESS, ' ', desc)
    except:
        return " "

In [30]:
def processDataFrame(data_frame, country_code='US'):
    data_frame.sort_values(by=['video_id', 'trending_date'], ascending=True, inplace=True)
    grouped_videos = data_frame.groupby(['video_id']).last().reset_index()
    
    #Reading categories from the json file depending on country_code
    json_location = './data/' + country_code +'_category_id.json'
    with open(json_location) as data_file:
        data = json.load(data_file)    
    categories = []
    for item in data['items']:
        category = {}
        category['category_id'] = int(item['id'])
        category['title'] = item['snippet']['title']
        categories.append(category)

    categories_df = pd.DataFrame(categories)
    # Merging videos data with category data
    final_df = grouped_videos.merge(categories_df, on = ['category_id'])
    final_df.rename(columns={'title_y': 'category', 'title_x': 'video_name'}, inplace=True)
    
    # Creating a features column that consists all features used for prediction.
    # Also creating a corpus column that consists of all data required to train the model.
    final_df['video_features'] = ''
    final_df['video_corpus'] = ''
    
    if final_df['video_name'].astype(str) is not None:
        final_df['video_features'] += final_df['video_name'].astype(str)

    if final_df['channel_title'].astype(str) is not None:
        final_df['video_features'] += final_df['channel_title'].astype(str)
        
    if final_df['description'].astype(str) is not None:
        final_df['video_features'] += final_df['description'].astype(str)
    
    final_df['video_corpus'] += final_df['video_features']
    if final_df['tags'].astype(str) is not None:
        final_df['video_corpus'] += final_df['tags'].astype(str)
    
        
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    final_df['video_corpus'] = final_df['video_corpus'].apply(processFeatures)
    return final_df

In [31]:
def removeStopwords(documents):
    stopwords_list = stopwords.words('english')
    processed_corpus = []
    for document in documents:
        processed_document = []
        for word in document.split():
            if word not in stopwords_list:
                processed_document.append(word)
        processed_corpus.append(processed_document)
    return processed_corpus

In [32]:
def processCorpus(feature_corpus):
    feature_corpus = [comment.lower() for comment in feature_corpus]
    processed_feature_corpus = removeStopwords(feature_corpus)
    return processed_feature_corpus

In [33]:
def trainModel(token_corpus):
    model = gensim.models.Word2Vec(sentences=token_corpus, min_count=1, size = 32)
    model.train(token_corpus, total_examples=model.corpus_count, epochs=model.iter)
    model.save('word2vec_model.w2v')
    return model

In [34]:
def recommendTags(word2vec_model, input_words = ['trump', 'president'], number_of_tags = 10, model_name = 'word2vec_model.w2v'):
    global US_CA_GB_TOKEN_CORPUS
    tags = []
    if word2vec_model is None:
        gensim.models.Word2Vec.load(model_name)
    try:
        tags = word2vec_model.most_similar(positive=input_words, topn=number_of_tags)
    except:
        US_CA_GB_TOKEN_CORPUS.append(input_words)
        word2vec_model.build_vocab(US_CA_GB_TOKEN_CORPUS, update=True)
        word2vec_model.train(US_CA_GB_TOKEN_CORPUS, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.iter)
        word2vec_model.save(model_name)
        tags = word2vec_model.most_similar(positive=input_words, topn=number_of_tags)
    
    return tags

In [35]:
def calculateAvgTagsPerVideo():
    total_tags = 0
    for tag_list in US_CA_GB_FINAL_DF['tags'].values:
        total_tags += len(tag_list.split('|'))
    return math.ceil(total_tags/len(US_CA_GB_FINAL_DF))

Running the algorithm for US, CA, and GB videos

In [58]:
def initializeAndFetchRecommendations(video_name = None, channel_title = None, video_category = None, description = None):
    global US_VIDEOS_DF, US_FINAL_DF, CA_VIDEOS_DF, CA_FINAL_DF, GB_VIDEOS_DF, GB_FINAL_DF
    global US_CA_GB_FINAL_DF, US_CA_GB_FINAL_DF, AVG_TAGS_PER_VIDEO, US_CA_GB_TOKEN_CORPUS
    US_VIDEOS_DF = pd.read_csv('./data/USvideos.csv')
    US_FINAL_DF = processDataFrame(US_VIDEOS_DF, country_code='US')
    
    CA_VIDEOS_DF = pd.read_csv('./data/CAvideos.csv')
    CA_FINAL_DF = processDataFrame(CA_VIDEOS_DF, country_code='CA')
    
    GB_VIDEOS_DF = pd.read_csv('./data/GBvideos.csv')
    GB_FINAL_DF = processDataFrame(GB_VIDEOS_DF, country_code='GB')
        
    US_CA_GB_FINAL_DF = pd.concat([US_FINAL_DF, CA_FINAL_DF, GB_FINAL_DF])
    US_CA_GB_FINAL_DF.reset_index(inplace=True)
    
    US_CA_GB_TOKEN_CORPUS = processCorpus(US_CA_GB_FINAL_DF['video_corpus'].values)
    US_CA_GB_FINAL_DF['video_features'] = processCorpus(US_CA_GB_FINAL_DF['video_features'].values)
    US_CA_GB_FINAL_DF['video_corpus'] = US_CA_GB_TOKEN_CORPUS
        
    AVG_TAGS_PER_VIDEO = calculateAvgTagsPerVideo()
    word2vec_model = trainModel(US_CA_GB_TOKEN_CORPUS)
    
    input_list = []
    if (video_name is not None or channel_title is not None or 
        video_category is not None or description is not None):
        frontEndInput = video_name + channel_title + video_category + description
        for word in frontEndInput.split(' '):
            if word not in stopwords.words('english'):
                input_list.append(word)
    
    if input_list != []:
        return recommendTags(word2vec_model, input_words=input_list, 
                         number_of_tags=AVG_TAGS_PER_VIDEO, 
                         model_name = 'word2vec_model.w2v')
    
    return recommendTags(word2vec_model, input_words=['trump', 'president'], 
                         number_of_tags=AVG_TAGS_PER_VIDEO, 
                         model_name = 'word2vec_model.w2v')
        
    

In [60]:
initializeAndFetchRecommendations(video_name = 'What is data science',
                                  channel_title = 'CNN', 
                                  video_category = 'Education', 
                                  description = 'Detailed description of data science')

[('carboniferous', 0.9188767671585083),
 ('rests', 0.914675772190094),
 ('designs', 0.9064610004425049),
 ('origin', 0.9061634540557861),
 ('stirring', 0.9047435522079468),
 ('agodoug', 0.9038694500923157),
 ('author', 0.9037408232688904),
 ('museums', 0.9030598998069763),
 ('lid', 0.9029368758201599),
 ('turtleneck', 0.9028455018997192),
 ('emphasis', 0.9028239846229553),
 ('bite', 0.9024524092674255),
 ('fuego', 0.9005969762802124),
 ('abandon', 0.8993898630142212),
 ('stung', 0.8990809917449951),
 ('metropolitan', 0.8984636068344116),
 ('understanding', 0.8974539041519165),
 ('somebody', 0.8973672389984131),
 ('deadly', 0.8972563743591309)]

## Dividing the dataset into training (80%) and testing sets (20%).

In [47]:
len(US_CA_GB_FINAL_DF)

3029

In [48]:
np.random.seed(seed=13579)
us_ca_gb_final_df_shuffled = US_CA_GB_FINAL_DF.iloc[np.random.permutation(len(US_CA_GB_FINAL_DF))]

In [49]:
train_size = 0.80
us_ca_gb_df_train = us_ca_gb_final_df_shuffled[:int((train_size)*len(us_ca_gb_final_df_shuffled))]
us_ca_gb_df_test = us_ca_gb_final_df_shuffled[int((train_size)*len(us_ca_gb_final_df_shuffled)):]

In [50]:
w2v_train_model = gensim.models.Word2Vec(sentences=us_ca_gb_df_train['video_corpus'], min_count=1, size = 32)
w2v_train_model.train(us_ca_gb_df_train['video_corpus'].values, total_examples=w2v_train_model.corpus_count, epochs=w2v_train_model.iter)
w2v_train_model.save('w2v_train_model.w2v')

In [51]:
w2v_train_model = gensim.models.Word2Vec.load('w2v_train_model.w2v')

In [52]:
predicted_tags = []
for idx in us_ca_gb_df_test.index:
    video_features = us_ca_gb_df_test.loc[idx, 'video_features']
    if len(video_features) > 0:
        tag_probability_list = recommendTags(w2v_train_model, input_words=video_features,
                                             number_of_tags=AVG_TAGS_PER_VIDEO, 
                                             model_name = 'w2v_train_model.w2v')
        predicted_tags.append([tag[0] for tag in tag_probability_list])

    

In [53]:
us_ca_gb_df_test['predicted_tags'] = predicted_tags

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
us_ca_gb_df_test['tags'] = us_ca_gb_df_test['tags'].apply(processFeatures)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [55]:
match_found = 0
count = 0
for idx in us_ca_gb_df_test.index:
    tag_list = us_ca_gb_df_test.loc[idx,'tags'].lower()
    tag_list = tag_list.split(' ')
    predicted_tag_list = us_ca_gb_df_test.loc[idx, 'predicted_tags']
    
    for i in range(len(tag_list)):
        if tag_list[i] in predicted_tag_list:
            match_found += 1
            break
    count += 1
print('Match found: ', match_found )
print('Accuracy: ', match_found/len(us_ca_gb_df_test))
    
    


Match found:  134
Accuracy:  0.22112211221122113


In [56]:
US_CA_GB_FINAL_DF

Unnamed: 0,index,video_id,trending_date,video_name,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category,video_features,video_corpus
0,0,-2RVw2_QyxQ,17.16.11,2017 Champions Showdown: Day 3,Saint Louis Chess Club,27,2017-11-12T02:39:01.000Z,"Chess|""Saint Louis""|""Club""",71089,460,27,20,https://i.ytimg.com/vi/-2RVw2_QyxQ/default.jpg,False,False,False,The Saint Louis Chess Club hosts a series of f...,Education,"[champions, showdown, day, saint, louis, chess...","[champions, showdown, day, saint, louis, chess..."
1,1,-oXybog2IuI,17.21.11,24 Facts about Koalas - mental_floss List Show...,Mental Floss,27,2017-11-15T16:00:00.000Z,"john green|""mental floss""|""koalas""|""marsupial""...",38775,1373,16,140,https://i.ytimg.com/vi/-oXybog2IuI/default.jpg,False,False,False,A weekly show where knowledge junkies get thei...,Education,"[facts, koalas, mental_floss, list, show, ep, ...","[facts, koalas, mental_floss, list, show, ep, ..."
2,2,16W7c0mb-rE,17.24.11,Emergence – How Stupid Things Become Smart Tog...,Kurzgesagt – In a Nutshell,27,2017-11-16T15:01:58.000Z,"emergence|""ants""|""intelligence""|""ant""|""sum of ...",2032821,124607,1183,8577,https://i.ytimg.com/vi/16W7c0mb-rE/default.jpg,False,False,False,How can many stupid things combine to form sma...,Education,"[emergence, stupid, things, become, smart, tog...","[emergence, stupid, things, become, smart, tog..."
3,3,5WUDfviiKRE,17.28.11,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,21342,107,312,201,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,辽阔的乌珠穆沁草原是摔跤手的摇篮。这里摔跤的传统源远流长，盛名至今不衰。康熙五年(1666年...,Education,"[二贵摔跤, tienghoa, nettina, nguyen辽阔的乌珠穆沁草原是摔跤手的...","[二贵摔跤, tienghoa, nettina, nguyen辽阔的乌珠穆沁草原是摔跤手的..."
4,4,8-u5nd2GqNE,17.24.11,The Secret Protocol for When the Queen Dies,Half as Interesting,27,2017-11-16T15:30:00.000Z,"the|""secret""|""protocal""|""procedure""|""process""|...",1145464,28690,887,5083,https://i.ytimg.com/vi/8-u5nd2GqNE/default.jpg,False,False,False,Raise money for charity just by browsing the i...,Education,"[secret, protocol, queen, dieshalf, interestin...","[secret, protocol, queen, dieshalf, interestin..."
5,5,88kkrRv1UgI,17.25.11,What Happens to You if You Upload Your Mind to...,RealLifeLore,27,2017-11-17T16:37:31.000Z,"real life lore|""real life lore maps""|""real lif...",377704,14359,830,2056,https://i.ytimg.com/vi/88kkrRv1UgI/default.jpg,False,False,False,Check out Assassin's Creed Origins: http://bit...,Education,"[happens, upload, mind, computer, reallifelore...","[happens, upload, mind, computer, reallifelore..."
6,6,B4gXsobd_ao,17.14.11,What Are Diminutives – and Why We Like Them,The School of Life,27,2017-11-09T14:00:01.000Z,"the school of life|""school""|""life""|""education""...",79306,3809,127,670,https://i.ytimg.com/vi/B4gXsobd_ao/default.jpg,False,False,False,A diminutive is something you stick on the end...,Education,"[diminutives, like, themthe, school, lifea, di...","[diminutives, like, themthe, school, lifea, di..."
7,7,B5HORANmzHw,17.15.11,Founding An Inbreeding-Free Space Colony,SciShow,27,2017-11-12T22:00:01.000Z,"SciShow|""science""|""Hank""|""Green""|""education""|""...",286577,9606,248,1196,https://i.ytimg.com/vi/B5HORANmzHw/default.jpg,False,False,False,Thanks to 23AndMe for supporting SciShow. Thes...,Education,"[founding, inbreeding, free, space, colonyscis...","[founding, inbreeding, free, space, colonyscis..."
8,8,CxC161GvMPc,17.28.11,What is the tragedy of the commons? - Nicholas...,TED-Ed,27,2017-11-21T16:03:42.000Z,"TED|""TED-Ed""|""TED Education""|""TED Ed""|""Nichola...",468081,19382,390,1309,https://i.ytimg.com/vi/CxC161GvMPc/default.jpg,False,False,False,Check out our Patreon page: https://www.patreo...,Education,"[tragedy, commons, nicholas, amendolareted, ed...","[tragedy, commons, nicholas, amendolareted, ed..."
9,9,EsBjHXCG4Us,17.25.11,How Likely Are You to Die During a Black Frida...,The Infographics Show,27,2017-11-17T17:00:03.000Z,"black friday|""black friday sales""|""How Likely ...",527531,10200,540,2197,https://i.ytimg.com/vi/EsBjHXCG4Us/default.jpg,False,False,False,🐻 Check out our new channel: Fuzzy & Nutz 🐿️ ►...,Education,"[likely, die, black, friday, sale, infographic...","[likely, die, black, friday, sale, infographic..."
