Youtube comments to word2vec
===

This is a basic implementation of word2vec using youtube comments as the dataset.

In [49]:
import numpy as np
import math
import os
import random
import zipfile
import tensorflow as tf
import json
import collections

**Read data**

In [50]:
#Note: Comments can also be seen as documents and vice versa

def read_comments_from_json(filename):
    """Read youtube comments from json file and return a list of each comment as a list of words."""  
    comments_vocabulary = []
    filename += '.json' # Add extension to filename
    file_path = os.path.join('.', filename)
    with open(file_path) as file:
        data = json.load(file) #deserialize to python object
        for comment in data:
            # Each comment will represent a document which can be used to implement Doc2Vec
            comments_vocabulary.append(comment["commentText"].split())
    return comments_vocabulary


filename = 'comments'
comments_vocabulary = read_comments_from_json(filename)
data_size = len(comments_vocabulary)
print('Data size', data_size)
print('Comments preview:\n', comments_vocabulary[:5])

Data size 7364
Comments preview:
 [['#FixRussianYoutube', 'Philip', 'we', 'urgently', 'need', 'your', 'help!', 'Please', 'consider', 'a', 'situation', 'abour', 'Russian', 'YouTube,', 'which', 'helps', 'russian', 'government', 'to', 'delete', 'videos', 'of', 'opposition.', 'And', 'not', 'only', 'delete', 'videos,', 'but', 'also', 'prevents', 'videos', 'to', 'get', 'in', '"Trending', 'videos"', 'by', 'freezing', 'views', '(', "i'm", 'not', 'talking', 'about', 'situations', 'when', 'views', 'stopsfor', 'several', 'minutes,', "i'm", 'talking', 'about', 'when', 'views', 'hold', 'on', 'a', 'same', 'number', 'for', 'whole', 'day!).', 'Spread', 'this', 'information,', 'we', 'need', 'your', 'help!'], ['I', 'am', 'so', 'upset', 'about', 'Chester.', "I've", 'been', 'listening', 'to', 'Linkin', 'Park', 'since', 'the', 'beginning,', 'like', 'a', 'lot', 'of', 'fans.', 'It', 'saddens', 'me', 'I', 'will', 'never', 'hear', 'a', 'new', 'Linkin', 'Park', 'song', 'again.', 'And', 'to', 'the', 'people', 'w

**Build the dictionary**

In [51]:
vocabulary_size = 10 # This number will depend on the max size of a youtube comment

def build_dataset(documents_vocabulary, n_words):
    """Process raw inputs into datasets"""
    documents_dataset = {'data': list(), 'count': list(), 'dictionary': list(), 'reversed_dictionary': list()}
    for words in documents_vocabulary:
        count = [['UNK', -1]] #Keeps track of common terms and unknow terms along with their count
        count.extend(collections.Counter(words).most_common(n_words - 1)) # extends "count" by adding the n_words (n most common words) found in each document
        dictionary = dict() #Keeps track of words found in count along with their id. 
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list() #keeps track of the id of the words that appear in the dictinary in the order they appear in the vocabulary
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count # updata 'UNK' to reflect the number of unknown terms found so far
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        documents_dataset['data'].append(data)
        documents_dataset['count'].append(count)
        documents_dataset['dictionary'].append(dictionary)
        documents_dataset['reversed_dictionary'].append(reversed_dictionary)
        
    return documents_dataset
    
documents_dataset = build_dataset(comments_vocabulary, vocabulary_size)

del comments_vocabulary #Not needed so delete to reduce memory    
print('Most common words (+UNK) in first 5 documents', documents_dataset['count'][:5])
#print('Keys', documents_dataset.keys())
#print('Values', documents_dataset.values())

data_index = 0

Most common words (+UNK) in first 5 documents [[['UNK', 52], ('views', 3), ('we', 2), ('videos', 2), ('not', 2), ('talking', 2), ('help!', 2), ('about', 2), ('a', 2), ('need', 2)], [['UNK', 61], ('I', 4), ('they', 3), ('to', 3), ("don't", 3), ('Linkin', 2), ('that', 2), ('with', 2), ('a', 2), ('agree', 2)], [['UNK', 19], ('a', 2), ('it', 2), ('making', 1), ('talk', 1), ('hm.', 1), ('on', 1), ('matter,', 1), ('not', 1), ('to', 1)], [['UNK', 56], ('the', 6), ('no', 3), ('they', 3), ('something', 2), ('I', 2), ('public', 2), ('do', 2), ('one', 2), ('what', 2)], [['UNK', 0], ('away', 1), ('how', 1), ('imagine', 1), ('gotten', 1), ('they', 1), ('have', 1), ('with...', 1), ('many', 1), ('things', 1)]]


**Generate training batch (Skip-gram model)**

In [58]:
def generate_batch(batch_size, num_skips, skip_window):
    documents_batches = {'batch': list(), 'labels': list()}
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2* skip_window + 1 # [skip_window target skip_window]
    buffer = collections.deque(maxlen=span) #Keeps track all words being analized during each iteration

    for document_n in range(data_size):
        
        data = documents_dataset['data'][document_n]
        if(len(data) > 10): #TODO: Take care of documents that don't contain enough data. Some documents could be single word
            for _ in range(span):
                buffer.append(data[data_index])
                data_index = (data_index + 1) % len(data)
            for i in range(batch_size // num_skips):
                target = skip_window # target label at the center of the buffer
                targets_to_avoid = [skip_window]
                for j in range(num_skips):
                    while target in targets_to_avoid:
                        target = random.randint(0, span - 1) 
                    targets_to_avoid.append(target)
                    batch[i * num_skips + j] = buffer[skip_window]
                    labels[i* num_skips + j, 0] = buffer[target]

                buffer.append(data[data_index])
                data_index = (data_index + 1) % len(data)

            data_index = (data_index + len(data) - span) % len(data)
            documents_batches['batch'].append(batch)
            documents_batches['labels'].append(labels)
        
    return documents_batches
        
            
documents_batches = generate_batch(8, 2, 1)

IndexError: list index out of range

**Note to self:** Fix problem with "list index out of range"