In [2]:
import numpy as np
from scipy import sparse
import itertools
from random import shuffle
from math import log
import pickle

In [8]:
# Small sample corpus: one whitespace-tokenized sentence per line.
test_corpus = """human interface computer
survey user computer system response time
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees""".splitlines()

print(test_corpus)

['human interface computer', 'survey user computer system response time', 'eps user interface system', 'system human system eps', 'user response time', 'trees', 'graph trees', 'graph minors trees', 'graph minors survey', 'I like graph and stuff', 'I like trees and stuff', 'Sometimes I build a graph', 'Sometimes I build trees']


In [9]:
def build_vocab(corpus):
    """
    Build a vocabulary with word frequencies for an entire corpus.

    Parameters
    ----------
    corpus : iterable of str
        One sentence per item; words are separated by whitespace.

    Returns
    -------
    dict
        Mapping `w -> (index, frequency)`: each word string maps to a pair of
        word ID and corpus frequency. IDs are assigned in order of first
        appearance, starting at 0.
    """

    # Tokenize with a bare `split()` (any run of whitespace) so that blank
    # lines and repeated spaces never produce an empty-string "word", and so
    # the tokens match the ones `build_cooccur` looks up in this vocabulary.
    frequencies = {}
    for line in corpus:
        for word in line.split():
            frequencies[word] = frequencies.get(word, 0) + 1

    # Dicts keep insertion order, so enumerate() numbers words by first appearance.
    return {
        word: (index, count)
        for index, (word, count) in enumerate(frequencies.items())
    }

# Display the vocabulary built from the sample corpus as a quick sanity check.
build_vocab(test_corpus)

{'human': (0, 2),
 'interface': (1, 2),
 'computer': (2, 2),
 'survey': (3, 2),
 'user': (4, 3),
 'system': (5, 4),
 'response': (6, 2),
 'time': (7, 2),
 'eps': (8, 2),
 'trees': (9, 5),
 'graph': (10, 5),
 'minors': (11, 2),
 'I': (12, 4),
 'like': (13, 2),
 'and': (14, 2),
 'stuff': (15, 2),
 'Sometimes': (16, 2),
 'build': (17, 2),
 'a': (18, 1)}

In [None]:
def build_cooccur(vocab, corpus, window_size=3, min_count=None):
    """
    Build the list of word co-occurrence entries for `corpus`.

    Parameters
    ----------
    vocab : dict
        Mapping `w -> (index, frequency)` as returned by `build_vocab`.
    corpus : iterable of str
        One sentence per item; words are separated by whitespace.
    window_size : int
        Number of neighbouring words taken on EACH side of the center word.
    min_count : int or None
        When given, entries are dropped if either word of the pair has a
        corpus frequency below `min_count`.

    Returns
    -------
    list of (int, int, float)
        One `(row_word_id, column_word_id, value)` tuple per stored cell of
        the co-occurrence matrix that passes the `min_count` filter.

    Raises
    ------
    KeyError
        If the corpus contains a word that is missing from `vocab`.

    Side effects
    ------------
    Saves the id->word and word->id lookups through `save_model` to
    "id2word.pkl" and "word2id.pkl".
    """
    vocab_size = len(vocab)

    # Forward and reverse lookups between integer word IDs and word strings.
    id_word = {entry[0]: word for word, entry in vocab.items()}
    word_id = {word: index for index, word in id_word.items()}

    save_model(id_word, path="id2word.pkl")
    save_model(word_id, path="word2id.pkl")

    # A sparse lil_matrix is optimized for matrices that are mostly zeros.
    cooccurrences = sparse.lil_matrix((vocab_size, vocab_size), dtype=np.float64)

    for line in corpus:
        sentence = line.strip().split()
        # Get the ID of each word from the vocab dictionary.
        word_ids = [vocab[word][0] for word in sentence]

        for position, center_word_id in enumerate(word_ids):
            # Up to `window_size` words on the left of the center word.
            left_context_word_ids = word_ids[max(0, position - window_size):position]

            # Up to `window_size` words on the right. The slice end is
            # position + 1 + window_size so both sides have the same width.
            right_context_word_ids = word_ids[position + 1:position + 1 + window_size]

            # Update the matrix for the current center word, left context
            # first and then right context. The actual weighting rule lives in
            # update_cooccurrence_matrix (defined elsewhere in the notebook).
            cooccurrences = update_cooccurrence_matrix(
                cooccurrences, left_context_word_ids, center_word_id, "left_context")
            cooccurrences = update_cooccurrence_matrix(
                cooccurrences, right_context_word_ids, center_word_id, "right_context")

    def is_frequent_enough(word_index):
        # True when no threshold is set or the word reaches `min_count`.
        return min_count is None or vocab[id_word[word_index]][1] >= min_count

    # Dig into the LiL-matrix internals (`rows` holds the column indices of
    # each row, `data` the matching values) to iterate over stored cells only.
    # Built-in zip replaces itertools.izip, which does not exist in Python 3.
    cooccurrences_tuples = []
    for i, (row, data) in enumerate(zip(cooccurrences.rows, cooccurrences.data)):
        if not is_frequent_enough(i):
            continue

        for data_idx, j in enumerate(row):
            if not is_frequent_enough(j):
                continue

            cooccurrences_tuples.append((i, j, float(data[data_idx])))

    return cooccurrences_tuples