In [4]:
docs = ['aa aa bb cc.',
        'cc dd ee ff.',
        'ff ff gg aa']

corpus = '\n\n'.join(docs)

In [5]:
#
# Build Co-occurence Matrix, A
#

import itertools

# external
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize

def process_text(text):
	""" return list of lowercase alphabetic words from text """
	tokenizer = RegexpTokenizer(r'\w+')
	return tokenizer.tokenize(text.lower())

def ngram_tupples(corpus, n):
	""" Create ngram tupples by sentence. Where n is the distance between words in a sentence. """
	sentences = sent_tokenize(corpus)

	pairs = []
	for s in sentences:
		unique_tokens = process_text(s)
		pairs.extend(ngrams(unique_tokens,n))

	return pairs

def get_unique_words(corpus):
	return list(set(process_text(corpus)))

def w2id_id2w_maps(unique_words):
	""" return both dictonaries for mapping between words and ids """
	id2w = {i:w for i,w in enumerate(unique_words)}
	w2id = {w:i for i,w in id2w.items()}
	return w2id, id2w

def ngram_inc_amt(n):
	""" return float for increment weight of pair occurence n distance appart. \nWeight increment ~ 1/n """
	return 1/float(n**2)

def words2ids(words, w2id):
	""" return list of ids inplace of list of words using w2id dictionary """
	return [w2id[w] for w in words]

def cooccurence_pair_of_distance(sentence_list, d):
    """ return list of unique coocurence pairs of distace d """

    all_ngrams = ngrams(sentence_list,d)

    all_pairs = []
    for t in all_ngrams:
        if len(t) > 1:
            all_pairs.extend(list(itertools.combinations(t, 2)))

    return list(set(all_pairs))

def break_corpus(corpus):
    """ Build Cooccurence Matrix. Return A, n, w2id, id2w """

    unique_words = get_unique_words(corpus)
    n = len(unique_words)
    w2id, id2w = w2id_id2w_maps(unique_words)

    #create empty cooccurence matrix
    #A = np.zeros([n,n],np.float32)
    A = np.ones([n,n],np.float32)

    #compute cooccurence matrix
    sentences = sent_tokenize(corpus)
    for s in sentences:
        s = process_text(s)
        max_distance = len(s) + 1
        s = [w2id[w] for w in s]	#convert words to ids

        for d in range(2,max_distance):
            pairs = cooccurence_pair_of_distance(s, d)

            #update cooccurence matrix for each pair
            for p in pairs:
                A[p[0],p[1]] += ngram_inc_amt(d)
                A[p[1],p[0]] += ngram_inc_amt(d)

    return A, n, w2id, id2w

A, n, w2id, id2w = break_corpus(corpus)

In [6]:
def rank_most_related_words(A, w2id, id2w, w1):
    col = A[:,w2id[w1]].tolist()

    list_of_index_value_tuples = list(zip(list(range(len(col))),col))
    #(colindex,value)
    sorted_indexs_value_tuples = sorted(list_of_index_value_tuples, key=lambda x: x[1])
    sorted_indexs_value_tuples.reverse()
    indexs, values = zip(*sorted_indexs_value_tuples)
    ranked_words = [id2w[i] for i in indexs]
    return ranked_words

rank_most_related_words(A,w2id, id2w, 'aa')

['aa', 'bb', 'gg', 'ff', 'cc', 'dd', 'ee']

In [None]:
#internal
import random

#external
import numpy as np

"""
Matrix Decomposition
	Method 1) Singular Value Decomposition of Forced Square Symmetric Positive Definite Matrix
	Method 2) Eigen Decomposition of Forced Square Symmetric Positive Matrix
	Method 3) Stocastic Gradient Decent Decomposition
"""

# check symmetry of matrix up to some tolerance
def check_symmetric(a, tol=1e-8):
    return np.allclose(a, a.T, atol=tol)

#force square symetric matrix to symetric positive definite matrix
def to_positive_definite(S):
    """ Take absolute value of S and update diagnol entries to make a diagonally dominant matrix with diagonal entries greater than 0. """
    # take absolute value of S
    S = np.absolute(S)

    # Sum rows in S
    new_diagonals = S.sum(axis=1)

    #replace diagnols in S
    np.fill_diagonal(S, new_diagonals)

    return S

"""
####################################################################################
# Singular Value Decomposition of Forced Square Symmetric Positive Definite Matrix #
####################################################################################
1.) Force Cooccurence Matrix A to Positive Definite Matrix
    " A diagonally dominant(by rows) symetric matrix with diagonal elements all greater than zero is positive definite."
    Take symmetric matrix and make diagonally dominant with diagnonal entries greater than 0
2.) Square Symmetric Positive Definite Matrix Decomposition
    " If A is positive definite, then A = QLQt = UDV (where U=V=Q and L=D) can be written as A = WWt where W = Qsqrt(L) "
    -SVD: A=UDV, W = Vsqrt(diagnol(D))
    Find V and D from singular value decomposition of A
    return W = Vsqrt(D)
overview of code:
    #
    # force symmetric matrix to positive definite matrix
    #
    # take absolute value of A
    A = np.absolute(A)
    # Sum rows in a
    new_diagonals = A.sum(axis=1)
    #replace diagnols in A
    np.fill_diagonal(A, new_diagonals)
    #
    # decompose positive definite matrix
    #
    # singular value decomposition
    U, D, V = np.linalg.svd(A, full_matrices=False)
    #
    # compute W from V and D of singular value decomposition
    #
    # Create matrix W = Vtsqrt(diagnol(D)) #why Vt?
    W = np.dot(np.transpose(V), np.sqrt(np.diag(D)))
    #A = WWt
"""

def svd_spd_decomposition(P):
    """ return M such that P = MMt, where matrix parameter P is SPD """
    # Assert Matrix P is symetric
    assert check_symmetric(P)

    # singular value decomposition
    U, D, V = np.linalg.svd(P, full_matrices=False)

    # Create matrix W = Vtsqrt(diagnol(D)) #why Vt?
    M = np.dot(np.transpose(V), np.sqrt(np.diag(D)))

    return M

def spd_decomposition(S):
	""" Force Cooccurence Matrix A to Positive Definite Matrix and decompose into W such that A = WWt. """
	P = to_positive_definite(S)
	M = svd_spd_decomposition(P)
	return M

#
# Perform Symmetric Positive Definite Decomposition
#

W = spd_decomposition(A)

print("\n\n\nSymmetric Positive Definite Decomposition:\n")
print("\n\nA:")
print (A)
print("\n\nA2 (modified into SPD):\n")
print(to_positive_definite(A))
print("\nWWt:\n")
print(np.dot(W, np.transpose(W)))