In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell is executed, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg

from utils.sparse_matrix_builder import build_from_conceptnet_table

In [3]:
def build_ppmi(conceptnet_filename, ndim=300):
    """
    Build a table of term vectors from a ConceptNet edge table.

    The file is turned into a sparse matrix plus a row index by
    `build_from_conceptnet_table`, the matrix is converted to PPMI with
    `counts_to_ppmi`, and a truncated SVD keeping `ndim` singular triplets is
    computed with `scipy.sparse.linalg.svds`.

    Parameters
    ----------
    conceptnet_filename : passed straight through to
        `build_from_conceptnet_table`.
    ndim : number of singular values/vectors requested from `svds`, and
        therefore the number of columns in the result.

    Returns
    -------
    pandas.DataFrame whose rows are labelled by the index returned from
    `build_from_conceptnet_table` and whose columns are the `ndim` components.

    Notes
    -----
    The left and right singular vectors are summed, so the PPMI matrix must
    have matching row and column counts -- presumably the builder returns a
    square term-by-term matrix; confirm against `utils.sparse_matrix_builder`.
    """
    counts, labels = build_from_conceptnet_table(conceptnet_filename)
    left, singular_values, right_t = linalg.svds(counts_to_ppmi(counts), k=ndim)

    # Each component is weighted by the square root of its singular value.
    scale = singular_values ** 0.5
    embedding = (left + right_t.T) * scale

    return pd.DataFrame(embedding, index=labels)


def counts_to_ppmi(counts_csr, smoothing=0.75):
    """
    Converts a sparse matrix of co-occurrences into a sparse matrix of positive
    pointwise mutual information. Context distributional smoothing is applied
    to the resulting matrix.

    Parameters
    ----------
    counts_csr : scipy sparse matrix
        Co-occurrence weights; each row is a term and each column a context.
        The input is not modified.
    smoothing : float, default 0.75
        Exponent applied to the column totals before they are normalised into
        context frequencies.

    Returns
    -------
    scipy sparse matrix
        Same shape as `counts_csr`; only entries whose PMI is strictly
        positive are stored.

    Notes
    -----
    Rows or columns whose total is zero contribute no entries: their
    reciprocal is taken to be 0 rather than infinity, so no divide-by-zero
    warning is emitted and explicitly stored zeros cannot turn into NaN.
    """
    # Work in floating point from the start. With integer counts and an
    # integer `smoothing`, the column totals would otherwise stay integral and
    # could not hold normalised frequencies.
    # word_counts adds up the total amount of association for each term.
    word_counts = np.asarray(counts_csr.sum(axis=1), dtype=float).flatten()
    context_counts = np.asarray(counts_csr.sum(axis=0), dtype=float).flatten()

    # smooth_context_freqs represents the relative frequency of occurrence
    # of each term as a context (a column of the table).
    smooth_context_freqs = context_counts ** smoothing
    total = smooth_context_freqs.sum()
    if total > 0:
        smooth_context_freqs = smooth_context_freqs / total

    # Reciprocals, leaving 0 wherever the denominator is 0.
    inv_word_counts = np.zeros_like(word_counts)
    np.divide(1.0, word_counts, out=inv_word_counts, where=word_counts != 0)
    inv_context_freqs = np.zeros_like(smooth_context_freqs)
    np.divide(
        1.0,
        smooth_context_freqs,
        out=inv_context_freqs,
        where=smooth_context_freqs != 0,
    )

    # Divide each row of counts_csr by the word counts. We accomplish this by
    # multiplying on the left by the sparse diagonal matrix of 1 / word_counts.
    ppmi = sparse.diags(inv_word_counts).dot(counts_csr)

    # Then, similarly divide the columns by smooth_context_freqs, by the same
    # method except that we multiply on the right.
    ppmi = ppmi.dot(sparse.diags(inv_context_freqs))

    # Take the log of the resulting entries to give pointwise mutual
    # information. log(r) > 0 exactly when r > 1, so the log is evaluated only
    # there; every other stored entry (including zero, negative or NaN ratios)
    # becomes 0 and is dropped, giving positive pointwise mutual information
    # (PPMI).
    ratios = ppmi.data
    pmi = np.zeros(ratios.shape, dtype=float)
    np.log(ratios, out=pmi, where=ratios > 1)
    ppmi.data = pmi
    ppmi.eliminate_zeros()
    return ppmi

In [4]:
# Load the edge extract with pandas, print its (rows, columns) shape and
# preview the first three rows.
df = pd.read_csv("data/conceptnet_api/csv/edge_extract.csv")
print(df.shape)
df.head(3)

(4282, 8)


Unnamed: 0,end_id,end_label,start_id,start_label,rel_id,surface_text,weight,dataset
0,/c/en/chair_meeting,chair a meeting,/c/en/chairperson,A chairperson,/r/CapableOf,[[A chairperson]] can [[chair a meeting]],4.898979,/d/conceptnet/4/en
1,/c/en/chair,chair,/c/en/chairperson/n,chairperson,/r/Synonym,,2.0,/d/wiktionary/en
2,/c/en/president/n/wn/person,president,/c/en/chairperson/n/wn/person,chairperson,/r/Synonym,[[chairperson]] is a synonym of [[president]],2.0,/d/wordnet/3.1


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the `weight`
# column of the edge table.
df['weight'].describe()

count    4282.000000
mean        1.529992
std         0.916131
min         0.779000
25%         1.000000
50%         1.000000
75%         2.000000
max        13.576303
Name: weight, dtype: float64

In [8]:
# Run the first two stages of build_ppmi by hand so the intermediate results
# stay available in the notebook namespace: the sparse matrix with its index,
# then the PPMI transform of that matrix.
sparse_csr, index = build_from_conceptnet_table("data/conceptnet_api/csv/edge_extract.csv")
ppmi = counts_to_ppmi(sparse_csr)

In [10]:
# Commented-out alternative: a different input file with only 20 dimensions
# (presumably a reduced test set -- confirm before relying on it).
# build_ppmi(conceptnet_filename="data/conceptnet_api/csv/test_reduced.csv", ndim=20)
# Full run on the edge extract; the returned DataFrame is shown as the cell
# output.
# NOTE(review): every value visible in the recorded output is on the order of
# 1e-14 or smaller, i.e. numerically zero -- verify that the embedding is
# meaningful for this input (e.g. whether ndim=300 suits the matrix).
build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv", ndim=300)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/chairperson,1.898893e-15,1.847598e-15,1.941179e-14,-1.135664e-14,-1.516736e-15,-6.334573e-15,9.082222e-16,2.809238e-15,-4.787089e-16,7.382577e-16,...,-1.824093e-16,5.117784e-16,-2.117627e-16,2.900476e-16,-3.832136e-16,1.426295e-15,-6.030993e-16,-7.541937e-17,4.908368e-16,-5.149063e-16
/c/en/chair_meeting,1.236525e-15,8.289041e-16,1.130369e-14,-6.845358e-15,-1.564629e-15,-3.708478e-15,8.603825e-16,1.856008e-15,-3.353770e-16,6.156463e-16,...,-4.729698e-16,5.976225e-16,-3.561807e-16,1.883192e-17,-5.318144e-16,8.240312e-16,-3.627639e-16,-5.975368e-17,2.103057e-16,-5.892223e-16
/c/en/chairperson/n,1.900151e-15,2.787529e-15,2.388518e-14,-5.101215e-15,4.842416e-16,-7.981376e-15,6.514042e-16,2.969566e-15,-3.024066e-16,9.129653e-16,...,2.736542e-16,-1.159180e-16,9.263479e-17,1.884970e-16,-2.799547e-16,1.506079e-15,9.542885e-17,3.619190e-17,7.330626e-16,-4.574667e-16
/c/en/chair,1.229549e-15,2.410908e-15,1.837842e-14,-7.755649e-15,7.914800e-16,-5.640898e-15,3.822744e-16,2.078257e-15,1.573719e-16,5.505242e-16,...,1.102965e-17,3.469284e-16,-1.163957e-17,-6.524182e-17,-5.022281e-16,8.654049e-16,-2.826616e-16,-2.149240e-16,7.912656e-16,-6.965728e-16
/c/en/chairperson/n/wn/person,1.168009e-16,1.986182e-14,-1.221816e-15,1.595800e-14,8.626206e-15,7.339835e-15,-2.436191e-14,-5.111562e-15,2.402819e-14,-1.946868e-14,...,-6.522571e-18,-3.468446e-17,5.261108e-18,1.719105e-18,-9.785191e-18,2.597529e-17,-7.511298e-17,7.437480e-16,9.389813e-17,3.151101e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/c/en/reporter/n/wn/person,-2.381126e-16,4.163891e-16,-3.068542e-17,1.057857e-15,3.586921e-16,-2.844505e-16,-6.458318e-16,-6.943362e-16,7.387461e-17,-6.720120e-16,...,-4.044554e-16,-5.540129e-16,4.014140e-16,-2.427001e-16,-1.075449e-16,3.559991e-16,1.184274e-17,8.034955e-17,-4.488997e-16,9.214597e-18
/c/en/communicator/n/wn/person,-2.828701e-18,1.394304e-18,-9.004767e-17,1.090938e-15,1.038112e-16,-2.959257e-16,-2.173005e-16,-1.964636e-16,7.950534e-17,-4.127541e-16,...,-2.675547e-16,-1.392627e-16,1.127154e-16,-1.946937e-16,-1.730301e-16,9.013356e-17,-2.210196e-17,3.048287e-17,-2.045777e-16,1.214427e-17
/c/en/newswoman/n/wn/person,-1.860435e-18,-2.184379e-17,8.241463e-17,9.842196e-16,1.914506e-16,-4.088398e-16,-3.479016e-16,-3.005255e-16,2.156529e-16,-4.734510e-16,...,-2.215311e-16,-2.860106e-17,1.319915e-16,-1.233897e-16,-1.780369e-16,1.477509e-16,2.136888e-17,2.709238e-17,-2.275478e-16,1.178852e-17
/c/en/newsman/n/wn/person,-4.424029e-18,1.532760e-16,-2.818856e-16,9.907592e-16,-3.783946e-17,-1.530289e-16,-1.780107e-16,-2.270747e-16,3.844209e-17,-4.039353e-16,...,-3.251701e-16,-3.228673e-16,8.667016e-17,-2.861502e-16,-1.295805e-16,5.091333e-17,-4.007971e-17,2.494749e-17,-1.545688e-16,2.151646e-17
