In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg

from utils.sparse_matrix_builder import build_from_conceptnet_table

In [2]:
def build_ppmi(conceptnet_filename, ndim=300):
    sparse_csr, index = build_from_conceptnet_table(conceptnet_filename)
    ppmi = counts_to_ppmi(sparse_csr)
    u, s, vT = linalg.svds(ppmi, ndim)
    v = vT.T
    values = (u + v) * (s ** 0.5)

    return pd.DataFrame(values, index=index)


def counts_to_ppmi(counts_csr, smoothing=0.75):
    """
    Converts a sparse matrix of co-occurrences into a sparse matrix of positive
    pointwise mutual information. Context distributional smoothing is applied
    to the resulting matrix.
    """
    # word_counts adds up the total amount of association for each term.
    word_counts = np.asarray(counts_csr.sum(axis=1)).flatten()

    # smooth_context_freqs represents the relative frequency of occurrence
    # of each term as a context (a column of the table).
    smooth_context_freqs = np.asarray(counts_csr.sum(axis=0)).flatten() ** smoothing
    smooth_context_freqs /= smooth_context_freqs.sum()

    # Divide each row of counts_csr by the word counts. We accomplish this by
    # multiplying on the left by the sparse diagonal matrix of 1 / word_counts.
    ppmi = sparse.diags(1 / word_counts).dot(counts_csr)

    # Then, similarly divide the columns by smooth_context_freqs, by the same
    # method except that we multiply on the right.
    ppmi = ppmi.dot(sparse.diags(1 / smooth_context_freqs))

    # Take the log of the resulting entries to give pointwise mutual
    # information. Discard those whose PMI is less than 0, to give positive
    # pointwise mutual information (PPMI).
    ppmi.data = np.maximum(np.log(ppmi.data), 0)
    ppmi.eliminate_zeros()
    return ppmi

In [3]:
df = pd.read_csv("data/conceptnet_api/csv/edge_extract.csv")
print(df.shape)
df.head(3)

(4282, 8)


Unnamed: 0,end_id,end_label,start_id,start_label,rel_id,surface_text,weight,dataset
0,/c/en/chair_meeting,chair a meeting,/c/en/chairperson,A chairperson,/r/CapableOf,[[A chairperson]] can [[chair a meeting]],4.898979,/d/conceptnet/4/en
1,/c/en/chair,chair,/c/en/chairperson/n,chairperson,/r/Synonym,,2.0,/d/wiktionary/en
2,/c/en/president/n/wn/person,president,/c/en/chairperson/n/wn/person,chairperson,/r/Synonym,[[chairperson]] is a synonym of [[president]],2.0,/d/wordnet/3.1


In [4]:
df['weight'].describe()

count    4282.000000
mean        1.529992
std         0.916131
min         0.779000
25%         1.000000
50%         1.000000
75%         2.000000
max        13.576303
Name: weight, dtype: float64

In [5]:
sparse_csr, index = build_from_conceptnet_table("data/conceptnet_api/csv/edge_extract.csv")
ppmi = counts_to_ppmi(sparse_csr)

  ppmi = sparse.diags(1 / word_counts).dot(counts_csr)
  ppmi = ppmi.dot(sparse.diags(1 / smooth_context_freqs))


In [7]:
# build_ppmi(conceptnet_filename="data/conceptnet_api/csv/test_reduced.csv", ndim=20)
build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv", ndim=20)

 ** On entry to DLASCL parameter number  4 had an illegal value
 ** On entry to DLASCL parameter number  4 had an illegal value


  ppmi = sparse.diags(1 / word_counts).dot(counts_csr)
  ppmi = ppmi.dot(sparse.diags(1 / smooth_context_freqs))


ArpackError: ARPACK error -9999: Could not build an Arnoldi factorization. IPARAM(5) returns the size of the current Arnoldi factorization. The user is advised to check that enough workspace and array storage has been allocated.