In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to imported modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg

from utils.sparse_matrix_builder import build_from_conceptnet_table
from utils.retrofit import sharded_retrofit, join_shards
from utils.formats import load_hdf, save_hdf

### Vectorization of ConceptNet Data (Adjacency)

In [3]:
def build_ppmi(conceptnet_filename, ndim=300):
    """
    Build dense term vectors from a ConceptNet edge table.

    The table is loaded by `build_from_conceptnet_table`, which yields a
    sparse matrix and the labels for its rows. That matrix is reweighted with
    `counts_to_ppmi` and factorized with a truncated SVD
    (`scipy.sparse.linalg.svds`), requesting `ndim` singular triplets.

    Each term's vector is the sum of its left and right singular vectors,
    with every component scaled by the square root of the matching singular
    value.

    Parameters
    ----------
    conceptnet_filename : str
        Path handed unchanged to `build_from_conceptnet_table`.
    ndim : int, default 300
        Number of singular triplets requested from `svds`.

    Returns
    -------
    pandas.DataFrame
        The combined vectors, indexed by the labels returned alongside the
        sparse matrix.
    """
    adjacency, labels = build_from_conceptnet_table(conceptnet_filename)

    left_vecs, singular_values, right_vecs_t = linalg.svds(
        counts_to_ppmi(adjacency), k=ndim
    )

    # The left and right factors are added elementwise, so this assumes the
    # matrix is square (terms x terms) -- TODO confirm against the builder.
    scale = singular_values ** 0.5
    embedding = (left_vecs + right_vecs_t.T) * scale

    return pd.DataFrame(embedding, index=labels)


def _reciprocal_or_zero(values):
    """Return 1 / values elementwise, using 0 wherever values is exactly 0."""
    values = np.asarray(values)
    with np.errstate(divide="ignore"):
        inverse = 1 / values
    inverse[values == 0] = 0
    return inverse


def counts_to_ppmi(counts_csr, smoothing=0.75):
    """
    Converts a sparse matrix of co-occurrences into a sparse matrix of positive
    pointwise mutual information. Context distributional smoothing is applied
    to the resulting matrix.

    Every stored entry ``counts[w, c]`` becomes
    ``log(counts[w, c] / (row_total[w] * context_freq[c]))``, where
    ``context_freq`` is the column totals raised to `smoothing` and then
    normalized to sum to 1. Entries whose value is not positive are dropped
    from the sparse result.

    Rows or columns whose total is zero (for example an isolated term)
    contribute no entries: their reciprocal is taken as 0 instead of
    infinity, so they neither raise divide-by-zero warnings nor turn stored
    explicit zeros into NaN.

    Parameters
    ----------
    counts_csr : scipy.sparse matrix
        Co-occurrence weights; rows are terms, columns are contexts.
    smoothing : float, default 0.75
        Exponent applied to the column totals before normalizing them.

    Returns
    -------
    scipy.sparse matrix
        Matrix of the same shape holding only the positive PMI values.
    """
    # word_counts adds up the total amount of association for each term.
    word_counts = np.asarray(counts_csr.sum(axis=1)).flatten()

    # smooth_context_freqs represents the relative frequency of occurrence
    # of each term as a context (a column of the table).
    smooth_context_freqs = np.asarray(counts_csr.sum(axis=0)).flatten() ** smoothing
    total_context_mass = smooth_context_freqs.sum()
    if total_context_mass > 0:
        # Skip the normalization for an all-zero table, where it would be 0 / 0.
        smooth_context_freqs = smooth_context_freqs / total_context_mass

    # Divide each row of counts_csr by the word counts. We accomplish this by
    # multiplying on the left by the sparse diagonal matrix of 1 / word_counts.
    ppmi = sparse.diags(_reciprocal_or_zero(word_counts)).dot(counts_csr)

    # Then, similarly divide the columns by smooth_context_freqs, by the same
    # method except that we multiply on the right.
    ppmi = ppmi.dot(sparse.diags(_reciprocal_or_zero(smooth_context_freqs)))

    # Take the log of the resulting entries to give pointwise mutual
    # information. Discard those whose PMI is less than 0, to give positive
    # pointwise mutual information (PPMI). Stored zeros give log(0) = -inf,
    # which is clipped to 0 and removed, so that warning is silenced.
    with np.errstate(divide="ignore"):
        ppmi.data = np.maximum(np.log(ppmi.data), 0)
    ppmi.eliminate_zeros()
    return ppmi

In [4]:
# Load the edge extract and take a quick look at it: the table's shape, summary
# statistics of the 'weight' column, and the first three rows.
df = pd.read_csv("data/conceptnet_api/csv/edge_extract.csv")
print(df.shape, df["weight"].describe(), sep="\n")
df.iloc[:3]

(4282, 8)
count    4282.000000
mean        1.529992
std         0.916131
min         0.779000
25%         1.000000
50%         1.000000
75%         2.000000
max        13.576303
Name: weight, dtype: float64


Unnamed: 0,end_id,end_label,start_id,start_label,rel_id,surface_text,weight,dataset
0,/c/en/chair_meeting,chair a meeting,/c/en/chairperson,A chairperson,/r/CapableOf,[[A chairperson]] can [[chair a meeting]],4.898979,/d/conceptnet/4/en
1,/c/en/chair,chair,/c/en/chairperson/n,chairperson,/r/Synonym,,2.0,/d/wiktionary/en
2,/c/en/president/n/wn/person,president,/c/en/chairperson/n/wn/person,chairperson,/r/Synonym,[[chairperson]] is a synonym of [[president]],2.0,/d/wordnet/3.1


In [5]:
# Build PPMI/SVD vectors (ndim=300) from the edge extract.
# Alternative call kept for reference; presumably a smaller test input --
# TODO confirm, or delete if no longer needed:
# build_ppmi(conceptnet_filename="data/conceptnet_api/csv/test_reduced.csv", ndim=20)
ppmi_df = build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv", ndim=300)
# NOTE(review): every value visible in the saved output of this cell is on the
# order of 1e-14 to 1e-17, i.e. numerically zero. Worth checking whether
# ndim=300 exceeds the effective rank of this small graph -- TODO confirm.
ppmi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/chair_meeting,-8.602175e-16,-8.397180e-16,1.458368e-14,6.499082e-15,7.290473e-16,-1.337740e-15,-3.769957e-16,-5.665499e-16,3.407403e-17,1.512396e-16,...,-1.631004e-16,-4.350689e-16,-6.684061e-16,1.633641e-16,-2.365628e-16,9.577024e-17,9.945500e-16,-2.473528e-16,1.552727e-16,-8.841227e-17
/c/en/chairperson,-1.041996e-15,-5.119997e-16,2.165907e-14,1.495240e-14,7.398550e-16,-2.164249e-15,-2.798377e-16,-1.792070e-15,-4.959723e-18,-9.627345e-17,...,5.888121e-16,6.579372e-16,-3.343905e-16,1.390234e-16,1.684964e-16,-4.298360e-16,9.510839e-16,-3.303446e-16,4.365613e-16,-1.417402e-16
/c/en/chair,-7.488463e-16,2.506771e-15,1.517326e-14,1.209480e-14,-4.843594e-16,-8.934079e-16,2.320789e-16,-2.284454e-15,-8.141656e-17,-4.296492e-16,...,3.613623e-16,-1.240958e-16,-4.154442e-16,2.757393e-17,2.906793e-17,-3.863980e-16,8.430756e-16,1.221546e-16,4.050377e-16,-3.257905e-16
/c/en/chairperson/n,-1.039636e-15,1.823001e-15,2.192024e-14,2.365748e-14,-4.880544e-16,-1.711169e-15,9.735907e-17,-2.611363e-15,1.568740e-18,-3.445005e-16,...,1.174256e-16,-4.665404e-17,-1.147438e-15,1.093096e-16,-5.770174e-16,-2.766513e-16,1.561625e-15,-9.459450e-17,6.171322e-16,7.467122e-16
/c/en/president/n/wn/person,-9.303028e-17,1.809264e-14,2.966122e-14,-6.240331e-14,-4.155895e-15,-3.348137e-14,-4.773879e-15,-2.428342e-14,3.244283e-15,2.433713e-14,...,7.223006e-17,-1.536466e-16,9.063268e-17,-9.787532e-17,4.221136e-17,-6.027368e-16,2.524614e-16,-2.353043e-16,-1.110227e-17,-3.722749e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/c/en/communicator/n/wn/person,1.301688e-16,-4.390491e-16,9.810768e-16,-1.051518e-16,-6.161205e-16,-1.478032e-16,-2.269507e-16,-5.030732e-16,4.075850e-17,-6.845867e-16,...,2.408639e-17,1.623674e-16,1.432303e-16,1.075206e-16,6.735983e-17,-7.065734e-17,-9.938446e-17,-2.485088e-17,2.400691e-17,-1.281066e-17
/c/en/reporter/n/wn/person,1.739354e-16,-1.074894e-15,1.835194e-15,-9.126040e-16,-7.303393e-16,-9.304480e-16,-6.799319e-16,-5.510952e-16,8.548669e-17,-9.313466e-16,...,-8.608814e-17,-3.039557e-16,-1.101658e-16,-6.681868e-18,-1.706201e-16,1.645406e-16,5.787089e-17,6.694450e-17,6.387185e-18,-4.660610e-17
/c/en/newswoman/n/wn/person,1.339242e-16,-2.742113e-16,7.410420e-16,-1.732684e-16,-5.998933e-16,-2.109137e-16,-2.934143e-16,-5.125348e-16,2.511385e-17,-6.769097e-16,...,5.639811e-19,1.725314e-16,1.511426e-16,1.965852e-17,6.812408e-17,-5.077183e-17,-5.709828e-17,1.808548e-18,2.094576e-17,-2.282692e-17
/c/en/newsman/n/wn/person,1.471623e-16,-2.848773e-16,4.289228e-16,-3.242253e-16,-6.049971e-16,-2.517672e-16,-3.237212e-16,-4.710677e-16,3.143549e-17,-7.193570e-16,...,-1.577800e-17,1.973832e-16,1.645325e-16,-3.660617e-17,5.625871e-17,-3.424992e-17,-3.233760e-17,2.599043e-17,2.097864e-17,-3.404635e-17


In [6]:
# Persist the vectors to HDF; the retrofitting call later in this notebook
# reads them back from this same path.
save_hdf(ppmi_df, filename='data/conceptnet_api/hdf/test.hdf')

### Retrofitting

In [7]:
# Retrofit the saved dense vectors against the same edge table used to build
# them. Results are written under the given output_filename (presumably one
# file per shard -- TODO confirm against utils.retrofit).
sharded_retrofit(
    dense_hdf_filename="data/conceptnet_api/hdf/test.hdf",
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    output_filename="data/conceptnet_api/retrofit/test_retrofitted"
)

# Join the shards back together under the same output_filename.
# NOTE(review): nshards=6 is hard-coded here, while sharded_retrofit above is
# called without an explicit shard count. This only lines up if its default is
# also 6 -- TODO confirm, and consider passing the same value to both calls.
join_shards(output_filename="data/conceptnet_api/retrofit/test_retrofitted", nshards=6, sort=False)

In [9]:
# Read back the joined retrofit output and preview its first rows.
pd.read_hdf("data/conceptnet_api/retrofit/test_retrofitted").head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/chair_meeting,-2.129197e-16,-2.004567e-07,3.139843e-07,-6.492713e-08,1.334435e-16,-3.828865e-16,-6.465064000000001e-17,-2.960055e-16,4.2038440000000005e-17,-1.789314e-17,...,0.029592,-0.028524,0.038507,-0.051363,0.052692,-0.054144,0.052064,-0.045782,-0.061692,-0.096738
/c/en/chairperson,-2.069472e-16,-1.889602e-07,2.959767e-07,-6.120344e-08,1.127873e-16,-3.864099e-16,-5.23309e-17,-3.366037e-16,3.7787550000000004e-17,-2.999062e-17,...,0.027895,-0.026888,0.036298,-0.048417,0.04967,-0.051038,0.049078,-0.043156,-0.058154,-0.09119
/c/en/chair,-2.000913e-16,-1.983455e-07,3.106773e-07,-6.424331e-08,-8.366849000000001e-17,-3.044844e-16,2.7266750000000003e-17,-5.378111e-16,2.993259e-17,-9.740842e-17,...,0.029281,-0.028224,0.038101,-0.050822,0.052137,-0.053573,0.051516,-0.0453,-0.061042,-0.09572
/c/en/chairperson/n,-2.004085e-16,-1.867657e-07,2.925394e-07,-6.049266e-08,-5.062227e-17,-3.278182e-16,9.696096e-18,-4.912406e-16,3.229945e-17,-7.964047000000001e-17,...,0.027571,-0.026576,0.035877,-0.047855,0.049093,-0.050446,0.048508,-0.042655,-0.057479,-0.090131
/c/en/president/n/wn/person,-8.483401e-18,-1.530221e-07,2.396853e-07,-4.956326e-08,-7.436574e-16,-6.092042e-15,-8.714433e-16,-4.394232e-15,6.218402e-16,4.450388e-15,...,0.02259,-0.021774,0.029395,-0.039209,0.040223,-0.041332,0.039744,-0.034948,-0.047094,-0.073847
