In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# re-imported before each cell executes, so edits to the local `utils`
# package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg

from utils.ppmi import build_ppmi
from utils.retrofit import sharded_retrofit, join_shards
from utils.formats import load_hdf, save_hdf

### Vectorization of ConceptNet Data (Adjacency)

In [3]:
# Load the extracted ConceptNet edge table from CSV.
df = pd.read_csv("data/conceptnet_api/csv/edge_extract.csv")

# Print the (rows, columns) shape followed by summary statistics of the
# edge weights, one after the other.
weight_stats = df["weight"].describe()
print(df.shape, weight_stats, sep="\n")

# The first three edges become the cell's rich output.
df.head(3)

(4282, 8)
count    4282.000000
mean        1.529992
std         0.916131
min         0.779000
25%         1.000000
50%         1.000000
75%         2.000000
max        13.576303
Name: weight, dtype: float64


Unnamed: 0,end_id,end_label,start_id,start_label,rel_id,surface_text,weight,dataset
0,/c/en/chair_meeting,chair a meeting,/c/en/chairperson,A chairperson,/r/CapableOf,[[A chairperson]] can [[chair a meeting]],4.898979,/d/conceptnet/4/en
1,/c/en/chair,chair,/c/en/chairperson/n,chairperson,/r/Synonym,,2.0,/d/wiktionary/en
2,/c/en/president/n/wn/person,president,/c/en/chairperson/n/wn/person,chairperson,/r/Synonym,[[chairperson]] is a synonym of [[president]],2.0,/d/wordnet/3.1


In [None]:
# Build the PPMI-based embedding frame from the extracted ConceptNet edges.
# (An earlier variant of this cell used the reduced file
# "data/conceptnet_api/csv/test_reduced.csv" with ndim=20.)
#
# NOTE(review): the saved output of this cell shows columns 0..299, which
# does not look like a run with ndim=128 -- the output is presumably stale;
# re-run the cell to refresh it.
ppmi_df = build_ppmi(
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    ndim=128,
)

# Display the resulting frame (pandas truncates long/wide frames on display).
ppmi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/chair_meeting,-1.775796e-16,2.285645e-15,-6.268047e-17,-3.314670e-15,3.567690e-15,-1.878298e-16,-1.776357e-15,-3.570432e-15,-6.125593e-16,4.206699e-16,...,7.990277e-16,-1.009080e-15,4.164643e-16,-4.554916e-17,1.558674e-16,1.474087e-16,-1.890701e-16,-3.255250e-17,5.011631e-16,1.790117e-15
/c/en/chairperson,-1.300408e-16,3.525097e-15,5.739152e-17,-2.175060e-15,3.962403e-15,3.537489e-16,-2.950646e-15,-3.267600e-15,-1.591883e-15,5.875406e-16,...,1.316957e-15,-9.842696e-16,1.009518e-15,2.093130e-16,2.416912e-16,-3.665886e-16,4.874291e-16,6.616670e-17,8.256789e-16,2.590560e-15
/c/en/chair,-2.560629e-16,2.207683e-15,-2.339368e-16,3.470860e-15,4.482089e-16,1.091531e-15,-2.982690e-15,5.350034e-16,-2.692842e-15,1.101375e-15,...,5.475160e-16,-7.608041e-16,3.635748e-16,1.717013e-17,3.194367e-16,-5.240716e-16,-4.628479e-17,3.780004e-17,6.115269e-16,1.808869e-15
/c/en/chairperson/n,-6.651574e-16,3.318132e-15,-2.462264e-16,3.223138e-15,1.750586e-15,1.156401e-15,-4.046480e-15,-5.918873e-16,-3.221156e-15,1.549997e-15,...,9.547271e-16,-1.146707e-15,6.663233e-16,3.296377e-16,3.996690e-16,-5.100762e-16,2.767142e-16,3.423819e-17,8.516882e-16,3.197054e-15
/c/en/president/n/wn/person,1.135477e-14,-1.987326e-15,-2.668338e-14,-2.721595e-15,2.907119e-13,2.816170e-13,-1.748158e-14,-1.557233e-15,1.746718e-14,2.424113e-14,...,3.215129e-16,3.084495e-16,-2.480066e-16,-3.221541e-16,-1.021723e-17,1.561463e-16,-1.977980e-16,-4.012878e-16,-1.795954e-15,-2.063071e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/c/en/communicator/n/wn/person,3.524825e-16,2.860483e-16,1.213384e-15,7.170911e-16,6.544154e-16,-5.358673e-16,-2.406991e-16,2.058057e-16,-5.085184e-16,1.572553e-16,...,2.167821e-16,1.783886e-16,9.192305e-17,6.602339e-17,-2.770417e-18,1.003448e-16,4.229299e-17,-8.947093e-17,-1.105695e-16,-4.063143e-17
/c/en/reporter/n/wn/person,7.925957e-16,6.687107e-16,2.026844e-15,9.346286e-16,7.135876e-16,-9.214340e-16,1.350558e-16,3.238367e-16,-1.113942e-15,4.945921e-16,...,2.503612e-16,4.199017e-16,3.102171e-16,1.472813e-16,-4.158742e-17,1.417661e-16,-5.991416e-17,-5.724630e-18,-1.873657e-17,-8.026347e-17
/c/en/newswoman/n/wn/person,-1.359884e-16,1.892566e-16,9.050011e-16,6.338830e-16,1.314426e-16,-6.617436e-16,1.223304e-16,-2.006318e-16,-4.035126e-16,-1.084171e-16,...,9.920146e-17,9.327909e-17,1.083779e-16,2.054054e-17,-9.728926e-18,1.392814e-16,-6.228230e-17,9.794216e-18,2.489755e-17,-2.865989e-17
/c/en/newsman/n/wn/person,-2.373951e-16,1.535810e-16,8.138856e-16,6.398764e-16,1.821011e-18,-6.606025e-16,1.967540e-16,-2.650846e-16,-3.577246e-16,-1.718056e-16,...,7.647771e-17,8.402585e-17,1.088979e-16,1.025740e-18,-8.835057e-19,1.423982e-16,-8.621140e-17,2.325140e-17,5.859425e-17,-1.978601e-17


In [5]:
# Write the PPMI frame to HDF; the retrofitting cell reads this same path
# as its dense input.
ppmi_hdf_path = "data/conceptnet_api/hdf/test.hdf"
save_hdf(ppmi_df, filename=ppmi_hdf_path)

### Retrofitting

In [None]:
# Both calls share one output prefix; naming it once keeps them in sync.
retrofit_prefix = "data/conceptnet_api/retrofit/test_retrofitted"

# Retrofit the saved dense vectors against the ConceptNet edge table.
# Results are written under `retrofit_prefix`.
sharded_retrofit(
    dense_hdf_filename="data/conceptnet_api/hdf/test.hdf",
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    output_filename=retrofit_prefix,
)

# NOTE(review): nshards=8 is passed only here, not to sharded_retrofit above,
# so this presumably relies on sharded_retrofit's default shard count being 8
# as well -- confirm against utils.retrofit.
join_shards(output_filename=retrofit_prefix, nshards=8, sort=False)

In [7]:
# Read the joined retrofitted frame back from disk and preview its first rows.
retrofitted_df = pd.read_hdf("data/conceptnet_api/retrofit/test_retrofitted")
retrofitted_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/chair_meeting,-4.2211570000000007e-17,6.753685e-16,1.230684e-17,2.004569e-07,-3.139845e-07,6.492718e-08,-5.941687e-16,-7.451524e-16,-3.245492e-16,1.050718e-16,...,-0.029592,0.028524,-0.038507,-0.051363,-0.052692,-0.054144,-0.052064,-0.045782,-0.061692,0.096738
/c/en/chairperson,-4.212794e-17,6.804785e-16,1.3207340000000001e-17,1.889603e-07,-2.959769e-07,6.120348e-08,-6.157436e-16,-6.56382e-16,-3.615849e-16,1.146372e-16,...,-0.027895,0.026888,-0.036298,-0.048417,-0.04967,-0.051038,-0.049078,-0.043156,-0.058154,0.09119
/c/en/chair,-8.906453000000001e-17,6.522849e-16,-2.7179810000000002e-17,1.983456e-07,-3.106775e-07,6.424335e-08,-8.001632e-16,-1.145839e-16,-6.61741e-16,2.461867e-16,...,-0.029281,0.028224,-0.038101,-0.050822,-0.052137,-0.053573,-0.051516,-0.0453,-0.061043,0.09572
/c/en/chairperson/n,-9.113447000000001e-17,6.59468e-16,-2.0270640000000003e-17,1.867658e-07,-2.925395e-07,6.04927e-08,-7.669644e-16,-2.256429e-16,-6.009574e-16,2.290223e-16,...,-0.027571,0.026576,-0.035877,-0.047855,-0.049093,-0.050446,-0.048508,-0.042655,-0.057479,0.090131
/c/en/president/n/wn/person,2.159347e-15,-3.783949e-16,-5.054159e-15,1.601425e-07,-2.508382e-07,5.186957e-08,-3.33826e-15,-2.85979e-16,3.288175e-15,4.581335e-15,...,-0.023641,0.022788,-0.030763,-0.041033,-0.042095,-0.043255,-0.041593,-0.036575,-0.049285,0.077283
