In [1]:
# Re-import changed modules automatically before each cell executes, so edits
# to the local `utils` package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg

from utils.ppmi import build_ppmi
from utils.retrofit import sharded_retrofit, join_shards
from utils.formats import load_hdf, save_hdf

### Vectorization of ConceptNet Data (Adjacency)

In [3]:
# Load the ConceptNet edge extract into a DataFrame.
edge_csv_path = "data/conceptnet_api/csv/edge_extract.csv"
df = pd.read_csv(edge_csv_path)

# Sanity checks: table dimensions first, then the distribution of edge weights.
print(df.shape)
weight_summary = df["weight"].describe()
print(weight_summary)

# Preview the first three rows.
df.head(3)

(4282, 8)
count    4282.000000
mean        1.529992
std         0.916131
min         0.779000
25%         1.000000
50%         1.000000
75%         2.000000
max        13.576303
Name: weight, dtype: float64


Unnamed: 0,end_id,end_label,start_id,start_label,rel_id,surface_text,weight,dataset
0,/c/en/help_child,help a child,/c/en/adult,an adult,/r/CapableOf,[[an adult]] can [[help a child]],3.464102,/d/conceptnet/4/en
1,/c/en/adult,adult,/c/en/man,man,/r/RelatedTo,[[man]] is related to [[adult]],3.062025,/d/verbosity
2,/c/en/sign_contract,sign a contract,/c/en/adult,an adult,/r/CapableOf,[[an adult]] can [[sign a contract]],2.828427,/d/conceptnet/4/en


In [4]:
# Configuration for this run. NDIM is passed to build_ppmi as `ndim`;
# MODEL_NAME is only used here as a label inside the output filenames.
NDIM = 128
MODEL_NAME = "albert"

# Build the PPMI table from the ConceptNet edge extract (project helper).
ppmi_df = build_ppmi(
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    ndim=NDIM,
)

# Write the table to HDF; the retrofitting step reads this same file back.
ppmi_hdf_path = f'data/conceptnet_api/hdf/ppmi-{MODEL_NAME}-{NDIM}.hdf'
save_hdf(ppmi_df, filename=ppmi_hdf_path)

ppmi_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
/c/en/help_child,6.044422000000001e-17,-0.035469,-0.019781,-4.2e-05,0.014495,-2.955393e-16,-0.019826,1.064215e-15,1.631865e-15,-2.877729e-16,...,-0.002325,-0.004004,-0.000241,0.025836,-0.04182,0.005383,-0.001438,0.000221,0.000418,-0.005307
/c/en/adult,-1.463223e-16,-0.061701,-0.052887,-0.00027,0.028143,-4.461336e-16,-0.041698,4.526839e-15,4.342135e-15,-1.064343e-15,...,-0.018061,-0.0304,-0.001878,0.202185,-0.345947,0.045245,-0.012511,0.001947,0.003833,-0.052088
/c/en/man,3.552702e-16,-0.017737,0.121323,-0.000685,-0.19304,5.299669e-15,0.244068,2.599646e-14,-1.820622e-15,-6.285917e-15,...,-0.155744,-0.142276,-0.001892,0.75451,-0.660803,0.098269,-0.024405,0.007568,-0.03928,-0.059049
/c/en/sign_contract,-3.9900040000000004e-17,-0.036922,-0.018067,-1.7e-05,0.014801,-2.414464e-16,-0.019997,9.875624e-16,1.71033e-15,-2.460405e-16,...,-0.002268,-0.003947,-0.000239,0.025528,-0.041064,0.005288,-0.001411,0.000217,0.000418,-0.005244
/c/en/dress_herself,-3.412731e-17,-0.036922,-0.018067,-1.7e-05,0.014801,-2.349064e-16,-0.019997,1.044048e-15,1.714206e-15,-2.550197e-16,...,-0.002268,-0.003947,-0.000239,0.025528,-0.041064,0.005288,-0.001411,0.000217,0.000418,-0.005244


### Retrofitting

In [6]:
NUM_SHARDS = 8  # Number of shard partitions passed to the retrofit helpers

# Name each path once so the calls below cannot drift apart.
ppmi_hdf_path = f"data/conceptnet_api/hdf/ppmi-{MODEL_NAME}-{NDIM}.hdf"
retrofit_output_path = f"data/conceptnet_api/retrofit/retrofitted-{MODEL_NAME}-{NDIM}"

# Retrofit the saved PPMI table against the ConceptNet edges, shard by shard.
sharded_retrofit(
    dense_hdf_filename=ppmi_hdf_path,
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    output_filename=retrofit_output_path,
    nshards=NUM_SHARDS,
)

# Join the shard outputs under the same base path, then load that file
# back for a quick inspection.
join_shards(output_filename=retrofit_output_path, nshards=NUM_SHARDS, sort=False)
df = pd.read_hdf(retrofit_output_path)
print(df.shape)
df.head()

(4081, 128)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
/c/en/help_child,-0.000251,-0.010002,-0.006968,0.000776,0.0041,0.001745,-0.005115,-0.001407,3e-06,0.00256,...,0.030421,-0.043209,-0.044639,0.018009,-0.033039,-0.048886,0.055936,0.052766,0.071421,0.092698
/c/en/adult,-0.000234,-0.007938,-0.006174,0.000723,0.003105,0.001632,-0.003933,-0.001316,2e-06,0.002395,...,0.027635,-0.045798,-0.042103,0.058996,-0.069027,-0.040135,0.050881,0.049507,0.067248,0.077322
/c/en/man,-0.00025,-0.005036,0.014826,0.000771,-0.01862,0.001739,0.024272,-0.001402,3e-06,0.002552,...,-0.007104,-0.081502,-0.044615,0.222954,-0.129706,-0.03082,0.051938,0.053575,0.056078,0.090643
/c/en/sign_contract,-0.000251,-0.010131,-0.006857,0.00078,0.004132,0.001748,-0.005139,-0.00141,3e-06,0.002566,...,0.030492,-0.043298,-0.044735,0.018026,-0.033055,-0.048998,0.056059,0.05288,0.071576,0.092903
/c/en/dress_herself,-0.000251,-0.010131,-0.006857,0.00078,0.004132,0.001748,-0.005139,-0.00141,3e-06,0.002566,...,0.030492,-0.043298,-0.044735,0.018026,-0.033055,-0.048998,0.056059,0.05288,0.071576,0.092903
