In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local `utils` package) are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell executes (see the IPython autoreload docs).
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg

from utils.ppmi import build_ppmi
from utils.retrofit import sharded_retrofit, join_shards
from utils.formats import load_hdf, save_hdf

#### Vectorization of ConceptNet Data (Adjacency)

In [3]:
# Load the ConceptNet edge extract for a quick sanity check.
df = pd.read_csv("data/conceptnet_api/csv/edge_extract.csv")

# Report the frame dimensions followed by summary statistics of the
# `weight` column; one print call with a newline separator emits the same
# text as two consecutive prints.
print(df.shape, df['weight'].describe(), sep="\n")

# Preview the first three rows as the cell's rich output.
df.head(3)

(4282, 8)
count    4282.000000
mean        1.529992
std         0.916131
min         0.779000
25%         1.000000
50%         1.000000
75%         2.000000
max        13.576303
Name: weight, dtype: float64


Unnamed: 0,end_id,end_label,start_id,start_label,rel_id,surface_text,weight,dataset
0,/c/en/chair_meeting,chair a meeting,/c/en/chairperson,A chairperson,/r/CapableOf,[[A chairperson]] can [[chair a meeting]],4.898979,/d/conceptnet/4/en
1,/c/en/chair,chair,/c/en/chairperson/n,chairperson,/r/Synonym,,2.0,/d/wiktionary/en
2,/c/en/president/n/wn/person,president,/c/en/chairperson/n/wn/person,chairperson,/r/Synonym,[[chairperson]] is a synonym of [[president]],2.0,/d/wordnet/3.1


In [4]:
# Embedding configuration; both values are reused by the retrofitting cell.
NDIM = 512  # forwarded to build_ppmi as `ndim`
MODEL_NAME = "mobilebert"  # used below only to tag the output filenames

# Build the PPMI frame from the ConceptNet edge CSV.
ppmi_df = build_ppmi(
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    ndim=NDIM,
)

# Persist the frame; the retrofitting step reads this same path back as its
# `dense_hdf_filename`.
ppmi_hdf_path = f'data/conceptnet_api/hdf/ppmi-{MODEL_NAME}-{NDIM}.hdf'
save_hdf(ppmi_df, filename=ppmi_hdf_path)

# NOTE(review): in the recorded output every value shown for these five rows
# is on the order of 1e-16 (numerically zero) -- confirm this is expected.
ppmi_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
/c/en/chair_meeting,8.226038000000001e-17,-5.654146e-16,1.044566e-15,1.165548e-15,1.168844e-15,-4.229006e-18,-9.007376e-16,-3.258017e-16,-7.34139e-16,4.442363e-17,...,-4.751361e-16,1.473878e-16,-6.404864e-16,-7.306593e-16,-1.502965e-16,-1.692696e-16,3.181518e-16,5.55716e-16,-1.74523e-16,1.016917e-16
/c/en/chairperson,-4.785172e-16,4.934194e-16,-1.038376e-16,-8.419112e-16,-2.066339e-16,-1.268891e-16,-7.795192e-16,1.315562e-15,-6.274334e-16,-7.001291e-16,...,-3.517713e-16,-3.652089e-16,-7.640168e-16,-5.759109e-16,-1.091361e-15,-5.975755e-16,1.806109e-16,8.476623e-16,-8.587455000000001e-17,3.474624e-16
/c/en/chair,-3.346088e-16,7.580059000000001e-17,1.893049e-16,-1.812264e-16,-9.982813e-16,2.288977e-15,9.864325e-16,-7.37609e-16,3.14426e-16,7.498893e-16,...,-2.688973e-16,-2.112381e-16,-5.363475e-16,-4.443658e-16,-6.831589e-16,-3.815402e-16,1.492602e-16,5.806414e-16,-1.511621e-16,4.236036e-17
/c/en/chairperson/n,1.595173e-16,6.679314000000001e-17,2.356024e-17,1.896837e-16,-1.21209e-15,3.04413e-16,6.758283e-17,-6.380305e-16,-3.65421e-16,-2.832168e-16,...,-6.540308e-16,2.003865e-16,-9.274167e-16,-9.98921e-16,-2.860567e-16,-2.818586e-16,4.297314e-16,8.199123e-16,-3.320904e-16,4.008568e-16
/c/en/president/n/wn/person,9.504399e-16,2.936054e-16,-2.049997e-16,5.4423280000000006e-17,-1.426454e-16,-4.60179e-16,2.099735e-16,-1.745398e-16,2.072283e-16,4.0758930000000003e-17,...,-2.751341e-16,2.31254e-16,-2.358741e-16,-5.418867e-16,4.587858e-16,1.166155e-16,-3.280679e-16,1.183527e-16,4.1116430000000005e-17,-4.209583e-16


### Retrofitting

In [5]:
NUM_SHARDS = 8  # Number of sharding partitions

# All three steps below address the same output location, so build it once.
retrofit_output = f"data/conceptnet_api/retrofit/retrofitted-{MODEL_NAME}-{NDIM}"

# Retrofit the saved PPMI vectors against the ConceptNet edge CSV, split
# across NUM_SHARDS shards.
sharded_retrofit(
    dense_hdf_filename=f"data/conceptnet_api/hdf/ppmi-{MODEL_NAME}-{NDIM}.hdf",
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    output_filename=retrofit_output,
    nshards=NUM_SHARDS,
)

# Presumably merges the per-shard results into a single table stored at
# `retrofit_output` -- confirm against utils.retrofit.join_shards.
join_shards(output_filename=retrofit_output, nshards=NUM_SHARDS, sort=False)

# Read the joined result back and preview the first rows.
pd.read_hdf(retrofit_output).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
/c/en/chair_meeting,1.909116e-16,1.894384e-16,7.737913e-16,1.288819e-16,4.873481e-16,-4.535891e-16,-3.794817e-17,1.772721e-16,-2.756901e-16,-9.4e-05,...,-0.024085,-0.023216,0.031341,0.041805,-0.042886,0.044068,0.042375,0.037262,0.050212,-0.078736
/c/en/chairperson,1.780916e-16,2.258107e-16,7.456995e-16,6.782034000000001e-17,4.36367e-16,-4.535279e-16,-2.2906850000000002e-17,2.19204e-16,-2.709898e-16,-9.5e-05,...,-0.024429,-0.023547,0.031788,0.042401,-0.043498,0.044697,0.04298,0.037794,0.050928,-0.07986
/c/en/chair,2.028771e-16,2.36332e-16,7.450249e-16,7.142677000000001e-17,2.497004e-16,-2.472532e-16,1.845068e-16,3.364021e-17,-1.803828e-16,-9.9e-05,...,-0.025512,-0.024591,0.033198,0.044281,-0.045427,0.046679,0.044886,0.03947,0.053186,-0.0834
/c/en/chairperson/n,2.216181e-16,2.424634e-16,7.66011e-16,8.527731e-17,2.816373e-16,-3.512606e-16,1.288799e-16,6.735327e-17,-2.225475e-16,-0.000102,...,-0.026228,-0.025281,0.034129,0.045523,-0.046701,0.047988,0.046145,0.040577,0.054678,-0.08574
/c/en/president/n/wn/person,3.930058e-16,2.392099e-16,6.252828e-16,8.181764000000001e-17,3.376011e-16,-5.115662e-16,1.359494e-16,6.697450000000001e-17,-9.754901000000001e-17,-8.8e-05,...,-0.02259,-0.021774,0.029395,0.039209,-0.040223,0.041331,0.039744,0.034948,0.047094,-0.073847
