In [32]:
import pandas as pd

## Definitions

In [33]:
def expand_name(name: str) -> tuple[str, str, str]:
    parts = name.removeprefix('__').split('_')
    return ' '.join(parts[:-2]), parts[-2], parts[-1]

In [34]:
definitions_df = pd.read_csv(
    'wordnet-mlj12/wordnet-mlj12-definitions.txt',
    names=['synset_id', 'name', 'definition'],
    index_col='synset_id',
    sep='\t',
)
definitions_df[['word', 'POS_tag', 'sense_index']] = pd.DataFrame(
    definitions_df.pop('name').apply(expand_name).to_list(), 
    index=definitions_df.index
)
definitions_df.head()

Unnamed: 0_level_0,definition,word,POS_tag,sense_index
synset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14854262,solid excretory product evacuated from the bow...,stool,NN,2
590383,the position of chieftain,chieftainship,NN,1
8769179,an area in Germany around the upper Elbe river...,saxony,NN,1
2338145,beaver-like aquatic rodent of North America wi...,ondatra zibethica,NN,1
1990168,sink below the surface,founder,VB,2


In [38]:
definitions_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40943 entries, 14854262 to 10227266
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   definition   40943 non-null  object
 1   word         40943 non-null  object
 2   POS_tag      40943 non-null  object
 3   sense_index  40943 non-null  object
dtypes: object(4)
memory usage: 1.6+ MB


In [35]:
definitions_df.describe()

Unnamed: 0,definition,word,POS_tag,sense_index
count,40943,40943,40943,40943
unique,40728,32547,4,46
top,one species,run,NN,1
freq,11,33,32093,29235


In [36]:
definitions_df[definitions_df['word'] == 'run']

Unnamed: 0_level_0,definition,word,POS_tag,sense_index
synset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7460104,"a race run on foot; ""she broke the record for ...",run,NN,3
7407777,the pouring forth of a fluid,run,NN,14
2099829,"move about freely and without restraint, or ac...",run,VB,11
334803,"come unraveled or undone as if by snagging; ""H...",run,VB,40
1086103,"compete in a race; ""he is running the Marathon...",run,VB,37
1212230,"pass over, across, or through; ""He ran his eye...",run,VB,22
1927447,run with the ball; in such sports as football,run,VB,33
1526290,"be operating, running or functioning; ""The car...",run,VB,13
1641545,"carry out; ""run an errand""",run,VB,21
7443010,"a row of unravelled stitches; ""she got a run i...",run,NN,13


In [37]:
definitions_df[definitions_df['definition'].str.contains('one species')]

Unnamed: 0_level_0,definition,word,POS_tag,sense_index
synset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1487743,small-toothed sharks comprising only one speci...,rhincodontidae,NN,1
11659500,one species,sundacarpus,NN,1
12257920,one species: one-flowered wintergreen; sometim...,moneses,NN,1
11620248,one species: golden larch,pseudolarix,NN,1
12380197,one species,genus idesia,NN,1
...,...,...,...,...
12533588,one species: salt tree,halimodendron,NN,1
12116267,a monocotyledonous grass of the family Gramine...,genus dactylis,NN,1
11773860,one species: oleander,nerium,NN,1
11907267,one species: creamcups,platystemon,NN,1


## Train/Test/Validation

In [50]:
def read_df(scope: str) -> pd.DataFrame:
    return pd.read_csv(
        f'wordnet-mlj12/wordnet-mlj12-{scope}.txt',
        names=['left_synset_id', 'relation', 'right_synset_id'],
        sep='\t',
    )

In [52]:
triplets_df = pd.concat([
    read_df('train'),
    read_df('test'),
    read_df('valid'),
])
triplets_df.head()

Unnamed: 0,left_synset_id,relation,right_synset_id
0,3964744,_hyponym,4371774
1,260881,_hypernym,260622
2,2199712,_member_holonym,2188065
3,1332730,_derivationally_related_form,3122748
4,6066555,_derivationally_related_form,645415


In [53]:
triplets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151442 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   left_synset_id   151442 non-null  int64 
 1   relation         151442 non-null  object
 2   right_synset_id  151442 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


In [54]:
triplets_df.groupby('relation')['relation'].count().sort_values(ascending=False)

relation
_hypernym                       37221
_hyponym                        37221
_derivationally_related_form    31867
_member_holonym                  7928
_member_meronym                  7928
_part_of                         5148
_has_part                        5142
_member_of_domain_topic          3341
_synset_domain_topic_of          3335
_instance_hypernym               3150
_instance_hyponym                3150
_also_see                        1396
_verb_group                      1220
_member_of_domain_region          983
_synset_domain_region_of          982
_member_of_domain_usage           675
_synset_domain_usage_of           669
_similar_to                        86
Name: relation, dtype: int64

In [55]:
dist_synset_ids = set(triplets_df.left_synset_id) | set(triplets_df.right_synset_id)
len(dist_synset_ids)

40943