# MOFs LM Probing for hasFamily

In [1]:
from baseline import PromptSet

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [2]:
file = "../data/mof_family.csv"

In [3]:
df = pd.read_csv(file)
df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity
0,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide
1,Spirof-MOF,hasMOFFamily,Zr-oxide/zirconium oxide/UiO-66
2,MOF-1,hasMOFFamily,Zn-oxide/zinc oxide
3,URMOF-1,hasMOFFamily,Cu-Cu paddleWheels/HKUST-1
4,URMOF-2,hasMOFFamily,Zn-oxide/zinc oxide


In [4]:
df.shape

(832, 3)

## Probing BERT

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [6]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [7]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [8]:
def create_family_prompt1(sub, mask):
    return f"{sub} is a class of {mask} metal organic frameworks."
    
def create_family_prompt2(sub, mask):
    return f"{sub} is a metal organic framework with {mask} structure."

def create_family_prompt3(sub, mask):
    return f"{sub} is a metal organic framework. {sub} is a class of {mask}."

def create_family_prompt4(sub, mask):
    return f"As a metal organic framework, {sub} is in the MOF family {mask}."
    
def create_family_prompt5(sub, mask):
    return f"The target MOF family of {sub} is {mask}."
    
def create_family_prompt6(sub, mask):
    return f"{sub} is from the MOF family {mask}." 
    
def create_family_prompt7(sub, mask):
    return f"{sub} is an {mask}-like MOF."
    
def create_family_prompt8(sub, mask):
    return f"{sub} is a metal organic framework. {sub} is an type of {mask} material."
    
def create_family_prompt9(sub, mask):
    return f"Base on the SBUs and their connection to the organic linkers, the MOF family of {sub} is {mask}."
    
def create_family_prompt10(sub, mask):
    return f"{sub} is a metal organic framework. {sub} is {mask}-like structure."
    
def create_family_prompt11(sub, mask):
    return f"{sub} is a type of {mask} MOF."
    
def create_family_prompt12(sub, mask):
    return f"As a metal organic framework or MOF, {sub} has the similar structure as {mask}."

In [9]:
processors = [create_family_prompt1, create_family_prompt2,
              create_family_prompt3, create_family_prompt4,
              create_family_prompt5, create_family_prompt6,
              create_family_prompt7, create_family_prompt8,
              create_family_prompt9, create_family_prompt10,
              create_family_prompt11, create_family_prompt12
             ]

## Test Small

In [10]:
def create_family_prompt1(sub, mask):
    return f"{sub} is a class of {mask} metal organic frameworks."
    
def create_family_prompt2(sub, mask):
    return f"{sub} is a metal organic framework with {mask} structure."

In [11]:
processors = [create_family_prompt1, create_family_prompt2]

In [12]:
processors[0]('MOF-1', '[MASK]')

'MOF-1 is a class of [MASK] metal organic frameworks.'

In [13]:
mofs_sub =df[:10]

In [14]:
prompts = PromptSet([processors[0](row.SubjectEntity, mask_token) for _, row in mofs_sub.iterrows()])

In [15]:
prompts.__getitem__(9)

'TMU-23 is a class of [MASK] metal organic frameworks.'

In [16]:
for i in prompts:
    print(i)

MOF-CJ3 is a class of [MASK] metal organic frameworks.
Spirof-MOF is a class of [MASK] metal organic frameworks.
MOF-1 is a class of [MASK] metal organic frameworks.
URMOF-1 is a class of [MASK] metal organic frameworks.
URMOF-2 is a class of [MASK] metal organic frameworks.
UMCM-151 is a class of [MASK] metal organic frameworks.
UMCM-154 is a class of [MASK] metal organic frameworks.
JUC-199 is a class of [MASK] metal organic frameworks.
TMU-22 is a class of [MASK] metal organic frameworks.
TMU-23 is a class of [MASK] metal organic frameworks.


In [17]:
# Run the model
outputs = []
for out in tqdm(pipe(prompts, batch_size=8), total=len(prompts)):
    outputs.append(out)

  0%|          | 0/10 [00:00<?, ?it/s]

In [57]:
outputs

[[{'score': 0.9558262228965759,
   'token': 6468,
   'token_str': 'transition',
   'sequence': 'MOF - CJ3 is a class of transition metal organic frameworks.'},
  {'score': 0.013618923723697662,
   'token': 2302,
   'token_str': 'heavy',
   'sequence': 'MOF - CJ3 is a class of heavy metal organic frameworks.'},
  {'score': 0.005631731823086739,
   'token': 23512,
   'token_str': 'transitional',
   'sequence': 'MOF - CJ3 is a class of transitional metal organic frameworks.'},
  {'score': 0.0025018223095685244,
   'token': 19159,
   'token_str': 'zinc',
   'sequence': 'MOF - CJ3 is a class of zinc metal organic frameworks.'},
  {'score': 0.002342591993510723,
   'token': 20403,
   'token_str': 'nickel',
   'sequence': 'MOF - CJ3 is a class of nickel metal organic frameworks.'},
  {'score': 0.00182063365355134,
   'token': 3216,
   'token_str': 'mixed',
   'sequence': 'MOF - CJ3 is a class of mixed metal organic frameworks.'},
  {'score': 0.0012487806379795074,
   'token': 8649,
   'token_

In [18]:
results = []
for idx, row in mofs_sub.iterrows():
    for output in outputs[idx]:
        record = {}
        record['SubjectEntity'] = row['SubjectEntity']
        record['Relation'] = row['Relation']
        record['ObjectEntity'] = row['ObjectEntity']
        record['prompt']= prompts[idx]
        record['predictedScore'] = output['score']
        record['predictedToken'] = output['token_str']
        results.append(record)

In [19]:
predicted_df = pd.DataFrame(results)
predicted_df

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken
0,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.955826,transition
1,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.013619,heavy
2,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.005632,transitional
3,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.002502,zinc
4,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.002343,nickel
...,...,...,...,...,...,...
995,TMU-23,hasMOFFamily,Zn-oxide/zinc oxide,TMU-23 is a class of [MASK] metal organic fram...,0.000010,multiple
996,TMU-23,hasMOFFamily,Zn-oxide/zinc oxide,TMU-23 is a class of [MASK] metal organic fram...,0.000010,synthetic
997,TMU-23,hasMOFFamily,Zn-oxide/zinc oxide,TMU-23 is a class of [MASK] metal organic fram...,0.000010,modern
998,TMU-23,hasMOFFamily,Zn-oxide/zinc oxide,TMU-23 is a class of [MASK] metal organic fram...,0.000010,21


### Evaluate on Small

In [20]:
predicted_df.loc[10]['ObjectEntity'].split("/")

['Zn-oxide', 'zinc oxide']

In [21]:
# Compute a column indicating whether the predictedToken in the ObjectEntity list
def hasPredicted(row):
    token = row['predictedToken']
    objectEntities = row['ObjectEntity']
    #print(objectEntities)
    if pd.notna(objectEntities):
        families = objectEntities.split("/") # get individual family
        for family in families: # for each family
            family_words = family.lower().split() # get the individual words in the family
            if token.lower() in family_words: # check if a family word was predicted
                return 1
       
    return 0

In [22]:
s = "Cu-Cu paddleWheels/HKUST-1"
sl = s.split("/")

In [24]:
ss = 'Cu-Cu'
for e in sl:
    el = e.lower().split()
    print(el)
    if ss.lower() in el:
        print(True)

['cu-cu', 'paddlewheels']
True
['hkust-1']


In [25]:
predicted_df['predicted'] = predicted_df.apply(hasPredicted, axis=1)

In [26]:
predicted_df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken,predicted
0,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.955826,transition,0
1,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.013619,heavy,0
2,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.005632,transitional,0
3,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.002502,zinc,1
4,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.002343,nickel,0


In [27]:
predicted_df['predicted'].sum()

15

In [28]:
predicted_df[predicted_df.predicted == 1]

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken,predicted
3,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,0.002502,zinc,1
47,MOF-CJ3,hasMOFFamily,Zn-oxide/zinc oxide,MOF-CJ3 is a class of [MASK] metal organic fra...,8.1e-05,oxide,1
130,Spirof-MOF,hasMOFFamily,Zr-oxide/zirconium oxide/UiO-66,Spirof-MOF is a class of [MASK] metal organic ...,0.000126,oxide,1
204,MOF-1,hasMOFFamily,Zn-oxide/zinc oxide,MOF-1 is a class of [MASK] metal organic frame...,0.001852,zinc,1
228,MOF-1,hasMOFFamily,Zn-oxide/zinc oxide,MOF-1 is a class of [MASK] metal organic frame...,0.000188,oxide,1
407,URMOF-2,hasMOFFamily,Zn-oxide/zinc oxide,URMOF-2 is a class of [MASK] metal organic fra...,0.001419,zinc,1
433,URMOF-2,hasMOFFamily,Zn-oxide/zinc oxide,URMOF-2 is a class of [MASK] metal organic fra...,0.000224,oxide,1
603,UMCM-154,hasMOFFamily,Zn-oxide/zinc oxide,UMCM-154 is a class of [MASK] metal organic fr...,0.002729,zinc,1
631,UMCM-154,hasMOFFamily,Zn-oxide/zinc oxide,UMCM-154 is a class of [MASK] metal organic fr...,0.000195,oxide,1
702,JUC-199,hasMOFFamily,Zn-oxide/zinc oxide,JUC-199 is a class of [MASK] metal organic fra...,0.006043,zinc,1


In [29]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    return df.sort_values(by=col, ascending=False)[:n]

In [30]:
topk_df = predicted_df.groupby('SubjectEntity', as_index=False).apply(top, col='predictedScore', n=100)

In [31]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    total = df[col].sum()
    if total > 0:
        return 1
    else:
        return 0

In [32]:
topk_agg_df = topk_df.groupby('SubjectEntity', as_index=False).apply(predictedOne, col='predicted')

In [33]:
topk_agg_df.columns = ['SubjectEntity', 'predicted']
topk_agg_df

Unnamed: 0,SubjectEntity,predicted
0,JUC-199,1
1,MOF-1,1
2,MOF-CJ3,1
3,Spirof-MOF,1
4,TMU-22,1
5,TMU-23,1
6,UMCM-151,0
7,UMCM-154,1
8,URMOF-1,0
9,URMOF-2,1


In [34]:
topk_agg_df.predicted.mean()

0.8

## Test BERT

In [10]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token, \
                 subjectCol, relationCol, objectCol, top_k=100):

    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    prompts = PromptSet([prompt_processor(row.SubjectEntity, mask_token) for _, row in df_sub.iterrows()])

    outputs = []
    for out in tqdm(pipe(prompts, batch_size=8), total=len(prompts)):
        outputs.append(out)

    results = []
    for idx, row in df_sub.iterrows():
        for output in outputs[idx]:
            record = {}
            record['SubjectEntity'] = row[subjectCol]
            record['Relation'] = row[relationCol]
            record['ObjectEntity'] = row[objectCol]
            record['prompt']= prompts[idx]
            record['predictedScore'] = output['score']
            record['predictedToken'] = output['token_str']
            results.append(record)

    return pd.DataFrame(results)

In [11]:
# Compute a column indicating whether the predictedToken in the ObjectEntity list
def hasPredicted(row):
    token = row['predictedToken']
    objectEntities = row['ObjectEntity']
    #print(objectEntities)
    if pd.notna(objectEntities):
        families = objectEntities.split("/") # get individual family
        for family in families: # for each family
            family_words = family.lower().split() # get the individual words in the family
            if token.lower() in family_words: # check if a family word was predicted
                return 1
       
    return 0

In [12]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    return df.sort_values(by=col, ascending=False)[:n]

In [13]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    total = df[col].sum()
    if total > 0:
        return 1
    else:
        return 0

In [14]:
# Assume the predicted_df has a binary column 'predicted' indicating whether ObjectEntity has been predicted
# 1 for yes, 0 for no
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    topk_df = predicted_df.groupby('SubjectEntity', as_index=False).apply(top, col='predictedScore', n=at_k)
    topk_agg_df = topk_df.groupby('SubjectEntity', as_index=False).apply(predictedOne, col='predicted')
    topk_agg_df.columns = ['SubjectEntity', 'predicted']
    result = {}
    result['Relation'] = relation
    result['prompt_template'] = prompt_template
    result['acc'] = topk_agg_df.predicted.mean()
    result['at_k'] = at_k

    return result

In [15]:
pd.set_option('display.max_colwidth', None)

In [16]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

CPU times: user 1h 28min 40s, sys: 55.4 s, total: 1h 29min 35s
Wall time: 7min 1s


In [17]:
hasFamily_df = pd.DataFrame(acc_list)

In [18]:
hasFamily_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
28,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,50
69,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.382653,20
1,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.357143,5
56,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.022959,10
11,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.084184,100
48,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,1
7,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,5
43,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.002551,5
35,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,100
14,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.002551,10


In [19]:
result_df.to_csv("../data/predicted_hasFamily_BERT.csv", index=None)

In [20]:
hasFamily_df.to_csv("../data/predicted_hasFamily_BERT_eval.csv", index=None)

In [21]:
hasFamily_df[hasFamily_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
60,hasMetals,{SubjectEntity} is a type of [MASK] MOF.,0.005102,1
66,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.005102,1
36,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.002551,1
0,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,1
6,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,1
24,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,1


In [22]:
hasFamily_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
5,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.653061,100
17,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.632653,100
4,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.627551,50
71,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.602041,100
59,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.540816,100
...,...,...,...,...
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.000000,1
31,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,5
32,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,10
33,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,20


## Probing MatBERT

In [23]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/matbert-base-cased")

In [24]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [25]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [26]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

CPU times: user 32min 14s, sys: 7.37 s, total: 32min 21s
Wall time: 2min 56s


In [27]:
hasFamily_df = pd.DataFrame(acc_list)

In [28]:
hasFamily_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
1,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,5
37,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.0,5
67,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.0,5
50,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,10
52,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,50
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.0,1
60,hasMetals,{SubjectEntity} is a type of [MASK] MOF.,0.002551,1
34,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,50
35,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.002551,100


In [29]:
result_df.to_csv("../data/predicted_hasFamily_MatBERT.csv", index=None)

In [30]:
hasFamily_df.to_csv("../data/predicted_hasFamily_MatBERT_eval.csv", index=None)

In [31]:
hasFamily_df[hasFamily_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
60,hasMetals,{SubjectEntity} is a type of [MASK] MOF.,0.002551,1
0,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,1
6,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,1
24,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,1
48,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,1


In [32]:
hasFamily_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
47,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.306122,100
59,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.198980,100
5,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.147959,100
46,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.125000,50
17,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.125000,100
...,...,...,...,...
33,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,20
34,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,50
1,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.000000,5
37,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.000000,5


## Probing MatSciBERT

In [33]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [34]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [35]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [36]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

CPU times: user 32min 35s, sys: 1.51 s, total: 32min 36s
Wall time: 2min 57s


In [37]:
hasFamily_df = pd.DataFrame(acc_list)

In [38]:
hasFamily_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
60,hasMetals,{SubjectEntity} is a type of [MASK] MOF.,0.0,1
6,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,1
47,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.507653,100
50,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,10
32,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,10
4,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.094388,50
0,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,1
39,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.234694,20
40,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.510204,50
57,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.0,20


In [39]:
result_df.to_csv("../data/predicted_hasFamily_MatSciBERT.csv", index=None)

In [40]:
hasFamily_df.to_csv("../data/predicted_hasFamily_MatSciBERT_eval.csv", index=None)

In [41]:
hasFamily_df[hasFamily_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,1
6,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,1
24,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,1
48,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,1
54,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.0,1


In [42]:
hasFamily_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
41,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.533163,100
40,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.510204,50
47,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.507653,100
46,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.242347,50
39,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.234694,20
...,...,...,...,...
32,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,10
33,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,20
34,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,50
35,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,100


## Probing SciBERT

In [43]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [44]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [45]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [46]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

CPU times: user 32min 49s, sys: 1.96 s, total: 32min 51s
Wall time: 2min 58s


In [47]:
hasFamily_df = pd.DataFrame(acc_list)

In [48]:
hasFamily_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
4,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.002551,50
46,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.515306,50
21,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,20
64,hasMetals,{SubjectEntity} is a type of [MASK] MOF.,0.030612,50
23,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,100
45,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.158163,20
28,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,50
1,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,5
47,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.538265,100
60,hasMetals,{SubjectEntity} is a type of [MASK] MOF.,0.0,1


In [49]:
result_df.to_csv("../data/predicted_hasFamily_SciBERT.csv", index=None)

In [50]:
hasFamily_df.to_csv("../data/predicted_hasFamily_SciBERT_eval.csv", index=None)

In [51]:
hasFamily_df[hasFamily_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
66,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.002551,1
0,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,1
6,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,1
24,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,1
48,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,1


In [52]:
hasFamily_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
17,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.635204,100
59,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.591837,100
47,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.538265,100
46,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.515306,50
58,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.382653,50
...,...,...,...,...
34,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,50
35,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.000000,100
1,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.000000,5
37,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.000000,5


## Probing RoBERTa

In [53]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [54]:
mask_token = tokenizer.mask_token
mask_token

'<mask>'

## Creat the Prompt Pipeline

In [55]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [56]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

  0%|          | 0/832 [00:00<?, ?it/s]

CPU times: user 1h 29min 38s, sys: 3.76 s, total: 1h 29min 41s
Wall time: 7min 1s


In [57]:
hasFamily_df = pd.DataFrame(acc_list)

In [58]:
hasFamily_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
28,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,50
46,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,50
66,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.0,1
10,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,50
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,1
67,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.0,5
70,hasMetals,"As a metal organic framework or MOF, {SubjectEntity} has the similar structure as [MASK].",0.0,50
52,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,50
15,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.0,20
61,hasMetals,{SubjectEntity} is a type of [MASK] MOF.,0.0,5


In [59]:
result_df.to_csv("../data/predicted_hasFamily_RoBERTa.csv", index=None)

In [60]:
hasFamily_df.to_csv("../data/predicted_hasFamily_RoBERTa_eval.csv", index=None)

In [61]:
hasFamily_df[hasFamily_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,1
6,hasMetals,{SubjectEntity} is a metal organic framework with [MASK] structure.,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is a class of [MASK].,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,1
24,hasMetals,The target MOF family of {SubjectEntity} is [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is from the MOF family [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an [MASK]-like MOF.,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is an type of [MASK] material.,0.0,1
48,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,1
54,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} is [MASK]-like structure.,0.0,1


In [62]:
hasFamily_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,1
1,hasMetals,{SubjectEntity} is a class of [MASK] metal organic frameworks.,0.0,5
52,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,50
51,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,20
50,hasMetals,"Base on the SBUs and their connection to the organic linkers, the MOF family of {SubjectEntity} is [MASK].",0.0,10
...,...,...,...,...
23,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,100
22,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,50
21,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,20
20,hasMetals,"As a metal organic framework, {SubjectEntity} is in the MOF family [MASK].",0.0,10
