# MOFs LM Probing for hasMetals

In [1]:
from baseline import PromptSet, logger as baseline_logger
from evaluate import evaluate_per_sr_pair, combine_scores_per_relation

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [2]:
file = "../data/mof_metals.csv"

In [3]:
df = pd.read_csv(file)
df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,ObjectEntityName
0,MROF-1,hasMetals,[In],"In,Indium"
1,MOF-CJ3,hasMetals,[O][Zn][O]([Zn][O])[Zn],"O,Zn,O,Zn,O,Zn,Zinc,Zinc,Zinc"
2,"catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-t...",hasMetals,[O][Cu][Cu][O],"O,Cu,Cu,O,Copper,Copper"
3,Spirof-MOF,hasMetals,*,
4,NJU-Bai9,hasMetals,[O][Zn],"O,Zn,Zinc"


In [4]:
df.shape

(1795, 4)

## Probing BERT

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [6]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [7]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [8]:
def create_metal_prompt1(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} contains {mask} which is an metal."
def create_metal_prompt2(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} contains an metal which is {mask}."
def create_metal_prompt3(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} is a metal organic framework. {sub} contains {mask} which is an metal."
def create_metal_prompt4(sub, rel, mask):
    if rel == 'hasMetals':
        return f"As a metal organic framework, {sub} contains an metal which is {mask}."
def create_metal_prompt5(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} is a MOF. {sub} has SBU metal {mask}."
def create_metal_prompt6(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} is MOF. {sub} contains {mask}." 
def create_metal_prompt7(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} contains {mask}."
def create_metal_prompt8(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} is a metal organic framework. {sub} contains {mask}."
def create_metal_prompt9(sub, rel, mask):
    if rel == 'hasMetals':
        return f"The SBU of {sub} is {mask}."
def create_metal_prompt10(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} is a metal organic framework structure composed of metal cluster {mask}."
def create_metal_prompt11(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{sub} is a metal organic framework. {sub} has SBU metal {mask}."
def create_metal_prompt12(sub, rel, mask):
    if rel == 'hasMetals':
        return f"{mask} was used as metal center in the synthesized {sub} material."

In [9]:
processors = [create_metal_prompt1, create_metal_prompt2,
              create_metal_prompt3, create_metal_prompt4,
              create_metal_prompt5, create_metal_prompt6,
              create_metal_prompt7, create_metal_prompt8,
              create_metal_prompt9, create_metal_prompt10,
              create_metal_prompt11, create_metal_prompt12
             ]

In [10]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token, \
                 subjectCol, relationCol, objectCol, top_k=100):

    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    prompts = PromptSet([prompt_processor(row.SubjectEntity, row.Relation, mask_token) for _, row in df_sub.iterrows()])

    outputs = []
    for out in tqdm(pipe(prompts, batch_size=8), total=len(prompts)):
        outputs.append(out)

    results = []
    for idx, row in df_sub.iterrows():
        for output in outputs[idx]:
            record = {}
            record['SubjectEntity'] = row[subjectCol]
            record['Relation'] = row[relationCol]
            record['ObjectEntity'] = row[objectCol]
            record['prompt']= prompts[idx]
            record['predictedScore'] = output['score']
            record['predictedToken'] = output['token_str']
            results.append(record)

    return pd.DataFrame(results)

In [11]:
# Compute a column indicating whether the predictedToken in the ObjectEntity list
def hasPredicted(row):
    token = row['predictedToken']
    objectEntities = row['ObjectEntity']
    #print(objectEntities)
    if pd.notna(objectEntities):
        objectList = objectEntities.split(",")
        if token in objectList:
            return 1
        else:
            return 0
    else:
        return 0

In [12]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    return df.sort_values(by=col, ascending=False)[:n]

In [13]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    total = df[col].sum()
    if total > 0:
        return 1
    else:
        return 0

In [14]:
# Assume the predicted_df has a binary column 'predicted' indicating whether ObjectEntity has been predicted
# 1 for yes, 0 for no
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    topk_df = predicted_df.groupby('SubjectEntity', as_index=False).apply(top, col='predictedScore', n=at_k)
    topk_agg_df = topk_df.groupby('SubjectEntity', as_index=False).apply(predictedOne, col='predicted')
    topk_agg_df.columns = ['SubjectEntity', 'predicted']
    result = {}
    result['Relation'] = relation
    result['prompt_template'] = prompt_template
    result['acc'] = topk_agg_df.predicted.mean()
    result['at_k'] = at_k

    return result

In [15]:
pd.set_option('display.max_colwidth', None)

In [16]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

CPU times: user 4h 11min 33s, sys: 4min 40s, total: 4h 16min 13s
Wall time: 20min


In [17]:
result_df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken,predicted
0,MROF-1,hasMetals,"In,Indium",[MASK] was used as metal center in the synthesized MROF-1 material.,0.810544,It,0
1,MROF-1,hasMetals,"In,Indium",[MASK] was used as metal center in the synthesized MROF-1 material.,0.032774,This,0
2,MROF-1,hasMetals,"In,Indium",[MASK] was used as metal center in the synthesized MROF-1 material.,0.010165,it,0
3,MROF-1,hasMetals,"In,Indium",[MASK] was used as metal center in the synthesized MROF-1 material.,0.00748,Silicon,0
4,MROF-1,hasMetals,"In,Indium",[MASK] was used as metal center in the synthesized MROF-1 material.,0.005661,.,0


In [18]:
result_df.shape

(179500, 7)

In [19]:
result_df.to_csv("../data/predicted_hasMetals_BERT.csv", index=None)

In [20]:
hasMetals_df = pd.DataFrame(acc_list)

In [21]:
hasMetals_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
4,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.409823,50
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.016908,1
21,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.008857,20
8,hasMetals,{SubjectEntity} contains an metal which is [MASK].,0.00161,10
26,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,10
55,hasMetals,{SubjectEntity} is a metal organic framework structure composed of metal cluster [MASK].,0.0,5
37,hasMetals,{SubjectEntity} contains [MASK].,0.0,5
59,hasMetals,{SubjectEntity} is a metal organic framework structure composed of metal cluster [MASK].,0.0,100
29,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,100
69,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.255233,20


In [22]:
hasMetals_df.to_csv("../data/predicted_hasMetals_BERT_eval.csv", index=None)

In [23]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.016908,1
0,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.013688,1
66,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.009662,1
30,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.000805,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.000805,1
6,hasMetals,{SubjectEntity} contains an metal which is [MASK].,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,1
24,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,1
36,hasMetals,{SubjectEntity} contains [MASK].,0.0,1
48,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,1


In [24]:
hasMetals_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
5,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.530596,100
17,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.480676,100
4,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.409823,50
47,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.408213,100
71,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.361514,100
...,...,...,...,...
25,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.000000,5
26,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.000000,10
27,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.000000,20
28,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.000000,50


## Probing MatBERT

In [25]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/matbert-base-cased")

In [26]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [27]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [28]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

CPU times: user 1h 29min 13s, sys: 1min 1s, total: 1h 30min 15s
Wall time: 8min 7s


In [29]:
hasMetals_df = pd.DataFrame(acc_list)

In [30]:
hasMetals_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
49,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,5
53,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,100
63,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} has SBU metal [MASK].,0.0,20
19,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,5
13,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.204509,5
40,hasMetals,{SubjectEntity} contains [MASK].,0.223027,50
37,hasMetals,{SubjectEntity} contains [MASK].,0.039452,5
9,hasMetals,{SubjectEntity} contains an metal which is [MASK].,0.087762,20
21,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.008052,20
68,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.547504,10


In [31]:
result_df.to_csv("../data/predicted_hasMetals_MatBERT.csv", index=None)

In [32]:
hasMetals_df.to_csv("../data/predicted_hasMetals_MatBERT_eval.csv", index=None)

In [33]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
66,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.191626,1
0,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.049919,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.019324,1
6,hasMetals,{SubjectEntity} contains an metal which is [MASK].,0.00161,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,1
24,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.0,1
36,hasMetals,{SubjectEntity} contains [MASK].,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.0,1
48,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,1


In [34]:
hasMetals_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
71,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.901771,100
5,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.842190,100
17,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.830918,100
70,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.781804,50
4,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.719002,50
...,...,...,...,...
27,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.000000,20
28,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.000000,50
29,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.000000,100
30,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.000000,1


## Probing MatSciBERT

In [35]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [36]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [37]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [38]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

CPU times: user 1h 31min 55s, sys: 41.9 s, total: 1h 32min 37s
Wall time: 8min 18s


In [39]:
hasMetals_df = pd.DataFrame(acc_list)

In [40]:
hasMetals_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
34,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.0,50
61,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} has SBU metal [MASK].,0.0,5
28,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,50
63,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} has SBU metal [MASK].,0.0,20
4,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.0,50
26,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,10
55,hasMetals,{SubjectEntity} is a metal organic framework structure composed of metal cluster [MASK].,0.0,5
64,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} has SBU metal [MASK].,0.0,50
58,hasMetals,{SubjectEntity} is a metal organic framework structure composed of metal cluster [MASK].,0.0,50
70,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.0,50


In [41]:
result_df.to_csv("../data/predicted_hasMetals_MatSciBERT.csv", index=None)

In [42]:
hasMetals_df.to_csv("../data/predicted_hasMetals_MatSciBERT_eval.csv", index=None)

In [43]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.0,1
6,hasMetals,{SubjectEntity} contains an metal which is [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,1
24,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.0,1
36,hasMetals,{SubjectEntity} contains [MASK].,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.0,1
48,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is a metal organic framework structure composed of metal cluster [MASK].,0.0,1


In [44]:
hasMetals_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.0,1
1,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.0,5
52,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,50
51,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,20
50,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,10
...,...,...,...,...
23,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,100
22,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,50
21,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,20
20,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,10


## Probing SciBERT

In [45]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [46]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [47]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [48]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

CPU times: user 1h 32min 49s, sys: 21.1 s, total: 1h 33min 10s
Wall time: 8min 21s


In [49]:
hasMetals_df = pd.DataFrame(acc_list)

In [50]:
hasMetals_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
24,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,1
34,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.019324,50
31,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.00161,5
41,hasMetals,{SubjectEntity} contains [MASK].,0.168277,100
64,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} has SBU metal [MASK].,0.0,50
52,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,50
39,hasMetals,{SubjectEntity} contains [MASK].,0.033816,20
19,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.000805,5
20,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.044283,10
15,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.301127,20


In [51]:
result_df.to_csv("../data/predicted_hasMetals_SciBERT.csv", index=None)

In [52]:
hasMetals_df.to_csv("../data/predicted_hasMetals_SciBERT_eval.csv", index=None)

In [53]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
66,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.098229,1
0,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.008052,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.002415,1
36,hasMetals,{SubjectEntity} contains [MASK].,0.000805,1
6,hasMetals,{SubjectEntity} contains an metal which is [MASK].,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,1
24,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.0,1
48,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,1


In [54]:
hasMetals_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
71,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.858293,100
17,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.733494,100
70,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.716586,50
5,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.702899,100
23,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.621578,100
...,...,...,...,...
52,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.000000,50
51,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.000000,20
50,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.000000,10
49,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.000000,5


## Probing RoBERTa

In [55]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [56]:
mask_token = tokenizer.mask_token
mask_token

'<mask>'

## Creat the Prompt Pipeline

In [57]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [58]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

  0%|          | 0/1795 [00:00<?, ?it/s]

CPU times: user 4h 7min 59s, sys: 9.64 s, total: 4h 8min 9s
Wall time: 19min 21s


In [59]:
hasMetals_df = pd.DataFrame(acc_list)

In [60]:
hasMetals_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
4,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.0,50
24,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,1
22,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,50
46,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.0,50
60,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} has SBU metal [MASK].,0.0,1
5,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.004831,100
63,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} has SBU metal [MASK].,0.0,20
44,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.0,10
25,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,5
70,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.211755,50


In [61]:
result_df.to_csv("../data/predicted_hasMetals_RoBERTa.csv", index=None)

In [62]:
hasMetals_df.to_csv("../data/predicted_hasMetals_RoBERTa_eval.csv", index=None)

In [63]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
66,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.012077,1
0,hasMetals,{SubjectEntity} contains [MASK] which is an metal.,0.0,1
6,hasMetals,{SubjectEntity} contains an metal which is [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK] which is an metal.,0.0,1
18,hasMetals,"As a metal organic framework, {SubjectEntity} contains an metal which is [MASK].",0.0,1
24,hasMetals,{SubjectEntity} is a MOF. {SubjectEntity} has SBU metal [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.0,1
36,hasMetals,{SubjectEntity} contains [MASK].,0.0,1
42,hasMetals,{SubjectEntity} is a metal organic framework. {SubjectEntity} contains [MASK].,0.0,1
48,hasMetals,The SBU of {SubjectEntity} is [MASK].,0.0,1


In [64]:
hasMetals_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
71,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.475040,100
70,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.211755,50
69,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.105475,20
68,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.093398,10
67,hasMetals,[MASK] was used as metal center in the synthesized {SubjectEntity} material.,0.062802,5
...,...,...,...,...
32,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.000000,10
33,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.000000,20
34,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.000000,50
35,hasMetals,{SubjectEntity} is MOF. {SubjectEntity} contains [MASK].,0.000000,100
