# MOFs LM Probing for Space Group

In [1]:
from baseline import PromptSet

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [2]:
file = "../data/mof_spaceGroup.csv"

In [3]:
df = pd.read_csv(file)
df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity
0,MROF-1,hasSpaceGroup,P6/mcc
1,MOF-CJ3,hasSpaceGroup,I4cm
2,"catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-t...",hasSpaceGroup,R-3m
3,Spirof-MOF,hasSpaceGroup,Cccm
4,NJU-Bai9,hasSpaceGroup,Fd-3c


In [4]:
df.shape

(1800, 3)

In [5]:
df.ObjectEntity.nunique()

166

## Probing BERT

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [7]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [8]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [9]:
def create_spaceGroup_prompt1(sub, mask):
    return f"{sub} is an MOF. {sub} has the space group symbol {mask}."
    
def create_spaceGroup_prompt2(sub, mask):
    return f"{sub} is an MOF. {sub} has an space group code {mask}."

def create_spaceGroup_prompt3(sub, mask):
    return f"{sub} is an MOF. {sub} has an space group {mask}."

def create_spaceGroup_prompt4(sub, mask):
    return f"{sub} is an MOF. The space group of {sub} is {mask}."
    
def create_spaceGroup_prompt5(sub, mask):
    return f"{sub} is an MOF. {sub} is in the space group {mask}."
    
def create_spaceGroup_prompt6(sub, mask):
    return f"{sub} is an metal organic framework. {sub}'s space group is {mask}." 
    
def create_spaceGroup_prompt7(sub, mask):
    return f"{sub} is an MOF. {sub} is an type of {mask} space group."
    
def create_spaceGroup_prompt8(sub, mask):
    return f"{sub} is an MOF. {sub} is in the class of {mask} space group."
    
def create_spaceGroup_prompt9(sub, mask):
    return f"{sub} has SBUs and organic linkers. The space group of {sub} is {mask}."
    
def create_spaceGroup_prompt10(sub, mask):
    return f"{sub} is an MOF. The {mask} describes {sub}'s space group."
    
def create_spaceGroup_prompt11(sub, mask):
    return f"{sub} is a metal-organic framework. The space group of {sub} is {mask}."
    
def create_spaceGroup_prompt12(sub, mask):
    return f"As a MOF, {sub} has the space group {mask}."

In [10]:
processors = [create_spaceGroup_prompt1, create_spaceGroup_prompt2,
              create_spaceGroup_prompt3, create_spaceGroup_prompt4,
              create_spaceGroup_prompt5, create_spaceGroup_prompt6,
              create_spaceGroup_prompt7, create_spaceGroup_prompt8,
              create_spaceGroup_prompt9, create_spaceGroup_prompt10,
              create_spaceGroup_prompt11, create_spaceGroup_prompt12
             ]

## Check Whether the Predicted Tokens Match

In [11]:
# Compute a column indicating whether the predictedToken in the ObjectEntity list
def hasPredicted(row):
    token = row['predictedToken']
    objectEntities = row['ObjectEntity']
    #print(objectEntities)
    if pd.notna(objectEntities):
        types = objectEntities.split(",") # get individual types
        for tp in types: # for each type
            type_words = tp.lower().split() # get the individual words in the type
            if token.lower() in type_words: # check if a type word was predicted
                return 1
       
    return 0

In [24]:
# test what the function does
def hasPredicted_demo(objectEntities, token):
    #print(objectEntities)
    if pd.notna(objectEntities):
        types = objectEntities.split(",") # get individual types
        for tp in types: # for each type
            type_words = tp.lower().split() # get the individual words in the type
            print(type_words)
            if token.lower() in type_words: # check if a type word was predicted
                return 1
       
    return 0

In [25]:
hasPredicted_demo('P6/mcc', 'Framework')

['p6/mcc']


0

## Test Small

In [11]:
processors = [create_spaceGroup_prompt1, create_spaceGroup_prompt2, create_spaceGroup_prompt3]

In [12]:
processors[0]('MOF-1', '[MASK]')

'MOF-1 is an MOF. MOF-1 has the space group symbol [MASK].'

In [13]:
mofs_sub =df[:10]

In [14]:
prompts = PromptSet([processors[0](row.SubjectEntity, mask_token) for _, row in mofs_sub.iterrows()])

In [15]:
prompts.__getitem__(9)

"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)diphthalato)-aqua-di-cadmium(ii) hemihydrate] is an MOF. catena-[(μ8-4,4'-(Hexafluoroisopropylidene)diphthalato)-aqua-di-cadmium(ii) hemihydrate] has the space group symbol [MASK]."

In [16]:
for i in prompts:
    print(i)

MROF-1 is an MOF. MROF-1 has the space group symbol [MASK].
MOF-CJ3 is an MOF. MOF-CJ3 has the space group symbol [MASK].
catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-triyltribenzoato)-dodeca-aqua-dodeca-copper dimethylsulfoxide solvate hydrate)|PCN-6|Teaching Subset: MOFs is an MOF. catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-triyltribenzoato)-dodeca-aqua-dodeca-copper dimethylsulfoxide solvate hydrate)|PCN-6|Teaching Subset: MOFs has the space group symbol [MASK].
Spirof-MOF is an MOF. Spirof-MOF has the space group symbol [MASK].
NJU-Bai9 is an MOF. NJU-Bai9 has the space group symbol [MASK].
MOF-1 is an MOF. MOF-1 has the space group symbol [MASK].
MOF-2 is an MOF. MOF-2 has the space group symbol [MASK].
TCM-9-Ho is an MOF. TCM-9-Ho has the space group symbol [MASK].
catena-[bis(μ4-(2RS,3RS)-2,3-Dihydroxybutanedioato-O,O',O'',O''',O'''',O''''')-(μ2-oxalato-O,O',O'',O''')-diaqua-di-lanthanum(iii) tetrahydrate] is an MOF. catena-[bis(μ4-(2RS,3RS)-2,3-Dihydroxybutanedioato-O,O',O'',

In [17]:
# Run the model
outputs = []
for out in tqdm(pipe(prompts, batch_size=8), total=len(prompts)):
    outputs.append(out)

  0%|          | 0/10 [00:00<?, ?it/s]

In [18]:
outputs

[[{'score': 0.2004951536655426,
   'token': 100,
   'token_str': '[UNK]',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the space group symbol.'},
  {'score': 0.0913800597190857,
   'token': 25827,
   'token_str': 'MR',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the space group symbol MR.'},
  {'score': 0.061055149883031845,
   'token': 429,
   'token_str': 'μ',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the space group symbol μ.'},
  {'score': 0.03965787589550018,
   'token': 396,
   'token_str': 'Γ',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the space group symbol Γ.'},
  {'score': 0.02583909034729004,
   'token': 131,
   'token_str': ':',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the space group symbol :.'},
  {'score': 0.021816439926624298,
   'token': 117,
   'token_str': ',',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the space group symbol,.'},
  {'score': 0.021558035165071487,
   'token': 150,
   'token_str': 'M',
   'sequence': 'MROF - 1 is a

In [26]:
results = []
for idx, row in mofs_sub.iterrows():
    for output in outputs[idx]:
        record = {}
        record['SubjectEntity'] = row['SubjectEntity']
        record['Relation'] = row['Relation']
        record['ObjectEntity'] = row['ObjectEntity']
        record['prompt']= prompts[idx]
        record['predictedScore'] = output['score']
        record['predictedToken'] = output['token_str']
        results.append(record)

In [27]:
predicted_df = pd.DataFrame(results)
predicted_df

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken
0,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.200495,[UNK]
1,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.091380,MR
2,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.061055,μ
3,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.039658,Γ
4,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.025839,:
...,...,...,...,...,...,...
995,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasSpaceGroup,R-3,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.001212,†
996,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasSpaceGroup,R-3,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.001209,Z
997,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasSpaceGroup,R-3,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.001205,delta
998,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasSpaceGroup,R-3,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.001200,HM


### Evaluate on Small

In [28]:
predicted_df.loc[10]['ObjectEntity'].split(",")

['P6/mcc']

In [31]:
predicted_df['predicted'] = predicted_df.apply(hasPredicted, axis=1)

In [32]:
predicted_df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken,predicted
0,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.200495,[UNK],0
1,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.09138,MR,0
2,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.061055,μ,0
3,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.039658,Γ,0
4,MROF-1,hasSpaceGroup,P6/mcc,MROF-1 is an MOF. MROF-1 has the space group s...,0.025839,:,0


In [33]:
predicted_df['predicted'].sum()

0

In [34]:
predicted_df[predicted_df.predicted == 1]

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken,predicted


In [35]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    return df.sort_values(by=col, ascending=False)[:n]

In [36]:
topk_df = predicted_df.groupby('SubjectEntity', as_index=False).apply(top, col='predictedScore', n=100)

In [37]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    total = df[col].sum()
    if total > 0:
        return 1
    else:
        return 0

In [38]:
topk_agg_df = topk_df.groupby('SubjectEntity', as_index=False).apply(predictedOne, col='predicted')

In [39]:
topk_agg_df.columns = ['SubjectEntity', 'predicted']
topk_agg_df

Unnamed: 0,SubjectEntity,predicted
0,MOF-1,0
1,MOF-2,0
2,MOF-CJ3,0
3,MROF-1,0
4,NJU-Bai9,0
5,Spirof-MOF,0
6,TCM-9-Ho,0
7,"catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-t...",0
8,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0
9,"catena-[bis(μ4-(2RS,3RS)-2,3-Dihydroxybutanedi...",0


In [40]:
topk_agg_df.predicted.mean()

0.0

## Test BERT

In [12]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token, \
                 subjectCol, relationCol, objectCol, top_k=100):

    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    prompts = PromptSet([prompt_processor(row.SubjectEntity, mask_token) for _, row in df_sub.iterrows()])

    outputs = []
    for out in tqdm(pipe(prompts, batch_size=8), total=len(prompts)):
        outputs.append(out)

    results = []
    for idx, row in df_sub.iterrows():
        for output in outputs[idx]:
            record = {}
            record['SubjectEntity'] = row[subjectCol]
            record['Relation'] = row[relationCol]
            record['ObjectEntity'] = row[objectCol]
            record['prompt']= prompts[idx]
            record['predictedScore'] = output['score']
            record['predictedToken'] = output['token_str']
            results.append(record)

    return pd.DataFrame(results)

In [13]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    return df.sort_values(by=col, ascending=False)[:n]

In [14]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    total = df[col].sum()
    if total > 0:
        return 1
    else:
        return 0

In [15]:
# Assume the predicted_df has a binary column 'predicted' indicating whether ObjectEntity has been predicted
# 1 for yes, 0 for no
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    topk_df = predicted_df.groupby('SubjectEntity', as_index=False).apply(top, col='predictedScore', n=at_k)
    topk_agg_df = topk_df.groupby('SubjectEntity', as_index=False).apply(predictedOne, col='predicted')
    topk_agg_df.columns = ['SubjectEntity', 'predicted']
    result = {}
    result['Relation'] = relation
    result['prompt_template'] = prompt_template
    result['acc'] = topk_agg_df.predicted.mean()
    result['at_k'] = at_k

    return result

In [16]:
pd.set_option('display.max_colwidth', None)

In [17]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 8h 4min 40s, sys: 13min 8s, total: 8h 17min 49s
Wall time: 37min 54s


In [18]:
hasType_df = pd.DataFrame(acc_list)

In [19]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
9,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.0,20
53,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,100
32,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,10
68,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.0,10
65,hasMetals,{SubjectEntity} is a metal-organic framework. The space group of {SubjectEntity} is [MASK].,0.0,100
37,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,5
46,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.0,50
67,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.0,5
60,hasMetals,{SubjectEntity} is a metal-organic framework. The space group of {SubjectEntity} is [MASK].,0.0,1
14,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.0,10


In [20]:
result_df.to_csv("../data/predicted_hasSpaceGroup_BERT.csv", index=None)

In [21]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_BERT_eval.csv", index=None)

In [22]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.0,1
18,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.0,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.0,1


In [23]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
5,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.005587,100
11,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.003990,100
4,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.003192,50
29,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.003192,100
10,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.002394,50
...,...,...,...,...
34,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.000000,50
35,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.000000,100
1,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.000000,5
37,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000000,5


## Probing MatBERT

In [24]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/matbert-base-cased")

In [25]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [26]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [27]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 2h 59min 56s, sys: 3min 12s, total: 3h 3min 8s
Wall time: 15min 30s


In [28]:
hasType_df = pd.DataFrame(acc_list)

In [29]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
19,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.019952,5
31,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.019154,5
38,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,10
10,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.035914,50
16,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.047087,50
69,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.039904,20
68,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.029529,10
65,hasMetals,{SubjectEntity} is a metal-organic framework. The space group of {SubjectEntity} is [MASK].,0.050279,100
50,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.045491,10
20,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.034318,10


In [30]:
result_df.to_csv("../data/predicted_hasSpaceGroup_MatBERT.csv", index=None)

In [31]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_MatBERT_eval.csv", index=None)

In [32]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.011971,1
24,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.007183,1
66,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.007183,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.006385,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.006385,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.005587,1
60,hasMetals,{SubjectEntity} is a metal-organic framework. The space group of {SubjectEntity} is [MASK].,0.005587,1
18,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.004789,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.002394,1
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,1


In [33]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
35,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.050279,100
52,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.050279,50
63,hasMetals,{SubjectEntity} is a metal-organic framework. The space group of {SubjectEntity} is [MASK].,0.050279,20
64,hasMetals,{SubjectEntity} is a metal-organic framework. The space group of {SubjectEntity} is [MASK].,0.050279,50
34,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.050279,50
...,...,...,...,...
55,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.000000,5
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.000000,1
38,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000000,10
37,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000000,5


## Probing MatSciBERT

In [34]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [35]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [36]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [37]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 3h 2min 27s, sys: 3min 7s, total: 3h 5min 34s
Wall time: 15min 57s


In [38]:
hasType_df = pd.DataFrame(acc_list)

In [39]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
24,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.000798,1
71,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.006385,100
41,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000798,100
43,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.000798,5
66,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.0,1
55,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.0,5
46,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.007183,50
32,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,10
2,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.000798,10
45,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.002394,20


In [40]:
result_df.to_csv("../data/predicted_hasSpaceGroup_MatSciBERT.csv", index=None)

In [41]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_MatSciBERT_eval.csv", index=None)

In [42]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
24,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.000798,1
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.0,1
18,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.0,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.0,1


In [43]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
53,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.010375,100
47,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.010375,100
23,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.010375,100
51,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.009577,20
52,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.009577,50
...,...,...,...,...
37,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000000,5
38,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000000,10
39,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000000,20
40,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.000000,50


## Probing SciBERT

In [44]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [45]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [46]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [47]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 3h 3min 23s, sys: 2min 26s, total: 3h 5min 49s
Wall time: 15min 54s


In [48]:
hasType_df = pd.DataFrame(acc_list)

In [49]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
51,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,20
11,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.0,100
19,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,5
3,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,20
16,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.0,50
9,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.0,20
5,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.005587,100
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,1
29,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.000798,100
49,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,5


In [50]:
result_df.to_csv("../data/predicted_hasSpaceGroup_SciBERT.csv", index=None)

In [51]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_SciBERT_eval.csv", index=None)

In [52]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.0,1
18,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.0,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.0,1


In [53]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
5,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.005587,100
71,hasMetals,"As a MOF, {SubjectEntity} has the space group [MASK].",0.000798,100
4,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.000798,50
29,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.000798,100
46,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.000000,50
...,...,...,...,...
32,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.000000,10
33,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.000000,20
34,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.000000,50
35,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.000000,100


## Probing RoBERTa

In [54]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [55]:
mask_token = tokenizer.mask_token
mask_token

'<mask>'

## Creat the Prompt Pipeline

In [56]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [57]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 8h 31min 8s, sys: 3min 54s, total: 8h 35min 3s
Wall time: 39min


In [58]:
hasType_df = pd.DataFrame(acc_list)

In [59]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
45,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.0,20
40,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,50
33,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,20
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.0,1
38,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,10
5,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,100
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.0,1
49,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,5


In [60]:
result_df.to_csv("../data/predicted_hasSpaceGroup_RoBERTa.csv", index=None)

In [61]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_RoBERTa_eval.csv", index=None)

In [62]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group code [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an space group [MASK].,0.0,1
18,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the space group [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s space group is [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] space group.,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] space group.,0.0,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s space group.,0.0,1


In [63]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,1
1,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the space group symbol [MASK].,0.0,5
52,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,50
51,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,20
50,hasMetals,{SubjectEntity} has SBUs and organic linkers. The space group of {SubjectEntity} is [MASK].,0.0,10
...,...,...,...,...
23,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,100
22,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,50
21,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,20
20,hasMetals,{SubjectEntity} is an MOF. The space group of {SubjectEntity} is [MASK].,0.0,10
