# MOFs LM Probing for Crystal System

In [1]:
from baseline import PromptSet

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [2]:
file = "../data/mof_crystalSystem.csv"

In [3]:
df = pd.read_csv(file)
df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity
0,MROF-1,hasCrystalSystem,hexagonal
1,MOF-CJ3,hasCrystalSystem,tetragonal
2,"catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-t...",hasCrystalSystem,rhombohedral
3,Spirof-MOF,hasCrystalSystem,orthorhombic
4,NJU-Bai9,hasCrystalSystem,cubic


In [4]:
df.shape

(1800, 3)

In [5]:
df.ObjectEntity.nunique()

8

## Probing BERT

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [7]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [8]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [9]:
def create_crystalSystem_prompt1(sub, mask):
    return f"{sub} is an MOF. {sub} has the crystal system {mask}."
    
def create_crystalSystem_prompt2(sub, mask):
    return f"{sub} is an MOF. {sub} has an crystal family {mask}."

def create_crystalSystem_prompt3(sub, mask):
    return f"{sub} is an MOF. {sub} has an crystal system code {mask}."

def create_crystalSystem_prompt4(sub, mask):
    return f"{sub} is an MOF. The crystal system of {sub} is {mask}."
    
def create_crystalSystem_prompt5(sub, mask):
    return f"{sub} is an crystal. {sub} is in the family {mask}."
    
def create_crystalSystem_prompt6(sub, mask):
    return f"{sub} is an metal organic framework. {sub}'s crystal system is {mask}." 
    
def create_crystalSystem_prompt7(sub, mask):
    return f"{sub} is an MOF. {sub} is an type of {mask} crystal system."
    
def create_crystalSystem_prompt8(sub, mask):
    return f"{sub} is an MOF. {sub} is in the class of {mask} crystal family."
    
def create_crystalSystem_prompt9(sub, mask):
    return f"{sub} has SBUs and organic linkers. The crystal system of {sub} is {mask}."
    
def create_crystalSystem_prompt10(sub, mask):
    return f"{sub} is an MOF. The {mask} describes {sub}'s crystal structure."
    
def create_crystalSystem_prompt11(sub, mask):
    return f"{sub} is a MOF. The crystal family  of {sub} is {mask}."
    
def create_crystalSystem_prompt12(sub, mask):
    return f"As a MOF, {sub} has a crystal system {mask}."

In [10]:
processors = [create_crystalSystem_prompt1, create_crystalSystem_prompt2,
              create_crystalSystem_prompt3, create_crystalSystem_prompt4,
              create_crystalSystem_prompt5, create_crystalSystem_prompt6,
              create_crystalSystem_prompt7, create_crystalSystem_prompt8,
              create_crystalSystem_prompt9, create_crystalSystem_prompt10,
              create_crystalSystem_prompt11, create_crystalSystem_prompt12
             ]

## Check Whether the Predicted Tokens Match

In [11]:
# Compute a column indicating whether the predictedToken in the ObjectEntity list
def hasPredicted(row):
    token = row['predictedToken']
    objectEntities = row['ObjectEntity']
    #print(objectEntities)
    if pd.notna(objectEntities):
        types = objectEntities.split(",") # get individual types
        for tp in types: # for each type
            type_words = tp.lower().split() # get the individual words in the type
            if token.lower() in type_words: # check if a type word was predicted
                return 1
       
    return 0

In [12]:
# test what the function does
def hasPredicted_demo(objectEntities, token):
    #print(objectEntities)
    if pd.notna(objectEntities):
        types = objectEntities.split(",") # get individual types
        for tp in types: # for each type
            type_words = tp.lower().split() # get the individual words in the type
            print(type_words)
            if token.lower() in type_words: # check if a type word was predicted
                return 1
       
    return 0

In [13]:
hasPredicted_demo('rhombohedral', 'Framework')

['rhombohedral']


0

## Test Small

In [14]:
processors = [create_crystalSystem_prompt1, create_crystalSystem_prompt2, create_crystalSystem_prompt3]

In [15]:
processors[0]('MOF-1', '[MASK]')

'MOF-1 is an MOF. MOF-1 has the crystal system [MASK].'

In [16]:
mofs_sub =df[:10]

In [17]:
prompts = PromptSet([processors[0](row.SubjectEntity, mask_token) for _, row in mofs_sub.iterrows()])

In [18]:
prompts.__getitem__(9)

"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)diphthalato)-aqua-di-cadmium(ii) hemihydrate] is an MOF. catena-[(μ8-4,4'-(Hexafluoroisopropylidene)diphthalato)-aqua-di-cadmium(ii) hemihydrate] has the crystal system [MASK]."

In [19]:
for i in prompts:
    print(i)

MROF-1 is an MOF. MROF-1 has the crystal system [MASK].
MOF-CJ3 is an MOF. MOF-CJ3 has the crystal system [MASK].
catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-triyltribenzoato)-dodeca-aqua-dodeca-copper dimethylsulfoxide solvate hydrate)|PCN-6|Teaching Subset: MOFs is an MOF. catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-triyltribenzoato)-dodeca-aqua-dodeca-copper dimethylsulfoxide solvate hydrate)|PCN-6|Teaching Subset: MOFs has the crystal system [MASK].
Spirof-MOF is an MOF. Spirof-MOF has the crystal system [MASK].
NJU-Bai9 is an MOF. NJU-Bai9 has the crystal system [MASK].
MOF-1 is an MOF. MOF-1 has the crystal system [MASK].
MOF-2 is an MOF. MOF-2 has the crystal system [MASK].
TCM-9-Ho is an MOF. TCM-9-Ho has the crystal system [MASK].
catena-[bis(μ4-(2RS,3RS)-2,3-Dihydroxybutanedioato-O,O',O'',O''',O'''',O''''')-(μ2-oxalato-O,O',O'',O''')-diaqua-di-lanthanum(iii) tetrahydrate] is an MOF. catena-[bis(μ4-(2RS,3RS)-2,3-Dihydroxybutanedioato-O,O',O'',O''',O'''',O''''')-(μ2-oxalato-O

In [20]:
# Run the model
outputs = []
for out in tqdm(pipe(prompts, batch_size=8), total=len(prompts)):
    outputs.append(out)

  0%|          | 0/10 [00:00<?, ?it/s]

In [21]:
outputs

[[{'score': 0.21015463769435883,
   'token': 2401,
   'token_str': 'structure',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the crystal system structure.'},
  {'score': 0.14482861757278442,
   'token': 16558,
   'token_str': 'symmetry',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the crystal system symmetry.'},
  {'score': 0.07524316012859344,
   'token': 1532,
   'token_str': 'form',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the crystal system form.'},
  {'score': 0.06823645532131195,
   'token': 22411,
   'token_str': 'topology',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the crystal system topology.'},
  {'score': 0.03589552640914917,
   'token': 12053,
   'token_str': 'geometry',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the crystal system geometry.'},
  {'score': 0.020992277190089226,
   'token': 131,
   'token_str': ':',
   'sequence': 'MROF - 1 is an MOF. MROF - 1 has the crystal system :.'},
  {'score': 0.020732421427965164,
   'token': 2235,
   't

In [22]:
results = []
for idx, row in mofs_sub.iterrows():
    for output in outputs[idx]:
        record = {}
        record['SubjectEntity'] = row['SubjectEntity']
        record['Relation'] = row['Relation']
        record['ObjectEntity'] = row['ObjectEntity']
        record['prompt']= prompts[idx]
        record['predictedScore'] = output['score']
        record['predictedToken'] = output['token_str']
        results.append(record)

In [23]:
predicted_df = pd.DataFrame(results)
predicted_df

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken
0,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.210155,structure
1,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.144829,symmetry
2,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.075243,form
3,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.068236,topology
4,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.035896,geometry
...,...,...,...,...,...,...
995,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasCrystalSystem,rhombohedral,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.000849,signature
996,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasCrystalSystem,rhombohedral,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.000833,energy
997,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasCrystalSystem,rhombohedral,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.000799,no
998,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",hasCrystalSystem,rhombohedral,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0.000787,lattice


### Evaluate on Small

In [24]:
predicted_df.loc[10]['ObjectEntity'].split(",")

['hexagonal']

In [25]:
predicted_df['predicted'] = predicted_df.apply(hasPredicted, axis=1)

In [26]:
predicted_df.head()

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken,predicted
0,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.210155,structure,0
1,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.144829,symmetry,0
2,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.075243,form,0
3,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.068236,topology,0
4,MROF-1,hasCrystalSystem,hexagonal,MROF-1 is an MOF. MROF-1 has the crystal syste...,0.035896,geometry,0


In [27]:
predicted_df['predicted'].sum()

0

In [28]:
predicted_df[predicted_df.predicted == 1]

Unnamed: 0,SubjectEntity,Relation,ObjectEntity,prompt,predictedScore,predictedToken,predicted


In [29]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    return df.sort_values(by=col, ascending=False)[:n]

In [30]:
topk_df = predicted_df.groupby('SubjectEntity', as_index=False).apply(top, col='predictedScore', n=100)

In [31]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    total = df[col].sum()
    if total > 0:
        return 1
    else:
        return 0

In [32]:
topk_agg_df = topk_df.groupby('SubjectEntity', as_index=False).apply(predictedOne, col='predicted')

In [33]:
topk_agg_df.columns = ['SubjectEntity', 'predicted']
topk_agg_df

Unnamed: 0,SubjectEntity,predicted
0,MOF-1,0
1,MOF-2,0
2,MOF-CJ3,0
3,MROF-1,0
4,NJU-Bai9,0
5,Spirof-MOF,0
6,TCM-9-Ho,0
7,"catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6-t...",0
8,"catena-[(μ8-4,4'-(Hexafluoroisopropylidene)dip...",0
9,"catena-[bis(μ4-(2RS,3RS)-2,3-Dihydroxybutanedi...",0


In [34]:
topk_agg_df.predicted.mean()

0.0

## Test BERT

In [12]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token, \
                 subjectCol, relationCol, objectCol, top_k=100):

    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    prompts = PromptSet([prompt_processor(row.SubjectEntity, mask_token) for _, row in df_sub.iterrows()])

    outputs = []
    for out in tqdm(pipe(prompts, batch_size=8), total=len(prompts)):
        outputs.append(out)

    results = []
    for idx, row in df_sub.iterrows():
        for output in outputs[idx]:
            record = {}
            record['SubjectEntity'] = row[subjectCol]
            record['Relation'] = row[relationCol]
            record['ObjectEntity'] = row[objectCol]
            record['prompt']= prompts[idx]
            record['predictedScore'] = output['score']
            record['predictedToken'] = output['token_str']
            results.append(record)

    return pd.DataFrame(results)

In [13]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    return df.sort_values(by=col, ascending=False)[:n]

In [14]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    total = df[col].sum()
    if total > 0:
        return 1
    else:
        return 0

In [15]:
# Assume the predicted_df has a binary column 'predicted' indicating whether ObjectEntity has been predicted
# 1 for yes, 0 for no
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    topk_df = predicted_df.groupby('SubjectEntity', as_index=False).apply(top, col='predictedScore', n=at_k)
    topk_agg_df = topk_df.groupby('SubjectEntity', as_index=False).apply(predictedOne, col='predicted')
    topk_agg_df.columns = ['SubjectEntity', 'predicted']
    result = {}
    result['Relation'] = relation
    result['prompt_template'] = prompt_template
    result['acc'] = topk_agg_df.predicted.mean()
    result['at_k'] = at_k

    return result

In [16]:
pd.set_option('display.max_colwidth', None)

In [17]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 8h 50min 13s, sys: 14min 48s, total: 9h 5min 1s
Wall time: 41min 24s


In [18]:
hasType_df = pd.DataFrame(acc_list)

In [19]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
16,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,50
9,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,20
14,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,10
59,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,100
1,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,5
33,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.142059,20
11,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,100
66,hasMetals,"As a MOF, {SubjectEntity} has a crystal system [MASK].",0.0,1
24,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,1
27,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,20


In [20]:
result_df.to_csv("../data/predicted_hasCrystalSystem_BERT.csv", index=None)

In [21]:
hasType_df.to_csv("../data/predicted_hasCrystalSystem_BERT_eval.csv", index=None)

In [22]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
18,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.074222,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.067039,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.022346,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.014366,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.011971,1
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,1


In [23]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
47,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.146049,100
46,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.146049,50
41,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.146049,100
40,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.146049,50
52,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.145251,50
...,...,...,...,...
24,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,1
25,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,5
26,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,10
27,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,20


## Probing MatBERT

In [24]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/matbert-base-cased")

In [25]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [26]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [27]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 2h 51min 16s, sys: 3min 18s, total: 2h 54min 35s
Wall time: 14min 55s


In [28]:
hasType_df = pd.DataFrame(acc_list)

In [29]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
2,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.013567,10
18,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.093376,1
61,hasMetals,{SubjectEntity} is a MOF. The crystal family of {SubjectEntity} is [MASK].,0.533919,5
19,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.525938,5
56,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,10
55,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,5
52,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.968077,50
32,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.899441,10
65,hasMetals,{SubjectEntity} is a MOF. The crystal family of {SubjectEntity} is [MASK].,0.991221,100
22,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.972865,50


In [30]:
result_df.to_csv("../data/predicted_hasCrystalSystem_MatBERT.csv", index=None)

In [31]:
hasType_df.to_csv("../data/predicted_hasCrystalSystem_MatBERT_eval.csv", index=None)

In [32]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.223464,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.2083,1
18,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.093376,1
60,hasMetals,{SubjectEntity} is a MOF. The crystal family of {SubjectEntity} is [MASK].,0.089385,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.023144,1
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.0,1


In [33]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
35,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.996010,100
53,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.993615,100
65,hasMetals,{SubjectEntity} is a MOF. The crystal family of {SubjectEntity} is [MASK].,0.991221,100
23,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.991221,100
34,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.981644,50
...,...,...,...,...
25,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,5
26,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,10
27,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,20
28,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,50


## Probing MatSciBERT

In [34]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [35]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [36]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [37]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 3h 1min 48s, sys: 2min 44s, total: 3h 4min 32s
Wall time: 15min 31s


In [38]:
hasType_df = pd.DataFrame(acc_list)

In [39]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
39,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.137271,20
38,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.065443,10
41,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.210694,100
9,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,20
57,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,20
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.0,1
11,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,100
21,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.05826,20
18,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.001596,1
23,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.145251,100


In [40]:
result_df.to_csv("../data/predicted_hasCrystalSystem_MatSciBERT.csv", index=None)

In [41]:
hasType_df.to_csv("../data/predicted_hasCrystalSystem_MatSciBERT_eval.csv", index=None)

In [42]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
18,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.001596,1
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.0,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,1


In [43]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
41,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.210694,100
47,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.199521,100
40,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.187550,50
23,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.145251,100
46,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.143655,50
...,...,...,...,...
27,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,20
28,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.000000,50
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.000000,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.000000,1


## Probing SciBERT

In [44]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [45]:
mask_token = tokenizer.mask_token
mask_token

'[MASK]'

## Creat the Prompt Pipeline

In [46]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [47]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 3h 9min 15s, sys: 2min 9s, total: 3h 11min 25s
Wall time: 16min 8s


In [48]:
hasType_df = pd.DataFrame(acc_list)

In [49]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
71,hasMetals,"As a MOF, {SubjectEntity} has a crystal system [MASK].",0.052674,100
32,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.024741,10
10,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,50
23,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.205108,100
28,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,50
46,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.097366,50
45,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.017558,20
49,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.03352,5
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.0,1
8,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,10


In [50]:
result_df.to_csv("../data/predicted_hasCrystalSystem_SciBERT.csv", index=None)

In [51]:
hasType_df.to_csv("../data/predicted_hasCrystalSystem_SciBERT_eval.csv", index=None)

In [52]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.007183,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.002394,1
18,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.001596,1
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,1


In [53]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
41,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.212291,100
23,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.205108,100
40,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.198723,50
22,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.194733,50
53,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.186752,100
...,...,...,...,...
13,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.000000,5
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.000000,1
11,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.000000,100
10,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.000000,50


## Probing RoBERTa

In [54]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [55]:
mask_token = tokenizer.mask_token
mask_token

'<mask>'

## Creat the Prompt Pipeline

In [56]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [57]:
%%time
acc_list = []
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

CPU times: user 7h 49min 51s, sys: 3min 6s, total: 7h 52min 57s
Wall time: 35min 43s


In [58]:
hasType_df = pd.DataFrame(acc_list)

In [59]:
hasType_df.sample(10)

Unnamed: 0,Relation,prompt_template,acc,at_k
29,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,100
28,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,50
26,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,10
34,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.0,50
15,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,20
40,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.0,50
46,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.0,50
13,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,5
3,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,20
2,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,10


In [60]:
result_df.to_csv("../data/predicted_hasCrystalSystem_RoBERTa.csv", index=None)

In [61]:
hasType_df.to_csv("../data/predicted_hasCrystalSystem_RoBERTa_eval.csv", index=None)

In [62]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,1
6,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal family [MASK].,0.0,1
12,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has an crystal system code [MASK].,0.0,1
18,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.0,1
24,hasMetals,{SubjectEntity} is an crystal. {SubjectEntity} is in the family [MASK].,0.0,1
30,hasMetals,{SubjectEntity} is an metal organic framework. {SubjectEntity}'s crystal system is [MASK].,0.0,1
36,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is an type of [MASK] crystal system.,0.0,1
42,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} is in the class of [MASK] crystal family.,0.0,1
48,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.0,1
54,hasMetals,{SubjectEntity} is an MOF. The [MASK] describes {SubjectEntity}'s crystal structure.,0.0,1


In [63]:
hasType_df.sort_values('acc', ascending=False)

Unnamed: 0,Relation,prompt_template,acc,at_k
0,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,1
1,hasMetals,{SubjectEntity} is an MOF. {SubjectEntity} has the crystal system [MASK].,0.0,5
52,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.0,50
51,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.0,20
50,hasMetals,{SubjectEntity} has SBUs and organic linkers. The crystal system of {SubjectEntity} is [MASK].,0.0,10
...,...,...,...,...
23,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.0,100
22,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.0,50
21,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.0,20
20,hasMetals,{SubjectEntity} is an MOF. The crystal system of {SubjectEntity} is [MASK].,0.0,10
