# MOFs LM Probing for hasMetals

In [None]:
from baseline import PromptSet, logger as baseline_logger
from evaluate import evaluate_per_sr_pair, combine_scores_per_relation

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [None]:
file = "../data/mof_metals.csv"

In [None]:
df = pd.read_csv(file)
df.head()

In [None]:
df.shape

## Probing BERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [None]:
# Prompt templates for the 'hasMetals' relation. Each builder returns the
# filled-in prompt, or None when asked for any other relation.
# NOTE(review): the "an metal" wording is kept verbatim because the prompt
# text is what the language models are probed with.
def create_metal_prompt1(sub, rel, mask):
    """Return hasMetals prompt variant 1, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} contains {mask} which is an metal."


def create_metal_prompt2(sub, rel, mask):
    """Return hasMetals prompt variant 2, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} contains an metal which is {mask}."


def create_metal_prompt3(sub, rel, mask):
    """Return hasMetals prompt variant 3, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} is a metal organic framework. {sub} contains {mask} which is an metal."


def create_metal_prompt4(sub, rel, mask):
    """Return hasMetals prompt variant 4, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"As a metal organic framework, {sub} contains an metal which is {mask}."


def create_metal_prompt5(sub, rel, mask):
    """Return hasMetals prompt variant 5, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} is a MOF. {sub} has SBU metal {mask}."


def create_metal_prompt6(sub, rel, mask):
    """Return hasMetals prompt variant 6, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} is MOF. {sub} contains {mask}."


def create_metal_prompt7(sub, rel, mask):
    """Return hasMetals prompt variant 7, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} contains {mask}."


def create_metal_prompt8(sub, rel, mask):
    """Return hasMetals prompt variant 8, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} is a metal organic framework. {sub} contains {mask}."


def create_metal_prompt9(sub, rel, mask):
    """Return hasMetals prompt variant 9, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"The SBU of {sub} is {mask}."


def create_metal_prompt10(sub, rel, mask):
    """Return hasMetals prompt variant 10, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} is a metal organic framework structure composed of metal cluster {mask}."


def create_metal_prompt11(sub, rel, mask):
    """Return hasMetals prompt variant 11, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{sub} is a metal organic framework. {sub} has SBU metal {mask}."


def create_metal_prompt12(sub, rel, mask):
    """Return hasMetals prompt variant 12, or None for any other relation."""
    if rel != 'hasMetals':
        return None
    return f"{mask} was used as metal center in the synthesized {sub} material."

In [None]:
processors = [create_metal_prompt1, create_metal_prompt2,
              create_metal_prompt3, create_metal_prompt4,
              create_metal_prompt5, create_metal_prompt6,
              create_metal_prompt7, create_metal_prompt8,
              create_metal_prompt9, create_metal_prompt10,
              create_metal_prompt11, create_metal_prompt12
             ]

In [None]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token, \
                 subjectCol, relationCol, objectCol, top_k=100):
    """Probe a masked LM with one prompt per row of *df_sub*.

    Parameters:
        model, tokenizer: passed to the transformers "fill-mask" pipeline.
        prompt_processor: callable ``(subject, relation, mask_token) -> prompt``.
        df_sub: DataFrame holding the triples to probe.
        mask_token: mask string inserted into every prompt.
        subjectCol, relationCol, objectCol: names of the columns of *df_sub*
            holding the subject, relation and gold object(s).
        top_k: number of predictions requested per prompt.

    Returns:
        A DataFrame with one row per (input row, predicted token) and columns
        SubjectEntity, Relation, ObjectEntity, prompt, predictedScore,
        predictedToken.
    """
    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    # Read subject/relation through the caller-supplied column names instead
    # of hard-coded attribute names.
    prompts = PromptSet([
        prompt_processor(row[subjectCol], row[relationCol], mask_token)
        for _, row in df_sub.iterrows()
    ])

    outputs = list(tqdm(pipe(prompts, batch_size=8), total=len(prompts)))

    # Pair rows with outputs by position: the DataFrame index labels are not
    # guaranteed to be 0..n-1 (e.g. after filtering or sampling).
    results = []
    for pos, (_, row) in enumerate(df_sub.iterrows()):
        prompt = prompts[pos]
        for output in outputs[pos]:
            results.append({
                'SubjectEntity': row[subjectCol],
                'Relation': row[relationCol],
                'ObjectEntity': row[objectCol],
                'prompt': prompt,
                'predictedScore': output['score'],
                'predictedToken': output['token_str'],
            })

    # Explicit columns keep the schema stable even when there are no results.
    return pd.DataFrame(results, columns=[
        'SubjectEntity', 'Relation', 'ObjectEntity',
        'prompt', 'predictedScore', 'predictedToken',
    ])

In [None]:
def hasPredicted(row):
    """Return 1 if the predicted token is one of the row's gold objects, else 0.

    ``row['ObjectEntity']`` is a comma-separated list of gold objects (or
    missing); ``row['predictedToken']`` is the token proposed by the model.
    The comparison is case-sensitive. Surrounding whitespace is ignored on
    both sides, so lists written as "Cu, Zn" and tokens decoded with a
    leading space (e.g. " Zn") still match.
    """
    gold = row['ObjectEntity']
    if pd.isna(gold):
        return 0

    token = str(row['predictedToken']).strip()
    if not token:
        return 0

    # Drop empty entries so that a stray comma can never match anything.
    candidates = {part.strip() for part in str(gold).split(",") if part.strip()}
    return 1 if token in candidates else 0

In [None]:
def top(df, col, n=10):
    """Return the first *n* rows of *df* after sorting by *col*, largest first."""
    ranked = df.sort_values(by=col, ascending=False)
    return ranked.iloc[:n]

In [None]:
def predictedOne(df, col):
    """Return 1 when column *col* of *df* sums to a positive value, else 0."""
    return 1 if df[col].sum() > 0 else 0

In [None]:
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    """Compute accuracy@k over the subjects in *predicted_df*.

    *predicted_df* must have the columns 'SubjectEntity', 'predictedScore'
    and a binary 'predicted' column (1 if the predicted token matched a gold
    object, 0 otherwise). A subject counts as correct when at least one of
    its *at_k* highest-scoring predictions has ``predicted == 1``.

    Returns:
        dict with keys 'Relation', 'prompt_template', 'acc' (fraction of
        subjects that are correct) and 'at_k'.
    """
    # Sort once, then take the first at_k rows of every subject. This avoids
    # groupby(...).apply on the grouping column, which newer pandas deprecates.
    ranked = predicted_df.sort_values(by='predictedScore', ascending=False, kind='stable')
    topk_df = ranked.groupby('SubjectEntity', sort=False).head(at_k)

    # One boolean per subject: did any of its top-k predictions hit?
    hit_by_subject = topk_df.groupby('SubjectEntity')['predicted'].sum() > 0

    return {
        'Relation': relation,
        'prompt_template': prompt_template,
        'acc': hit_by_subject.astype(int).mean(),
        'at_k': at_k,
    }

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntityName'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
result_df.head()

In [None]:
result_df.shape

In [None]:
result_df.to_csv("../data/predicted_hasMetals_BERT.csv", index=None)

In [None]:
hasMetals_df = pd.DataFrame(acc_list)

In [None]:
hasMetals_df.sample(10)

In [None]:
hasMetals_df.to_csv("../data/predicted_hasMetals_BERT_eval.csv", index=None)

In [None]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasMetals_df.sort_values('acc', ascending=False)

## Probing MatBERT

In [None]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/model/matbert-base-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntityName'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasMetals_df = pd.DataFrame(acc_list)

In [None]:
hasMetals_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasMetals_MatBERT.csv", index=None)

In [None]:
hasMetals_df.to_csv("../data/predicted_hasMetals_MatBERT_eval.csv", index=None)

In [None]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasMetals_df.sort_values('acc', ascending=False)

## Probing MatSciBERT

In [None]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntityName'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasMetals_df = pd.DataFrame(acc_list)

In [None]:
hasMetals_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasMetals_MatSciBERT.csv", index=None)

In [None]:
hasMetals_df.to_csv("../data/predicted_hasMetals_MatSciBERT_eval.csv", index=None)

In [None]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasMetals_df.sort_values('acc', ascending=False)

## Probing SciBERT

In [None]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntityName'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasMetals_df = pd.DataFrame(acc_list)

In [None]:
hasMetals_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasMetals_SciBERT.csv", index=None)

In [None]:
hasMetals_df.to_csv("../data/predicted_hasMetals_SciBERT_eval.csv", index=None)

In [None]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasMetals_df.sort_values('acc', ascending=False)

## Probing RoBERTa

In [None]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntityName'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntityName', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', 'hasMetals', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasMetals', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasMetals_df = pd.DataFrame(acc_list)

In [None]:
hasMetals_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasMetals_RoBERTa.csv", index=None)

In [None]:
hasMetals_df.to_csv("../data/predicted_hasMetals_RoBERTa_eval.csv", index=None)

In [None]:
hasMetals_df[hasMetals_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasMetals_df.sort_values('acc', ascending=False)

# MOFs LM Probing for isMOF

In [None]:
from baseline import PromptSet

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [None]:
file = "../data/mof_synonyms.csv"

In [None]:
df = pd.read_csv(file)
df.head()

In [None]:
df.shape

## Probing BERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [None]:
# Prompt templates for probing the material type of a subject. Each builder
# fills the subject and the mask token into a fixed sentence.
def create_type_prompt1(sub, mask):
    """Return type prompt variant 1 for subject *sub* and mask token *mask*."""
    return "{sub} is an {mask} material.".format(sub=sub, mask=mask)


def create_type_prompt2(sub, mask):
    """Return type prompt variant 2 for subject *sub* and mask token *mask*."""
    return "{sub} is an {mask} structure.".format(sub=sub, mask=mask)


def create_type_prompt3(sub, mask):
    """Return type prompt variant 3 for subject *sub* and mask token *mask*."""
    return "{sub} is an {mask}.".format(sub=sub, mask=mask)


def create_type_prompt4(sub, mask):
    """Return type prompt variant 4 for subject *sub* and mask token *mask*."""
    return "{sub} is an {mask} made of metal center and organic linkers.".format(sub=sub, mask=mask)


def create_type_prompt5(sub, mask):
    """Return type prompt variant 5 for subject *sub* and mask token *mask*."""
    return "{sub} is an {mask} material containing metal center and organic linkers.".format(sub=sub, mask=mask)


def create_type_prompt6(sub, mask):
    """Return type prompt variant 6 for subject *sub* and mask token *mask*."""
    return "{sub} is an {mask} structure with metal center and organic linkers.".format(sub=sub, mask=mask)


def create_type_prompt7(sub, mask):
    """Return type prompt variant 7 for subject *sub* and mask token *mask*."""
    return "{sub} is an type of {mask} material.".format(sub=sub, mask=mask)


def create_type_prompt8(sub, mask):
    """Return type prompt variant 8 for subject *sub* and mask token *mask*."""
    return "{sub} is in the class of {mask} material.".format(sub=sub, mask=mask)


def create_type_prompt9(sub, mask):
    """Return type prompt variant 9 for subject *sub* and mask token *mask*."""
    return "{sub} has SBUs and organic linkers. The type of {sub} is {mask}.".format(sub=sub, mask=mask)


def create_type_prompt10(sub, mask):
    """Return type prompt variant 10 for subject *sub* and mask token *mask*."""
    return "{sub} is a type of polymer with metal clusters and organic linkers. {sub} is an {mask} framework.".format(sub=sub, mask=mask)


def create_type_prompt11(sub, mask):
    """Return type prompt variant 11 for subject *sub* and mask token *mask*."""
    return "{sub} is an type of {mask}.".format(sub=sub, mask=mask)


def create_type_prompt12(sub, mask):
    """Return type prompt variant 12 for subject *sub* and mask token *mask*."""
    return "The porous structure of {sub} makes it an {mask}.".format(sub=sub, mask=mask)

In [None]:
processors = [create_type_prompt1, create_type_prompt2,
              create_type_prompt3, create_type_prompt4,
              create_type_prompt5, create_type_prompt6,
              create_type_prompt7, create_type_prompt8,
              create_type_prompt9, create_type_prompt10,
              create_type_prompt11, create_type_prompt12
             ]

## Test BERT

In [None]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token, \
                 subjectCol, relationCol, objectCol, top_k=100):
    """Probe a masked LM with one prompt per row of *df_sub*.

    Parameters:
        model, tokenizer: passed to the transformers "fill-mask" pipeline.
        prompt_processor: callable ``(subject, mask_token) -> prompt``.
        df_sub: DataFrame holding the triples to probe.
        mask_token: mask string inserted into every prompt.
        subjectCol, relationCol, objectCol: names of the columns of *df_sub*
            holding the subject, relation and gold object(s).
        top_k: number of predictions requested per prompt.

    Returns:
        A DataFrame with one row per (input row, predicted token) and columns
        SubjectEntity, Relation, ObjectEntity, prompt, predictedScore,
        predictedToken.
    """
    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    # Read the subject through the caller-supplied column name instead of a
    # hard-coded attribute name.
    prompts = PromptSet([
        prompt_processor(row[subjectCol], mask_token)
        for _, row in df_sub.iterrows()
    ])

    outputs = list(tqdm(pipe(prompts, batch_size=8), total=len(prompts)))

    # Pair rows with outputs by position: the DataFrame index labels are not
    # guaranteed to be 0..n-1 (e.g. after filtering or sampling).
    results = []
    for pos, (_, row) in enumerate(df_sub.iterrows()):
        prompt = prompts[pos]
        for output in outputs[pos]:
            results.append({
                'SubjectEntity': row[subjectCol],
                'Relation': row[relationCol],
                'ObjectEntity': row[objectCol],
                'prompt': prompt,
                'predictedScore': output['score'],
                'predictedToken': output['token_str'],
            })

    # Explicit columns keep the schema stable even when there are no results.
    return pd.DataFrame(results, columns=[
        'SubjectEntity', 'Relation', 'ObjectEntity',
        'prompt', 'predictedScore', 'predictedToken',
    ])

In [None]:
def hasPredicted(row):
    """Return 1 if the predicted token is a word of any gold type, else 0.

    ``row['ObjectEntity']`` is a comma-separated list of type names (or
    missing). Each type name is split into whitespace-separated words and the
    predicted token is compared case-insensitively against those words.
    Surrounding whitespace on the token is ignored, so tokens decoded with a
    leading space (e.g. " organic") still match.
    """
    gold = row['ObjectEntity']
    if pd.isna(gold):
        return 0

    token = str(row['predictedToken']).strip().lower()
    if not token:
        return 0

    for type_name in str(gold).split(","):
        # A hit on any single word of the type counts as a correct prediction.
        if token in type_name.lower().split():
            return 1
    return 0

In [None]:
def top(df, col, n=10):
    """Sort *df* by *col* in descending order and return its first *n* rows."""
    by_score_desc = df.sort_values(by=col, ascending=False)
    return by_score_desc.iloc[:n]

In [None]:
def predictedOne(df, col):
    """Return 1 if the values in column *col* of *df* add up to more than zero, else 0."""
    hits = df[col].sum()
    return 1 if hits > 0 else 0

In [None]:
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    """Compute accuracy@k over the subjects in *predicted_df*.

    *predicted_df* must have the columns 'SubjectEntity', 'predictedScore'
    and a binary 'predicted' column (1 if the predicted token matched a gold
    object, 0 otherwise). A subject counts as correct when at least one of
    its *at_k* highest-scoring predictions has ``predicted == 1``.

    Returns:
        dict with keys 'Relation', 'prompt_template', 'acc' (fraction of
        subjects that are correct) and 'at_k'.
    """
    # Sort once, then take the first at_k rows of every subject. This avoids
    # groupby(...).apply on the grouping column, which newer pandas deprecates.
    ranked = predicted_df.sort_values(by='predictedScore', ascending=False, kind='stable')
    topk_df = ranked.groupby('SubjectEntity', sort=False).head(at_k)

    # One boolean per subject: did any of its top-k predictions hit?
    hit_by_subject = topk_df.groupby('SubjectEntity')['predicted'].sum() > 0

    return {
        'Relation': relation,
        'prompt_template': prompt_template,
        'acc': hit_by_subject.astype(int).mean(),
        'at_k': at_k,
    }

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'isTypeMOF', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasType_BERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasType_BERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing MatBERT

In [None]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/model/matbert-base-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'isTypeMOF', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasType_MatBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasType_MatBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing MatSciBERT

In [None]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'isTypeMOF', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasType_MatSciBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasType_MatSciBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing SciBERT

In [None]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'isTypeMOF', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasType_SciBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasType_SciBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing RoBERTa

In [None]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'isTypeMOF', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasType_RoBERTa.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasType_RoBERTa_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

# MOFs LM Probing for Linker

In [None]:
from baseline import PromptSet

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [None]:
file = "../data/mof_linkers.csv"

In [None]:
df = pd.read_csv(file)
df.head()

In [None]:
df.shape

## Probing BERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [None]:
# Prompt templates for probing the organic linker of a subject. Each builder
# fills the subject and the mask token into a fixed sentence.
# NOTE(review): "connectd" in variant 4 is kept verbatim because the prompt
# text is what the language models are probed with.
def create_linker_prompt1(sub, mask):
    """Return linker prompt variant 1 for subject *sub* and mask token *mask*."""
    template = "{sub} contains a {mask} which is organic linker."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt2(sub, mask):
    """Return linker prompt variant 2 for subject *sub* and mask token *mask*."""
    template = "{sub} contains an organic linker which is {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt3(sub, mask):
    """Return linker prompt variant 3 for subject *sub* and mask token *mask*."""
    template = "The metal clusters in {sub} are connected by {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt4(sub, mask):
    """Return linker prompt variant 4 for subject *sub* and mask token *mask*."""
    template = "{sub} is an MOF made of metal centers connectd by organic linker {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt5(sub, mask):
    """Return linker prompt variant 5 for subject *sub* and mask token *mask*."""
    template = "{sub} is an MOF material containing metal centers and organic linkers {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt6(sub, mask):
    """Return linker prompt variant 6 for subject *sub* and mask token *mask*."""
    template = "{sub} is an MOF structure with metal centers and organic linkers {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt7(sub, mask):
    """Return linker prompt variant 7 for subject *sub* and mask token *mask*."""
    template = "The organic linker in MOF {sub} is {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt8(sub, mask):
    """Return linker prompt variant 8 for subject *sub* and mask token *mask*."""
    template = "{sub} has an organic linker with SMILES string {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt9(sub, mask):
    """Return linker prompt variant 9 for subject *sub* and mask token *mask*."""
    template = "{sub} has SBUs and organic linkers {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt10(sub, mask):
    """Return linker prompt variant 10 for subject *sub* and mask token *mask*."""
    template = "{sub} is a type of polymer with metal clusters and organic linkers {mask}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt11(sub, mask):
    """Return linker prompt variant 11 for subject *sub* and mask token *mask*."""
    template = "{mask} is the structure of the organic linker in MOF {sub}."
    return template.format(sub=sub, mask=mask)


def create_linker_prompt12(sub, mask):
    """Return linker prompt variant 12 for subject *sub* and mask token *mask*."""
    template = "The structure of the organic linker of MOF {sub} is {mask}."
    return template.format(sub=sub, mask=mask)

In [None]:
processors = [create_linker_prompt1, create_linker_prompt2,
              create_linker_prompt3, create_linker_prompt4,
              create_linker_prompt5, create_linker_prompt6,
              create_linker_prompt7, create_linker_prompt8,
              create_linker_prompt9, create_linker_prompt10,
              create_linker_prompt11, create_linker_prompt12
             ]

## Test BERT

In [None]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token, \
                 subjectCol, relationCol, objectCol, top_k=100):
    """Probe a masked LM with one prompt per row of *df_sub*.

    Parameters:
        model, tokenizer: passed to the transformers "fill-mask" pipeline.
        prompt_processor: callable ``(subject, mask_token) -> prompt``.
        df_sub: DataFrame holding the triples to probe.
        mask_token: mask string inserted into every prompt.
        subjectCol, relationCol, objectCol: names of the columns of *df_sub*
            holding the subject, relation and gold object(s).
        top_k: number of predictions requested per prompt.

    Returns:
        A DataFrame with one row per (input row, predicted token) and columns
        SubjectEntity, Relation, ObjectEntity, prompt, predictedScore,
        predictedToken.
    """
    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    # Read the subject through the caller-supplied column name instead of a
    # hard-coded attribute name.
    prompts = PromptSet([
        prompt_processor(row[subjectCol], mask_token)
        for _, row in df_sub.iterrows()
    ])

    outputs = list(tqdm(pipe(prompts, batch_size=8), total=len(prompts)))

    # Pair rows with outputs by position: the DataFrame index labels are not
    # guaranteed to be 0..n-1 (e.g. after filtering or sampling).
    results = []
    for pos, (_, row) in enumerate(df_sub.iterrows()):
        prompt = prompts[pos]
        for output in outputs[pos]:
            results.append({
                'SubjectEntity': row[subjectCol],
                'Relation': row[relationCol],
                'ObjectEntity': row[objectCol],
                'prompt': prompt,
                'predictedScore': output['score'],
                'predictedToken': output['token_str'],
            })

    # Explicit columns keep the schema stable even when there are no results.
    return pd.DataFrame(results, columns=[
        'SubjectEntity', 'Relation', 'ObjectEntity',
        'prompt', 'predictedScore', 'predictedToken',
    ])

In [None]:
def hasPredicted(row):
    """Return 1 if the predicted token equals the row's gold object, else 0.

    ``row['ObjectEntity']`` is treated as a single gold value (or missing).
    The comparison is case-insensitive and ignores surrounding whitespace on
    both sides, so tokens decoded with a leading space still match.
    """
    gold = row['ObjectEntity']
    if pd.isna(gold):
        return 0

    token = str(row['predictedToken']).strip().lower()
    target = str(gold).strip().lower()
    return 1 if token == target else 0

In [None]:
def top(df, col, n=10):
    """Order *df* by *col* from largest to smallest and keep the first *n* rows."""
    ordered = df.sort_values(by=col, ascending=False)
    return ordered.iloc[:n]

In [None]:
def predictedOne(df, col):
    """Return 1 if column *col* of *df* has a positive sum, and 0 otherwise."""
    if df[col].sum() > 0:
        return 1
    return 0

In [None]:
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    """Compute accuracy@k over the subjects in *predicted_df*.

    *predicted_df* must have the columns 'SubjectEntity', 'predictedScore'
    and a binary 'predicted' column (1 if the predicted token matched a gold
    object, 0 otherwise). A subject counts as correct when at least one of
    its *at_k* highest-scoring predictions has ``predicted == 1``.

    Returns:
        dict with keys 'Relation', 'prompt_template', 'acc' (fraction of
        subjects that are correct) and 'at_k'.
    """
    # Sort once, then take the first at_k rows of every subject. This avoids
    # groupby(...).apply on the grouping column, which newer pandas deprecates.
    ranked = predicted_df.sort_values(by='predictedScore', ascending=False, kind='stable')
    topk_df = ranked.groupby('SubjectEntity', sort=False).head(at_k)

    # One boolean per subject: did any of its top-k predictions hit?
    hit_by_subject = topk_df.groupby('SubjectEntity')['predicted'].sum() > 0

    return {
        'Relation': relation,
        'prompt_template': prompt_template,
        'acc': hit_by_subject.astype(int).mean(),
        'at_k': at_k,
    }

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasLinker', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasLinker_BERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasLinker_BERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing MatBERT

In [None]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/model/matbert-base-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasLinker', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasLinker_MatBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasLinker_MatBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing MatSciBERT

In [None]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasLinker', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasLinker_MatSciBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasLinker_MatSciBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing SciBERT

In [None]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasLinker', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasLinker_SciBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasLinker_SciBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing RoBERTa

In [None]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the loaded model with every prompt template and record accuracy@k.
acc_list = []
# values of k at which evaluate_acc_n is called
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    # one row per (triple, predicted token); gold objects come from 'ObjectEntity'
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token, \
                     'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    # 1 where hasPredicted accepts the predicted token for the row, else 0
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)

    # template text with placeholders, used to label the accuracy rows
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        # despite its name, acc_df is the dict returned by evaluate_acc_n
        acc_df = evaluate_acc_n(result_df, 'hasLinker', prompt_template, at_k=at_k)
        acc_list.append(acc_df)
# NOTE(review): result_df is rebound on every iteration, so after the loop it
# holds only the last template's predictions -- confirm that is what the later
# result_df.to_csv(...) cell is meant to save.

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasLinker_RoBERTa.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasLinker_RoBERTa_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

# MOFs LM Probing for Space Group

In [None]:
from baseline import PromptSet

from tqdm.auto import tqdm
from nltk.corpus import stopwords

import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, logging

logging.set_verbosity_error()  # avoid irritating transformers warnings

## Read in the Triples

In [None]:
file = "../data/mof_spaceGroup.csv"

In [None]:
df = pd.read_csv(file)
df.head()

In [None]:
df.shape

## Probing BERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Create Prompts

In [None]:
def create_spaceGroup_prompt1(sub, mask):
    """Build prompt variant 1: the subject "has the space group symbol <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"{sub} is an MOF. {sub} has the space group symbol {mask}."
    return prompt
    
def create_spaceGroup_prompt2(sub, mask):
    """Build prompt variant 2: the subject "has an space group code <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    # NOTE(review): "an space group" looks like a typo for "a space group";
    # the wording is left untouched so existing results stay comparable.
    prompt = f"{sub} is an MOF. {sub} has an space group code {mask}."
    return prompt

def create_spaceGroup_prompt3(sub, mask):
    """Build prompt variant 3: the subject "has an space group <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    # NOTE(review): "an space group" looks like a typo for "a space group";
    # the wording is left untouched so existing results stay comparable.
    prompt = f"{sub} is an MOF. {sub} has an space group {mask}."
    return prompt

def create_spaceGroup_prompt4(sub, mask):
    """Build prompt variant 4: "The space group of <sub> is <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"{sub} is an MOF. The space group of {sub} is {mask}."
    return prompt
    
def create_spaceGroup_prompt5(sub, mask):
    """Build prompt variant 5: the subject "is in the space group <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"{sub} is an MOF. {sub} is in the space group {mask}."
    return prompt
    
def create_spaceGroup_prompt6(sub, mask):
    """Build prompt variant 6: spelled-out framework intro, then "<sub>'s space group is <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    # NOTE(review): "an metal organic framework" looks like a typo for "a metal ...";
    # the wording is left untouched so existing results stay comparable.
    prompt = f"{sub} is an metal organic framework. {sub}'s space group is {mask}."
    return prompt
    
def create_spaceGroup_prompt7(sub, mask):
    """Build prompt variant 7: the subject "is an type of <mask> space group".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    # NOTE(review): "an type of" looks like a typo for "a type of";
    # the wording is left untouched so existing results stay comparable.
    prompt = f"{sub} is an MOF. {sub} is an type of {mask} space group."
    return prompt
    
def create_spaceGroup_prompt8(sub, mask):
    """Build prompt variant 8: the subject "is in the class of <mask> space group".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"{sub} is an MOF. {sub} is in the class of {mask} space group."
    return prompt
    
def create_spaceGroup_prompt9(sub, mask):
    """Build prompt variant 9: SBU/linker intro, then "The space group of <sub> is <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"{sub} has SBUs and organic linkers. The space group of {sub} is {mask}."
    return prompt
    
def create_spaceGroup_prompt10(sub, mask):
    """Build prompt variant 10: "The <mask> describes <sub>'s space group".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"{sub} is an MOF. The {mask} describes {sub}'s space group."
    return prompt
    
def create_spaceGroup_prompt11(sub, mask):
    """Build prompt variant 11: "metal-organic framework" intro, then "The space group of <sub> is <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"{sub} is a metal-organic framework. The space group of {sub} is {mask}."
    return prompt
    
def create_spaceGroup_prompt12(sub, mask):
    """Build prompt variant 12: single sentence "As a MOF, <sub> has the space group <mask>".

    `sub` is inserted as the subject name and `mask` as the slot to be filled.
    """
    prompt = f"As a MOF, {sub} has the space group {mask}."
    return prompt

In [None]:
# Every space-group prompt builder, in numeric order; the run cells iterate over this list.
processors = [
    create_spaceGroup_prompt1,
    create_spaceGroup_prompt2,
    create_spaceGroup_prompt3,
    create_spaceGroup_prompt4,
    create_spaceGroup_prompt5,
    create_spaceGroup_prompt6,
    create_spaceGroup_prompt7,
    create_spaceGroup_prompt8,
    create_spaceGroup_prompt9,
    create_spaceGroup_prompt10,
    create_spaceGroup_prompt11,
    create_spaceGroup_prompt12,
]

## Check Whether the Predicted Tokens Match

In [None]:
# Compute a column indicating whether the predictedToken in the ObjectEntity list
def hasPredicted(row):
    """Return 1 if the predicted token is a word of one of the gold object entities, else 0.

    `row` must support `row['predictedToken']` and `row['ObjectEntity']`
    (e.g. a DataFrame row passed by `DataFrame.apply(..., axis=1)`).
    `ObjectEntity` is treated as a comma-separated list of entities; each
    entity is split on whitespace and compared case-insensitively with the token.

    A missing (NaN/None) token or object entity yields 0 instead of raising.
    """
    objectEntities = row['ObjectEntity']
    token = row['predictedToken']
    if pd.isna(objectEntities) or pd.isna(token):
        return 0

    # Fill-mask pipelines for byte-level BPE tokenizers (e.g. RoBERTa) can return
    # `token_str` with a leading space; without stripping it, such a token could
    # never equal a whitespace-split word and every prediction would count as a miss.
    needle = str(token).strip().lower()
    if not needle:
        return 0

    # str() keeps non-string cells (e.g. numeric values parsed by read_csv) from crashing.
    for entity in str(objectEntities).split(","):
        if needle in entity.lower().split():
            return 1

    return 0

## Test BERT

In [None]:
def prompt_probe(model, tokenizer, prompt_processor, df_sub, mask_token,
                 subjectCol, relationCol, objectCol, top_k=100):
    """Run a fill-mask probe over every row of `df_sub` and return the predictions.

    Parameters:
        model, tokenizer: passed to the fill-mask `pipeline`.
        prompt_processor: callable `(subject, mask_token) -> prompt string`.
        df_sub: DataFrame holding the triples to probe.
        mask_token: mask string inserted into each prompt.
        subjectCol, relationCol, objectCol: names of the columns in `df_sub`
            that hold subject, relation and object.
        top_k: number of predictions requested per prompt.

    Returns:
        A DataFrame with one row per (input row, prediction) and the columns
        SubjectEntity, Relation, ObjectEntity, prompt, predictedScore, predictedToken.
    """
    pipe = pipeline(
        task="fill-mask",
        model=model,
        tokenizer=tokenizer,
        top_k=top_k,
    )

    # Read the configured columns; the subject column used to be hard-coded as
    # `row.SubjectEntity`, which silently ignored `subjectCol`.
    subjects = df_sub[subjectCol].tolist()
    relations = df_sub[relationCol].tolist()
    objects = df_sub[objectCol].tolist()

    prompt_texts = [prompt_processor(subject, mask_token) for subject in subjects]
    prompts = PromptSet(prompt_texts)

    outputs = []
    for out in tqdm(pipe(prompts, batch_size=8), total=len(prompt_texts)):
        outputs.append(out)

    # Pair rows with outputs by position. Indexing `outputs` with the DataFrame's
    # index labels only works when the index is exactly 0..n-1 and breaks (or
    # mispairs rows) for filtered or re-indexed frames.
    results = []
    for subject, relation, obj, prompt, predictions in zip(
            subjects, relations, objects, prompt_texts, outputs):
        for prediction in predictions:
            results.append({
                'SubjectEntity': subject,
                'Relation': relation,
                'ObjectEntity': obj,
                'prompt': prompt,
                'predictedScore': prediction['score'],
                'predictedToken': prediction['token_str'],
            })

    # Explicit columns keep the schema stable even when there are no results.
    return pd.DataFrame(results, columns=['SubjectEntity', 'Relation', 'ObjectEntity',
                                          'prompt', 'predictedScore', 'predictedToken'])

In [None]:
# return the top-k rows based on predictedScore
def top(df, col, n=10):
    """Return the first `n` rows of `df` after sorting by `col` in descending order."""
    ranked = df.sort_values(by=col, ascending=False)
    return ranked[:n]

In [None]:
# check whether the predicted is 1 for each group
def predictedOne(df, col):
    """Return 1 when the values in `df[col]` sum to more than zero, otherwise 0."""
    return 1 if df[col].sum() > 0 else 0

In [None]:
# Assume the predicted_df has a binary column 'predicted' indicating whether ObjectEntity has been predicted
# 1 for yes, 0 for no
def evaluate_acc_n(predicted_df, relation, prompt_template, at_k=5):
    """Compute accuracy@k over subjects for one prompt.

    A subject counts as correct when at least one of its `at_k` highest-scoring
    predictions has `predicted` > 0. The accuracy is the fraction of subjects
    that are correct.

    Parameters:
        predicted_df: DataFrame with the columns 'SubjectEntity',
            'predictedScore' and 'predicted' (1 for a hit, 0 for a miss).
        relation: relation label copied into the result.
        prompt_template: prompt label copied into the result.
        at_k: number of top-scoring predictions considered per subject.

    Returns:
        dict with the keys 'Relation', 'prompt_template', 'acc' and 'at_k'.
    """
    # Sort once, then take the first at_k rows of every subject. This avoids
    # `groupby(...).apply(...)`, whose handling of the grouping column and of
    # scalar-returning functions differs between pandas versions (operating on
    # the grouping columns is deprecated since pandas 2.2). A stable sort keeps
    # the incoming order for tied scores, making the result deterministic.
    ranked = predicted_df.sort_values('predictedScore', ascending=False, kind='mergesort')
    topk_df = ranked.groupby('SubjectEntity', sort=False).head(at_k)

    # One boolean per subject: did any of its top-k predictions hit?
    hit_per_subject = topk_df.groupby('SubjectEntity')['predicted'].sum() > 0

    result = {}
    result['Relation'] = relation
    result['prompt_template'] = prompt_template
    result['acc'] = float(hit_per_subject.mean())
    result['at_k'] = at_k

    return result

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
%%time
# Probe the current model with every prompt builder and score accuracy@k.
acc_list = []      # one accuracy record per (prompt, k) pair
all_results = []   # per-prompt prediction frames; kept so none is lost when result_df is rebound
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token,
                             'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)
    all_results.append(result_df)

    # Label for this prompt; it always uses the literal '[MASK]' placeholder,
    # independent of the tokenizer's actual mask token.
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasSpaceGroup', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

# Previously result_df only held the predictions of the LAST prompt after the loop,
# so the CSV written from it dropped every other prompt. Keep them all; the 'prompt'
# column tells the prompts apart.
result_df = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasSpaceGroup_BERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_BERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing MatBERT

In [None]:
# MatBERT
tokenizer = AutoTokenizer.from_pretrained("../model/model/matbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("../model/model/matbert-base-cased")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the current model with every prompt builder and score accuracy@k.
acc_list = []      # one accuracy record per (prompt, k) pair
all_results = []   # per-prompt prediction frames; kept so none is lost when result_df is rebound
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token,
                             'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)
    all_results.append(result_df)

    # Label for this prompt; it always uses the literal '[MASK]' placeholder,
    # independent of the tokenizer's actual mask token.
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasSpaceGroup', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

# Previously result_df only held the predictions of the LAST prompt after the loop,
# so the CSV written from it dropped every other prompt. Keep them all; the 'prompt'
# column tells the prompts apart.
result_df = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasSpaceGroup_MatBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_MatBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing MatSciBERT

In [None]:
# MatSciBERT
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModelForMaskedLM.from_pretrained('m3rg-iitd/matscibert')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the current model with every prompt builder and score accuracy@k.
acc_list = []      # one accuracy record per (prompt, k) pair
all_results = []   # per-prompt prediction frames; kept so none is lost when result_df is rebound
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token,
                             'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)
    all_results.append(result_df)

    # Label for this prompt; it always uses the literal '[MASK]' placeholder,
    # independent of the tokenizer's actual mask token.
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasSpaceGroup', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

# Previously result_df only held the predictions of the LAST prompt after the loop,
# so the CSV written from it dropped every other prompt. Keep them all; the 'prompt'
# column tells the prompts apart.
result_df = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasSpaceGroup_MatSciBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_MatSciBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing SciBERT

In [None]:
# SciBERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')
model = AutoModelForMaskedLM.from_pretrained('allenai/scibert_scivocab_cased')

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the current model with every prompt builder and score accuracy@k.
acc_list = []      # one accuracy record per (prompt, k) pair
all_results = []   # per-prompt prediction frames; kept so none is lost when result_df is rebound
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token,
                             'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)
    all_results.append(result_df)

    # Label for this prompt; it always uses the literal '[MASK]' placeholder,
    # independent of the tokenizer's actual mask token.
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasSpaceGroup', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

# Previously result_df only held the predictions of the LAST prompt after the loop,
# so the CSV written from it dropped every other prompt. Keep them all; the 'prompt'
# column tells the prompts apart.
result_df = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasSpaceGroup_SciBERT.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_SciBERT_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)

## Probing RoBERTa

In [None]:
# RoBERTa
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

In [None]:
mask_token = tokenizer.mask_token
mask_token

## Create the Prompt Pipeline

In [None]:
pipe = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=100,
)

## Run

In [None]:
%%time
# Probe the current model with every prompt builder and score accuracy@k.
acc_list = []      # one accuracy record per (prompt, k) pair
all_results = []   # per-prompt prediction frames; kept so none is lost when result_df is rebound
atks = [1, 5, 10, 20, 50, 100]
for processor in processors:
    result_df = prompt_probe(model, tokenizer, processor, df, mask_token,
                             'SubjectEntity', 'Relation', 'ObjectEntity', top_k=100)
    result_df['predicted'] = result_df.apply(hasPredicted, axis=1)
    all_results.append(result_df)

    # Label for this prompt; it always uses the literal '[MASK]' placeholder,
    # independent of the tokenizer's actual mask token.
    prompt_template = processor('{SubjectEntity}', '[MASK]')
    for at_k in atks:
        acc_df = evaluate_acc_n(result_df, 'hasSpaceGroup', prompt_template, at_k=at_k)
        acc_list.append(acc_df)

# Previously result_df only held the predictions of the LAST prompt after the loop,
# so the CSV written from it dropped every other prompt. Keep them all; the 'prompt'
# column tells the prompts apart.
result_df = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()

In [None]:
hasType_df = pd.DataFrame(acc_list)

In [None]:
hasType_df.sample(10)

In [None]:
result_df.to_csv("../data/predicted_hasSpaceGroup_RoBERTa.csv", index=None)

In [None]:
hasType_df.to_csv("../data/predicted_hasSpaceGroup_RoBERTa_eval.csv", index=None)

In [None]:
hasType_df[hasType_df['at_k'] == 1].sort_values('acc', ascending=False)

In [None]:
hasType_df.sort_values('acc', ascending=False)