In [1]:
import json 
from collections import Counter
from typing import List
import regex as re


def manual_prefix_suffix(key):
    matches = re.findall('(.*)(?:\-|\_)\(?(\d+b)\)?', key)
    return matches[0] if matches else (None, None)

path = '/home/gridsan/afogelson/osfm/paper_analysis_toolkit/data/foundation_models.json'
prompt = 'The following sentences are taken from an academic publication. Within these sentences, there are citations, one of which is highlighted using <cite></cite>. This citation references the foundation model {foundation_model_name}. However, this foundation model has a few variants, and we\'d like to determine which variant is being cited. The variants are given by their shorthands {comma_variant}. Only answer definitively if there is clear evidence in the sentence. Otherwise, answer UNCLEAR. Format your response as JSON {{ "answer": "{piped_variant}" | "UNCLEAR" | "ALL"}}'

with open(path, 'r') as f:
    fm = json.load(f)
    mapping = {key: fm[key]['paperId'] for key in fm}
    
id_to_keys = {id: [] for id in {item for _, item in mapping.items()}}
for key, item in mapping.items():
    id_to_keys[item].append(key)

nonunique_id_to_keys = {key:item for key, item in id_to_keys.items() if len(item) > 1}

In [2]:
foundation_model_name = 'chatgpt'
variant_list = ['gpt3.5', 'gpt4', 'gptturbo']

print(prompt)

The following sentences are taken from an academic publication. Within these sentences, there are citations, one of which is highlighted using <cite></cite>. This citation references the foundation model {foundation_model_name}. However, this foundation model has a few variants, and we'd like to determine which variant is being cited. The variants are given by their shorthands {comma_variant}. Only answer definitively if there is clear evidence in the sentence. Otherwise, answer UNCLEAR. Format your response as JSON {{ "answer": "{piped_variant}" | "UNCLEAR" | "ALL"}}


In [3]:
class DuplicateFoundationModel:
    def __init__(self, id, foundation_models_path :
        self.keys =  list({key for key, item in mapping.items() if item == id})     
        self.id = id
        self.model_name, self.variants = self.extract_suffixes()

    def extract_suffixes(self):
        stripped_keys = list(map(lambda s: re.findall('\d+_(.*)', s)[0], self.keys))
        
        print(stripped_keys)
        
        names, variants = list(zip(*map(manual_prefix_suffix, stripped_keys)))
        if len(set(names)) == 1 and None not in names:
            model_name = next(iter(names))
        else:
            max_idx = 0
            for chars in zip(*stripped_keys):
                if len(set(chars)) == 1:
                    max_idx += 1
                else:
                    break
            
            if max_idx == 0:
                return None, self.keys
            
            formatter = lambda s: re.sub('\(*\)*', '', s)
            model_name =  formatter(next(iter(stripped_keys))[:max_idx])
            variants = [model_name + formatter(key[max_idx:]) for key in stripped_keys]
        
        return model_name, stripped_keys
    
    def get_prompt(self):
        piped_variant = '" | "'.join(variant_list)
        comma_variant = '"' + '", "'.join(variant_list) + '"'
        return prompt.format(foundation_model_name = self.model_name, piped_variant = piped_variant, comma_variant = comma_variant)



In [4]:
dup_models = [DuplicateFoundationModel(id) for id in nonunique_id_to_keys]

delineator_mapping = {model.id: (model.model_name, model.variants) for model in dup_models}
with open('output.txt', 'w') as f:
    json.dump(delineator_mapping, f, indent = 2)
    


['gpt3-6.7b_+_mup', 'gpt3-6.7b_(rerun_of_original)']
['zoneout_+_variational_lstm_(wt2)', 'pointer_sentinel-lstm_(medium)']
['mamba-2.8b', 'mamba-24m_(sc09)']
['flan-palm_540b', 'flan-t5_11b']
['resnet-110_(cifar-10)', 'resnet-152_(imagenet)']
['big-little_net_(vision)', 'big-little_net_(speech)', 'big-little_net']
['spalm_+_relationlm', 'transformerxl+relationlm']
['vit-huge_14', 'vit-base_32']
['resnet-50_billion-scale', 'resnext-101_billion-scale']
['igpt-xl', 'igpt-l']
['ct-mos_+_dynamiceval_(wt2)', 'ct-mos_(wt2)']
['ankh_large', 'ankh_base']
['m6-10b', 'm6-100b']
['resnet-1001', 'resnet-200']
['gshard_(600b)', 'gshard_(dense)']
['glove_(32b)', 'glove_(6b)']
['rnn_(sgd+clr)', 'rnn_(sgd+clr)_(ptb)']
['data2vec_(speech)', 'data2vec_(vision)', 'data2vec_(language)']
['tcn_(p-mnist)', 'tcn_(148m)']
['ldm-1.45b', 'stable_diffusion_(ldm-kl-8-g)']
['glee', 'boxes']
['rnn_500_10_+_rt09_lm_(nist_rt05)', 'kn5_lm_+_rnn_400_10_(wsj)']
['dit-xl_2_+_discriminator_guidance', 'discriminator_guidan

In [5]:
for model in dup_models:
    print(model.get_prompt())

The following sentences are taken from an academic publication. Within these sentences, there are citations, one of which is highlighted using <cite></cite>. This citation references the foundation model gpt3-6.7b_. However, this foundation model has a few variants, and we'd like to determine which variant is being cited. The variants are given by their shorthands "gpt3.5", "gpt4", "gptturbo". Only answer definitively if there is clear evidence in the sentence. Otherwise, answer UNCLEAR. Format your response as JSON { "answer": "gpt3.5" | "gpt4" | "gptturbo" | "UNCLEAR" | "ALL"}
The following sentences are taken from an academic publication. Within these sentences, there are citations, one of which is highlighted using <cite></cite>. This citation references the foundation model None. However, this foundation model has a few variants, and we'd like to determine which variant is being cited. The variants are given by their shorthands "gpt3.5", "gpt4", "gptturbo". Only answer definitivel