In [8]:
from collections import defaultdict
import pandas as pd

In [51]:
# Latest for GPT-3.5: 'output_data/parsed_paper_data_2023-05-17_15-50-57.csv'
# Latest for GPT-4: 'output_data/parsed_paper_data_2023-05-17_16-24-14.csv'
df = pd.read_csv('output_data/parsed_paper_data_2023-05-17_16-24-14.csv', keep_default_na=False)
df

Unnamed: 0.1,Unnamed: 0,Author(s),Publication date,Reference,Link,Number of hardware units,Hardware model
0,3,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,"The 8B model was trained on a v4-128, the 62B ...","v4 TPU (""We used the t5x framework (Roberts et..."
1,4,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,https://arxiv.org/abs/2204.02311,"6144 (""We trained PaLM on 6144 TPU v4 chips"")","TPU v4 (""We trained PaLM on 6144 TPU v4 chips"")"
2,6,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me...",2022-03-29,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,,"TPUv3/TPUv4 (""All models in this analysis have..."
3,7,"Jiahui Yu, Yuanzhong Xu, Jing Yu Koh, Thang Lu...",2022-06-22,Scaling Autoregressive Models for Content-Rich...,https://arxiv.org/abs/2206.10789v1,,"CloudTPUv4 (""We implement our models in Lingvo..."
4,8,"Romal Thoppilan, Daniel De Freitas, Jamie Hall...",2022-02-10,LaMDA: Language Models for Dialog Applications,https://arxiv.org/abs/2201.08239,"1024 (""We pre-trained LaMDA on 1024 TPU-v3 chi...","TPU-v3 (""We pre-trained LaMDA on 1024 TPU-v3 c..."
5,21,"Ping Yu, Mikel Artexte, Myle Ott, Sam Shleife...",2022-04-14,Efficient Language Modeling with Sparse all-MLP,https://arxiv.org/abs/2203.06850,"32 (""Num of GPUs: 32"")","Nvidia 32G V100 (""All the models are trained o..."
6,100,"Tom B. Brown, Benjamin Mann, Nick Ryder, Melan...",2020-05-28,Language models are Few- Shot Learners,https://arxiv.org/abs/2005.14165,,"V100 (""All models were trained on V100 GPU’s o..."
7,103,"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu,...",2020-06-30,GShard: Scaling Giant Models with Conditional ...,https://arxiv.org/abs/2006.16668,"2048 (""We demonstrate that such a giant model ...","TPU v3 (""We demonstrate that such a giant mode..."
8,108,"Zhenzhong Lan, Mingda Chen, Sebastian Goodman,...",2020-02-09,ALBERT: A Lite BERT for Self-supervised Learni...,https://arxiv.org/abs/1909.11942,The number of TPUs used for training ranged fr...,"Cloud TPU V3 (""Training was done on Cloud TPU ..."
9,111,"Han Cai, Chuang Gan, Tianzhe Wang, Zhekai Zhan...",2020-04-29,Once for all: Train one network and specialize...,https://arxiv.org/abs/1908.09791,"32 GPUs (""The full network is trained for 180 ...","V100 (""The whole training process takes around..."


In [52]:
df['Hardware model']

0     v4 TPU ("We used the t5x framework (Roberts et...
1       TPU v4 ("We trained PaLM on 6144 TPU v4 chips")
2     TPUv3/TPUv4 ("All models in this analysis have...
3     CloudTPUv4 ("We implement our models in Lingvo...
4     TPU-v3 ("We pre-trained LaMDA on 1024 TPU-v3 c...
5     Nvidia 32G V100 ("All the models are trained o...
6     V100 ("All models were trained on V100 GPU’s o...
7     TPU v3 ("We demonstrate that such a giant mode...
8     Cloud TPU V3 ("Training was done on Cloud TPU ...
9     V100 ("The whole training process takes around...
10    TPUv3 ("it could be trained using a standard c...
11                        Nvidia P100 ("1 Nvidia P100")
12    First-generation TPUs and second-generation TP...
13    NVIDIA K80 ("We use asynchronous gradient desc...
14    Tesla K40 ("We trained our models using Tensor...
15    P100 ("In the Mobile setting, we use distribut...
16    NVIDIA P100 ("We trained our models on one mac...
17    NVIDIA GeForce GTX 1080 ("With sparse and 

In [53]:
key = 'Hardware model'
answers = df[key].values
i = 0
answer = answers[i]

In [54]:
# Get the supporting quote from the answer string
# The supporting quote should be in parentheses
# E.g. 'TPU v4 ( "With Pathways, we trained a 540B parameter language model on 6144 TPU v4 chips" )'
parsed_answer = answer.split("(", maxsplit=1)
# E.g. ['TPU v4 ', ' "With Pathways, we trained a 540B parameter language model on 6144 TPU v4 chips" )']
parsed_answer = parsed_answer[-1].strip('"() ')
quote = parsed_answer
quote

'We used the t5x framework (Roberts et al., 2022) and trained our models with v4 TPU on Google Cloud.'

In [55]:
with open('input_data/solving_quantitative_reasoning_problems_with_language_models.txt', 'r') as f:
    text = f.read()

In [56]:
quote in text.replace("\n", " ")

True

In [57]:
def extract_last_parenthetical(string):
    """
    Extract the parenthetical from a string corresponding to the last occurring closing parenthesis.

    Example:
        string = "This is a (sample) string (to extract (substring))"
        extract_last_parenthetical(string) -> "to extract (substring)"
    
    """
    stack = []
    substring = ''
    for i, char in enumerate(string[::-1]):
        if char == ')':
            stack.append(i)
        elif char == '(':
            idx = stack.pop()
            if len(stack) == 0:
                substring = string[-i:-(idx+1)]
                break     
    return substring

In [58]:
extract_last_parenthetical("This is a (sample) string (to extract (substring))")

'to extract (substring)'

In [59]:
quote_matches = defaultdict(int)
for i, row in df.iterrows():
    for key in ['Number of hardware units', 'Hardware model']:
        answer = row[key]
        if "N/A" in answer: continue
        # Get the supporting quote from the answer string
        # The supporting quote should be in parentheses
        # E.g. 'TPU v4 ( "With Pathways, we trained a 540B parameter language model on 6144 TPU v4 chips" )'
        quote = extract_last_parenthetical(answer)
        quote = quote.strip('"() .')

        paper_title = row['Reference'].replace(' ', '_').replace(':', '').replace('"', '').lower()
        with open('input_data/' + paper_title + '.txt', 'r') as f:
            text = f.read()

        if quote.lower() in text.lower().replace("\n", " "):
            quote_matches[key] += 1
        else:
            print(f"Quote not found in {paper_title}: {quote}")

Quote not found in efficient_language_modeling_with_sparse_all-mlp: Num of GPUs: 32
Quote not found in gshard_scaling_giant_models_with_conditional_computation_and_automatic_sharding: We demonstrate that such a giant model can efficiently be trained on 2048 TPU v3 accelerators in 4 days
Quote not found in gshard_scaling_giant_models_with_conditional_computation_and_automatic_sharding: We demonstrate that such a giant model can efficiently be trained on 2048 TPU v3 accelerators in 4 days
Quote not found in albert_a_lite_bert_for_self-supervised_learning_of_language_representations.: Training was done on Cloud TPU V3
Quote not found in an_image_is_worth_16x16_words_transformers_for_image_recognition_at_scale: it could be trained using a standard cloud TPUv3 with 8 cores in approximately 30 days
Quote not found in an_image_is_worth_16x16_words_transformers_for_image_recognition_at_scale: it could be trained using a standard cloud TPUv3 with 8 cores in approximately 30 days
Quote not found

In [50]:
quote_matches

defaultdict(int, {'Number of hardware units': 8, 'Hardware model': 11})

Checking whether the flagged cases for GPT-4 actually pass if we allow some reasonable flexibility (but not allowing any hallucination or change of meaning):
- [QUESTIONABLE] Quote not found in efficient_language_modeling_with_sparse_all-mlp: Num of GPUs: 32
  - It's not a direct quote but it's a correct read from a Table if you look at the PDF.
  - The table has "Num of", "GPUs", and "32" in separate positions of the text.
- [PASSED] Quote not found in gshard_scaling_giant_models_with_conditional_computation_and_automatic_sharding: We demonstrate that such a giant model can efficiently be trained on 2048 TPU v3 accelerators in 4 days
  - Actual quote: "We demonstrate that such a giant model can efﬁcienctly be trained on 2048 TPU v3 accelerators in 4 days"
  - There's a special character, "ﬁ"
- [PASSED] Quote not found in albert_a_lite_bert_for_self-supervised_learning_of_language_representations.: Training was done on Cloud TPU V3
  - Actual quote: "Train-ing was done on Cloud TPU V3."
  - Hyphenation across line break.
- [PASSED] Quote not found in an_image_is_worth_16x16_words_transformers_for_image_recognition_at_scale: it could be trained using a standard cloud TPUv3 with 8 cores in approximately 30 days
  - Actual quote: "it could be trained using a standard cloud TPUv3 with 8 cores in ap-proximately 30 days."
  - Hyphenation across line break.
- [PASSED] Quote not found in mastering_chess_and_shogi_by_self-play_with_a_general_reinforcement_learning_algorithm: Training proceeded for 700,000 steps (mini-batches of size 4,096) starting from randomly initialised parameters, using 5,000 first-generation TPUs (15) to generate self-play games and 64 second-generation TPUs to train the neural networks
  - Actual quote: "Training proceeded for 700,000 steps (mini-batches of size 4,096) starting from randomly initialised parameters, using 5,000 ﬁrst-generation TPUs (15) to generate self-play games and 64 second-generation TPUs to train the neural networks."
  - Looks correct, probably a punctuation mismatch
- [PASSED] Quote not found in outrageously_large_neural_networks_the_sparsely-gated_mixture-of-experts_layer: We trained our models using TensorFlow (Abadi et al., 2016) on clusters containing 16-32 Tesla K40 GPUs
  - Actual quote: "We trained our models using TensorFlow (Abadi et al., 2016) on clus-ters containing 16-32 Tesla K40 GPUs."
  - Hyphenation again

Checking whether the flagged cases for GPT-3.5 actually pass if we allow some reasonable flexibility (but not allowing any hallucination or change of meaning):

- [PASSED] Quote not found in scaling_autoregressive_models_for_content-rich_text-to-image_generation: We implement our models in Lingvo and scale with GSPMD on CloudTPUv4 hardware for both training and inference
  - The quote is there if you remove references like "[39]"
- [PASSED] Quote not found in gshard_scaling_giant_models_with_conditional_computation_and_automatic_sharding: Figure 1: "Our best quality dense single Transformer model (2.3B parameters) achieving ∆BLEU of 6.1, was trained with GPipe [15] on 2048 TPU v3 cores for 6 weeks or total of 235.5 TPU v3 core-years
  - This is in the paper: "Our best quality dense single Transformer model (2.3B parameters) achieving ∆BLEU of 6.1, was trained with GPipe [15] on 2048 TPU v3 cores for 6 weeks or total of 235.5 TPU v3 core-years."
  - And that quote is part of the caption for Figure 1 in the paper. GPT just mashed them together.
- [PASSED] Quote not found in albert_a_lite_bert_for_self-supervised_learning_of_language_representations.: All the model updates use a batch size of 4096 and a LAMB optimizer with learning rate 0.00176 (You et al., 2019). We train all models for 125,000 steps unless otherwise specified. Training was done on Cloud TPU V3
  - This is an exact quote, I think it's just an issue with missing spaces on newlines
- [PASSED] Quote not found in mastering_chess_and_shogi_by_self-play_with_a_general_reinforcement_learning_algorithm: We trained a separate instance of AlphaZero for each game. Training proceeded for 700,000 steps (mini-batches of size 4,096) starting from randomly initialised parameters, using 5,000 first-generation TPUs (15) to generate self-play games and 64 second-generation TPUs to train the neural networks
  - Actual quote: "We trained a separate instance of AlphaZero for each game. Training proceeded for 700,000 steps (mini-batches of size 4,096) starting from randomly initialised parameters, using 5,000 ﬁrst-generation TPUs (15) to generate self-play games and 64 second-generation TPUs to train the neural networks"
  - Looks correct, maybe an issue with newline vs. spaces again
- [QUESTIONABLE] Quote not found in mastering_chess_and_shogi_by_self-play_with_a_general_reinforcement_learning_algorithm: AlphaZero was executed on a single machine with 4 TPUs during evaluation
  - Closest I could find: "During evaluation, AlphaZero selects moves greedily with respect to the root visit count. Each MCTS was executed on a single machine with 4 TPUs."
  - Hmm, this is borderline. 
- [PASSED] Quote not found in attention_is_all_you_need: Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature
  - Actual quote: "Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature"
  - Looks correct, maybe an issue with newline vs. spaces again