# TODO
While we're at it, let's also look for keywords related to:
- GPU/TPU hardware
- training time
- inference compute

In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20211012-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 3.7 MB/s 
[?25hCollecting cryptography
  Downloading cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 30.7 MB/s 
Installing collected packages: cryptography, pdfminer.six
Successfully installed cryptography-36.0.1 pdfminer.six-20211012


In [None]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet
import pandas as pd
df = pd.read_csv('https://docs.google.com/spreadsheets/d/1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4/export?format=csv#gid=0')

# Only papers published after January 1st of this year are kept
year_start = 2017

# Parse the publication dates (day-first notation); values that cannot be
# parsed become NaT and are dropped by the date filter below
df['Publication date'] = pd.to_datetime(df['Publication date'],
                                        errors='coerce',
                                        dayfirst=True)

# Filter for papers published after the start of `year_start`
df = df[df['Publication date'] > f'{year_start}-01-01']


# Keep only bibliographical data
df = df.filter(['Author(s)', 'Publication date', 'Reference', 'Link'])
df = df[df['Link'].notna()]
# Keep only links which forward to a pdf or an arxiv link.
# The dot is escaped so that only a literal ".pdf" suffix qualifies, and no
# capture group is used (pandas warns about match groups in str.contains).
# .copy() detaches the result from the original frame so that later
# df.loc[...] assignments write to an independent DataFrame.
df = df[df['Link'].str.contains(r'arxiv|\.pdf$', regex=True)].copy()


import requests
from pdfminer.high_level import extract_text
import re


# Set patterns and keys.
# Each regex in `patterns` is paired by position with the DataFrame column
# name in `keys` that will hold its matches.
patterns = [
            # Whole line containing an '@' followed by a dot; also catches
            # grouped addresses such as "{a, b}@cs.example.edu"
            r".*@.*\..*",
            # Plain e-mail address
            r"[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+",
            # Obfuscated address such as "name at host.tld" or "name[at]host.tld".
            # "at" must be surrounded by non-alphanumeric delimiters on the same
            # line; otherwise ordinary words that contain "at" (for example
            # "statements.Your") are reported as addresses.
            r"[a-zA-Z0-9._-]+[^a-zA-Z0-9\n]at[^a-zA-Z0-9\n][a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+",
            # Sentences (text between periods) mentioning the given keyword
            r"([^.]*?GPU[^.]*\.)",
            r"([^.]*?TPU[^.]*\.)",
            r"([^.]*?NVIDIA[^.]*\.)"
            ]
keys = ['email_1', 'email_2', 'email_3', 'GPU', 'TPU', 'NVIDIA']

# Kept as a (patterns, keys) pair; consumers iterate it with zip(*tuples)
tuples = (patterns, keys)

# Enable for test running with the first ten papers
# df = df[:10]

# For every paper: download its PDF, extract the text, store all regex matches
# in the per-pattern columns, and draft the outreach e-mail. Papers whose
# download or text extraction fails are reported and skipped (no e-mail draft).
for i, row in df.iterrows():

  url = row['Link']

  # Point arXiv abstract pages at the PDF instead. Only the arXiv "/abs/" path
  # segment is rewritten, so other URLs that merely contain "abs" stay intact.
  url = url.replace('arxiv.org/abs/', 'arxiv.org/pdf/')
  print(f"Looking into {row['Reference']}")

  try:
    # The timeout keeps a single unresponsive server from stalling the run
    response = requests.get(url, timeout=30)
    # Treat HTTP errors (404, 429, ...) as download failures instead of saving
    # the error page and handing it to the PDF parser
    response.raise_for_status()
  except Exception as e:
    print(f"There's something wrong with downloading: {e}")
    continue

  # Mode "wb" truncates the file, so each download overwrites the previous one;
  # the context manager closes the handle even if the write fails
  with open("download.pdf", "wb") as file:
    file.write(response.content)

  try:
    text = extract_text('download.pdf')

    for pattern, key in zip(*tuples):
      matches = re.findall(pattern, text)
      print(matches)

      # All hits for one pattern share a single ';'-separated cell
      # (an empty string when nothing matched)
      df.loc[i, key] = ';'.join(matches)
  except Exception as e:
    print(f"There's something wrong with extracting the text: {e}")
    continue

  print("---")

  df.loc[i, 'email_subject'] = f"Trends in Machine Learning - Report your data from {row['Reference']}"
  df.loc[i, 'email_body'] = f"""
Dear all,

We are writing to you about your paper: “{row['Reference']}”. We’ve enjoyed reading it and we would like to include information about it in our public dataset (https://docs.google.com/spreadsheets/d/1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4/edit#gid=0) of milestone systems in AI.

Could you share some more information about the primary system developed in the paper with us? We’re looking for estimates of:
(A) Parameter count
(B) Training compute (any type of metric is fine: GPU days and the hardware, number of operations, number of FLOPs, etc.)
    (B.1) The used number representation during the training (e.g., float16, float32, bfloat16).
(C) Inference compute (number of operations/FLOPs per forward pass)
(D) The size of the training dataset

This information will help us in our investigation of trends in parameters, compute, and data usage in Machine Learning. 

We understand that some of this information is already available in your paper - your answer will help us guarantee we didn’t misinterpret the results in the paper.

Feel free to just answer this email or fill out this minimal form (https://forms.gle/kPs8xoPif2H56DCz5). We would be grateful for any kind of information.
If you have any questions, feel free to get back to us.

This is a joint project by Jaime Sevilla, Lennart Heim, and others.

Best regards,
Lennart Heim


Stanford's Existential Risk Initiative (SERI)
Center for International Security and Cooperation, Stanford University
mail: 	lennart@heim.xyz

"""

# Show the result and persist it (including the DataFrame index) for the mail-out
display(df)
df.to_csv('emails.csv')

  return func(self, *args, **kwargs)


Looking into DeepStack: Expert-Level Artificial Intelligence in No-Limit Poker
There's something wrong with extracting the text: No /Root object! - Is this really a PDF?
Looking into Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer
There's something wrong with extracting the text: No /Root object! - Is this really a PDF?
Looking into Mask R-CNN
There's something wrong with extracting the text: No /Root object! - Is this really a PDF?
Looking into MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
There's something wrong with extracting the text: No /Root object! - Is this really a PDF?
Looking into Thinking Fast and Slow with Deep Learning and Tree Search
There's something wrong with extracting the text: No /Root object! - Is this really a PDF?
Looking into Attention Is All You Need
['avaswani@google.com', 'noam@google.com', 'nikip@google.com', 'usz@google.com', 'llion@google.com', 'aidan@cs.toronto.edu', 'lukaszkaiser@goog

Unnamed: 0,Author(s),Publication date,Reference,Link,email_1,email_2,email_3,GPU,TPU,NVIDIA,email_subject,email_body
252,,2017-01-06,DeepStack: Expert-Level Artificial Intelligenc...,https://arxiv.org/abs/1701.01724,,,,,,,,
253,"N Shazeer, A Mirhoseini, K Maziarz, A Davis",2017-01-23,Outrageously Large Neural Networks: The Sparse...,https://arxiv.org/abs/1701.06538,,,,,,,,
254,"Kaiming He, Georgia Gkioxari, Piotr Dollár, Ro...",2017-03-30,Mask R-CNN,https://arxiv.org/abs/1703.06870,,,,,,,,
255,"AG Howard, M Zhu, B Chen, D Kalenichenko",2017-04-17,MobileNets: Efficient Convolutional Neural Net...,https://arxiv.org/abs/1704.04861,,,,,,,,
256,"T Anthony, Z Tian, D Barber",2017-05-23,Thinking Fast and Slow with Deep Learning and ...,https://arxiv.org/abs/1705.08439,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
746,"Bishan Yang, Scott Wen-tau Yih, Xiaodong He, J...",2020-02-13,Embedding Entities and Relations for Learning ...,https://arxiv.org/pdf/1412.6575.pdf,,,,,,,,
747,"Sheng Zhang, Xiaodong Liu, Jingjing Liu, Jianf...",2020-07-08,ReCoRD: Bridging the Gap between Human and Mac...,https://arxiv.org/pdf/1810.12885.pdf,,,,,,,,
748,"Yuhao Zhang, Peng Qi, and Christopher D Manning",2020-07-28,Graph Convolution over Pruned Depen- dency Tre...,https://arxiv.org/pdf/1809.10185.pdf,,,,,,,,
749,"Yuhao Zhang, Victor Zhong, Danqi Chen, Gabor A...",2020-11-23,Position- aware Attention and Supervised Data ...,https://nlp.stanford.edu/pubs/zhang2017tacred.pdf,"{yuhao, vzhong, danqi}@cs.stanford.edu;{angeli...",,statements.Your;statement.2;UnitedStates.Actors,,,,Trends in Machine Learning - Report your data ...,"\nDear all,\n\nWe are writing to you about you..."
