## function to download and analyse papers from google scholar
- This function will do a google scholar search based on a query (food + hazard)
- It will attempt to download the first 20 results (mostly through scihub)
- Then ChatPDF will check through each pdf and answer your question and find quotes within each pdf

You need
1. To install PyPaperBot (you can do this in an Anaconda terminal by typing *pip install PyPaperBot*).
2. A ChatPDF API key. You can get one by creating a ChatPDF user and clicking *My Account* on the ChatPDF API website: https://www.chatpdf.com/docs/api/backend


In [None]:
!pip install PyPaperBot

In [None]:

import requests
import glob
import PyPaperBot
from PyPaperBot import __main__ as p
import os
import time
import pandas as pd
import re

In [None]:
# FUNCTIONS to download and review the papers

def download_papers(food, hazard, scholar_pages=None, scholar_results=20, skip_if_folder_exists=True):
    """Download the papers for a food/hazard query and return the PDF paths.

    Papers are stored in a folder named ``<food>_<hazard>`` inside the current
    working directory.

    Args:
        food: Food term of the query; also the first part of the folder name.
        hazard: Hazard term of the query; also the second part of the folder name.
        scholar_pages: Google Scholar pages handed to PyPaperBot.
            Defaults to ``[1, 2]``.
        scholar_results: Number of results handed to PyPaperBot.
        skip_if_folder_exists: When true and the folder already exists, nothing
            is downloaded and the PDFs already in the folder are returned.

    Returns:
        List of paths of the ``*.pdf`` files found in the download folder.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if scholar_pages is None:
        scholar_pages = [1, 2]

    folder_name = f'{food}_{hazard}'
    dwn_dir = os.path.join(os.getcwd(), folder_name)
    pdf_pattern = os.path.join(dwn_dir, '*.pdf')

    # Re-use a previous download instead of querying Google Scholar again.
    if os.path.exists(dwn_dir) and skip_if_folder_exists:
        pdfs = glob.glob(pdf_pattern)
        print('found %d papers in the folder "%s"' % (len(pdfs), folder_name))
        return pdfs

    # exist_ok=True: the folder may already exist when the caller asked for a
    # re-download (skip_if_folder_exists=False); a plain os.mkdir would raise
    # FileExistsError in that case.
    os.makedirs(dwn_dir, exist_ok=True)

    query = f'{food}+{hazard}'
    p.start(query=query, scholar_pages=scholar_pages, scholar_results=scholar_results, dwn_dir=dwn_dir, proxy=[])

    pdfs = glob.glob(pdf_pattern)
    # NOTE(review): scholar_results looks like a per-page count, so the
    # denominator may understate the number of attempted downloads - confirm.
    print('downloaded %d of %d papers' % (len(pdfs), scholar_results))
    return pdfs


def upload_analyze_papers(food, hazard, pdfs, API_Key, question='default'):
    """Upload each PDF to ChatPDF, ask one question, and tabulate the answers.

    The answer is expected to contain the numbered parts ``1)`` to ``4)``
    (quote, location, page, paragraph), matching the default question.
    Answers that cannot be split into those four parts are reported and
    produce no row.

    Args:
        food: Food term; used in the default question and stored in each row.
        hazard: Hazard term; used in the default question and stored in each row.
        pdfs: Iterable of PDF file paths to upload.
        API_Key: ChatPDF API key sent in the ``x-api-key`` header.
        question: Question to ask about every PDF, or ``'default'`` for the
            built-in four-part question.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``food``, ``hazard``,
        ``quote``, ``location``, ``page`` and ``paragraph``; one row per PDF
        whose answer could be parsed.
    """
    columns = ['filename', 'food', 'hazard', 'quote', 'location', 'page', 'paragraph']

    # The question is the same for every file, so build it once.
    if question == 'default':
        question = f'can you provide 1) a quote from the text about how {hazard} impacts {food}? 2) the location this study takes place 3) the page where this quote is found 4) the paragraph where this quote is found'

    upload_headers = {'x-api-key': API_Key}
    chat_headers = {
        'x-api-key': API_Key,
        "Content-Type": "application/json"
    }

    rows = []
    for file in pdfs:
        filename = os.path.basename(file)
        print(f'loading file "{filename}" ')

        # Load the paper into ChatPDF; the context manager closes the file
        # handle once the upload request has finished.
        with open(file, 'rb') as pdf_handle:
            files = [('file', ('file', pdf_handle, 'application/octet-stream'))]
            response = requests.post('https://api.chatpdf.com/v1/sources/add-file', headers=upload_headers, files=files)

        if response.status_code != 200:
            print('Status:', response.status_code)
            print('Error:', response.text)
            continue
        sID = response.json()['sourceId']

        # 'Read' the paper in ChatPDF.
        print('analyzing file...')
        data = {
            'sourceId': sID,
            'messages': [
                {
                    'role': "user",
                    'content': question,
                }
            ]
        }
        response = requests.post(
            'https://api.chatpdf.com/v1/chats/message', headers=chat_headers, json=data)

        if response.status_code == 200:
            content = response.json()['content']
            print('result:', content)

            # Split on the markers "1)".."4)". The look-behind keeps citation
            # years such as "(2014)" from being mistaken for a marker.
            segments = re.split(r'(?<![\w(])[1-4]\)', content)

            # segments[0] is whatever precedes the "1)" marker (usually empty
            # or a preamble), so the four answers are segments[1] to segments[4].
            if len(segments) >= 5:
                quote, location, page, paragraph = (part.strip() for part in segments[1:5])
                rows.append({
                    'filename': filename,
                    'food': food,
                    'hazard': hazard,
                    'quote': quote,
                    'location': location,
                    'page': page,
                    'paragraph': paragraph
                })
            else:
                print('could not split the answer into 4 numbered parts; no row added')
        else:
            print('Status:', response.status_code)
            print('Error:', response.text)
        print("\n")

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=columns)

def download_read_export(food, hazard, API_Key, scholar_pages=None, scholar_results=20, question='default', skip_if_folder_exists=True):
    """Download the papers for one food/hazard pair and analyse them with ChatPDF.

    Args:
        food: Food term of the query.
        hazard: Hazard term of the query.
        API_Key: ChatPDF API key.
        scholar_pages: Google Scholar pages to search. Defaults to ``[1, 2]``.
        scholar_results: Number of results passed to the downloader.
        question: Question asked about every paper, or ``'default'``.
        skip_if_folder_exists: Passed to ``download_papers``; when true an
            existing download folder is re-used.

    Returns:
        The DataFrame produced by ``upload_analyze_papers``.
    """
    # Resolve the default here so a concrete list is always passed on.
    if scholar_pages is None:
        scholar_pages = [1, 2]

    start_time = time.time()  # start timer

    pdfs = download_papers(food, hazard, scholar_pages, scholar_results, skip_if_folder_exists)
    # Forward the caller's question instead of always asking the default one.
    df = upload_analyze_papers(food, hazard, pdfs, API_Key, question=question)

    # Report the timing before returning; it used to sit after the return
    # statement and was never executed.
    elapsed_time = time.time() - start_time
    print(f'Time taken for "{food}" and "{hazard}":{elapsed_time:.2f} seconds')
    return df



### Download and "read" the papers
- Change the foods and hazards to any combination and provide your API_Key.
- To change the number of Google Scholar pages or the number of results to download, adjust the `scholar_pages` and `scholar_results` arguments.
- You can also specify a question to ask; otherwise the default question asks for a quote about how {hazard} impacts {food}, the location of the study, and the page and paragraph where the quote is found.



In [None]:
# --- Settings ---------------------------------------------------------------
foods = ['wheat', 'rice', 'tuna', 'apple', 'coffee']
hazards = ['drought', 'heatwave', 'warming', 'storm', 'flooding']

# Do not hard-code the ChatPDF key in a shared notebook: read it from the
# environment. Make a user and get an API key from
# https://www.chatpdf.com/docs/api/backend
API_Key = os.environ.get('CHATPDF_API_KEY')
if not API_Key:
    raise RuntimeError('Set the CHATPDF_API_KEY environment variable to your ChatPDF API key')

scholar_pages = [1, 2]
scholar_results = 20
# Specify a question in quotes, or keep 'default' to ask for a quote about how
# {hazard} impacts {food}, the study location, and the page and paragraph of
# the quote.
question = 'default'
# Set to False to download again even if the folder already exists. Default: True.
skip_if_folder_exists = True

# Start from an empty frame so the merged table has the expected columns even
# when nothing could be analysed.
result_columns = ['filename', 'food', 'hazard', 'quote', 'location', 'page', 'paragraph']
frames = [pd.DataFrame(columns=result_columns)]

## Run function across combinations of food and hazard
for food in foods:
    for hazard in hazards:
        # Pass the settings defined above rather than repeating literal values.
        df = download_read_export(
            food,
            hazard,
            API_Key,
            scholar_pages=scholar_pages,
            scholar_results=scholar_results,
            question=question,
            skip_if_folder_exists=skip_if_folder_exists,
        )
        frames.append(df)

# Merge once at the end instead of growing the frame inside the loop.
dfall = pd.concat(frames, ignore_index=True)

# Print the merged dataframe
print(dfall)

FileExistsError: ignored

In [None]:
# Notebook cell: evaluating the bare name shows the merged results table as the cell output.
dfall

In [None]:
#####
# food systems # climate change
# pypaperbot download papers; chatpdf read papers
# abm Oct 2023



import requests
import glob
import PyPaperBot
from PyPaperBot import __main__ as p
import os
import time
import pandas as pd
import re

# FUNCTIONS to download and review the papers

def download_papers(food, hazard, scholar_pages=None, scholar_results=20, skip_if_folder_exists=True):
    """Fetch the papers for ``food`` + ``hazard`` and return the PDF paths.

    The download folder is ``<food>_<hazard>`` below the current working
    directory.

    Args:
        food: Food term of the query and first part of the folder name.
        hazard: Hazard term of the query and second part of the folder name.
        scholar_pages: Google Scholar pages handed to PyPaperBot.
            Defaults to ``[1, 2]``.
        scholar_results: Number of results handed to PyPaperBot.
        skip_if_folder_exists: When true and the folder is already present,
            the download is skipped and the PDFs in the folder are returned.

    Returns:
        List of paths of the ``*.pdf`` files in the download folder.
    """
    # Avoid a mutable default argument shared between calls.
    if scholar_pages is None:
        scholar_pages = [1, 2]

    folder_name = f'{food}_{hazard}'
    dwn_dir = os.path.join(os.getcwd(), folder_name)
    pdf_pattern = os.path.join(dwn_dir, '*.pdf')

    # A previous run already filled the folder: just list what is there.
    if os.path.exists(dwn_dir) and skip_if_folder_exists:
        pdfs = glob.glob(pdf_pattern)
        print('found %d papers in the folder "%s"' % (len(pdfs), folder_name))
        return pdfs

    # The folder can already exist when a re-download was requested
    # (skip_if_folder_exists=False); os.mkdir would raise FileExistsError.
    os.makedirs(dwn_dir, exist_ok=True)

    query = f'{food}+{hazard}'
    p.start(query=query, scholar_pages=scholar_pages, scholar_results=scholar_results, dwn_dir=dwn_dir, proxy=[])

    pdfs = glob.glob(pdf_pattern)
    # NOTE(review): scholar_results looks like a per-page count, so the
    # denominator may understate the number of attempted downloads - confirm.
    print('downloaded %d of %d papers' % (len(pdfs), scholar_results))
    return pdfs


def upload_analyze_papers(food, hazard, pdfs, API_Key, question='default'):
    """Upload each PDF to ChatPDF, ask one question, and tabulate the answers.

    Uploads are throttled to 12 per 60-second window (the original author's
    note gives this as the free-plan limit). The answer is expected to
    contain the markers ``1a)`` to ``4a)`` (quote, region, positive/negative,
    how), matching the default question. Answers that cannot be split into
    those four parts are reported and produce no row.

    Args:
        food: Food term; used in the default question and stored in each row.
        hazard: Hazard term; used in the default question and stored in each row.
        pdfs: Iterable of PDF file paths to upload.
        API_Key: ChatPDF API key sent in the ``x-api-key`` header.
        question: Question to ask about every PDF, or ``'default'`` for the
            built-in four-part question.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``food``, ``hazard``,
        ``quote``, ``location``, ``pos/neg`` and ``how``; one row per PDF
        whose answer could be parsed.
    """
    columns = ['filename', 'food', 'hazard', 'quote', 'location', 'pos/neg', 'how']
    max_uploads_per_window = 12
    window_seconds = 60

    # The question is the same for every file, so build it once.
    if question == 'default':
        question = f'1a) provide a quote from the text about how {hazard} impacts {food}? 2a) in what region is this study 3a) does {hazard} negatively or positively impact {food} (reply only negatvie/positive) 4a) exactly how is {food} impacted?'

    upload_headers = {'x-api-key': API_Key}
    chat_headers = {
        'x-api-key': API_Key,
        "Content-Type": "application/json"
    }

    rows = []
    uploads = 0
    start_time = time.time()

    for file in pdfs:
        # Once the window's quota is used up, sleep until the window ends and
        # then start a new window.
        if uploads >= max_uploads_per_window:
            elapsed_time = time.time() - start_time
            if elapsed_time < window_seconds:
                wait_time = window_seconds - elapsed_time
                print(f'Upload limit reached. Waiting for {wait_time:.2f} seconds...')
                time.sleep(wait_time)
            uploads = 0
            start_time = time.time()

        filename = os.path.basename(file)
        print(f'loading file "{filename}" ')

        # The context manager closes the file handle once the upload is done.
        with open(file, 'rb') as pdf_handle:
            files = [('file', ('file', pdf_handle, 'application/octet-stream'))]
            response = requests.post('https://api.chatpdf.com/v1/sources/add-file', headers=upload_headers, files=files)

        uploads += 1  # failed uploads are counted as well

        if response.status_code != 200:
            print('Status:', response.status_code)
            print('Error:', response.text)
            continue
        sID = response.json()['sourceId']

        # 'Read' the paper in ChatPDF: use the uploaded source as input for the question.
        print('analyzing file...')
        data = {
            'sourceId': sID,
            'messages': [
                {
                    'role': "user",
                    'content': question,
                }
            ]
        }
        response = requests.post(
            'https://api.chatpdf.com/v1/chats/message', headers=chat_headers, json=data)

        if response.status_code == 200:
            content = response.json()['content']
            print('result:', content)

            # Split on the full markers "1a)".."4a)". Splitting on the bare
            # text 'a)' left the marker digit in the answers and also cut
            # through ordinary words ending in "a)" such as "(data)".
            segments = re.split(r'(?<![\w(])[1-4]a\)', content)

            # segments[0] is whatever precedes the "1a)" marker, so the four
            # answers are segments[1] to segments[4].
            if len(segments) >= 5:
                quote, location, pos_neg, how = (part.strip() for part in segments[1:5])
                rows.append({
                    'filename': filename,
                    'food': food,
                    'hazard': hazard,
                    'quote': quote,
                    'location': location,
                    'pos/neg': pos_neg,
                    'how': how
                })
            else:
                print('could not split the answer into 4 marked parts; no row added')
        else:
            print('Status:', response.status_code)
            print('Error:', response.text)
        print("\n")

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=columns)

def download_read_export(food, hazard, API_Key, scholar_pages=None, scholar_results=20, question='default', skip_if_folder_exists=True):
    """Run the download and the ChatPDF analysis for one food/hazard pair.

    Args:
        food: Food term of the query.
        hazard: Hazard term of the query.
        API_Key: ChatPDF API key.
        scholar_pages: Google Scholar pages to search. Defaults to ``[1, 2]``.
        scholar_results: Number of results passed to the downloader.
        question: Question asked about every paper, or ``'default'``.
        skip_if_folder_exists: Passed to ``download_papers``; when true an
            existing download folder is re-used.

    Returns:
        The DataFrame produced by ``upload_analyze_papers``.
    """
    # Resolve the default here so a concrete list is always passed on.
    if scholar_pages is None:
        scholar_pages = [1, 2]

    start_time = time.time()  # start timer

    pdfs = download_papers(food, hazard, scholar_pages, scholar_results, skip_if_folder_exists)
    # Forward the caller's question; it used to be replaced by 'default'.
    df = upload_analyze_papers(food, hazard, pdfs, API_Key, question=question)

    # The timing report used to sit after the return statement and never ran.
    elapsed_time = time.time() - start_time
    print(f'Time taken for "{food}" and "{hazard}":{elapsed_time:.2f} seconds')
    return df




In [None]:
############# Run it

# --- Settings ---------------------------------------------------------------
# Other foods to try: 'rice', 'tuna', 'apple', 'coffee'
foods = ['wheat']
# Other hazards to try: 'drought', 'warming', 'storm', 'flooding'
hazards = ['heatwave']

# Do not hard-code the ChatPDF key in a shared notebook: read it from the
# environment. Make a user and get an API key from
# https://www.chatpdf.com/docs/api/backend
API_Key = os.environ.get('CHATPDF_API_KEY')
if not API_Key:
    raise RuntimeError('Set the CHATPDF_API_KEY environment variable to your ChatPDF API key')

scholar_pages = [1, 2]
scholar_results = 5
# Specify a question in quotes, or keep 'default' to use the built-in
# four-part question (quote, region, positive/negative impact, how).
question = 'default'
# Set to False to download again even if the folder already exists. Default: True.
skip_if_folder_exists = True

# Start from an empty frame so the merged table has the expected columns even
# when nothing could be analysed.
result_columns = ['filename', 'food', 'hazard', 'quote', 'location', 'pos/neg', 'how']
frames = [pd.DataFrame(columns=result_columns)]

## Run function across combinations of food and hazard
for food in foods:
    for hazard in hazards:
        # Pass the skip flag defined above rather than a literal True.
        df = download_read_export(
            food,
            hazard,
            API_Key,
            scholar_pages=scholar_pages,
            scholar_results=scholar_results,
            question=question,
            skip_if_folder_exists=skip_if_folder_exists,
        )
        frames.append(df)

# Merge once at the end instead of growing the frame inside the loop.
dfall = pd.concat(frames, ignore_index=True)

# Print the merged dataframe and save it next to the notebook
print(dfall)
path = os.path.join(os.getcwd(), 'dfall.csv')
dfall.to_csv(path)


Query: wheat+heatwave

Google Scholar page 1 : 5 papers found
Searching paper 1 of 5 on Crossref...
Searching paper 2 of 5 on Crossref...
Searching paper 3 of 5 on Crossref...
Searching paper 4 of 5 on Crossref...
Searching paper 5 of 5 on Crossref...
Papers found on Crossref: 5/5


Google Scholar page 2 : 5 papers found
Searching paper 1 of 5 on Crossref...
Searching paper 2 of 5 on Crossref...
Searching paper 3 of 5 on Crossref...
Searching paper 4 of 5 on Crossref...
Searching paper 5 of 5 on Crossref...
Papers found on Crossref: 5/5


Using https://sci-hub.ee as Sci-Hub instance
Download 1 of 10 -> The spatial-temporal patterns of heatwave hazard impacts on wheat in northern China under extreme climate scenarios
Download 2 of 10 -> Substantial increase of compound droughts and heatwaves in wheat growing seasons worldwide
Download 3 of 10 -> Can N management affect the magnitude of yield loss due to heat waves in wheat and maize?
Download 4 of 10 -> Wheat yield loss attributable to 