In [64]:
import pandas as pd
import numpy as np 
import time
import os
import glob
import requests


## Notebook to test and compare different prompts

In [86]:

def upload_analyze_papers(food, hazard, API_Key, question='default'):
    
    ########################## set up run ##########################
    # make an empty inital dataframe to fill in 
    all_results = pd.DataFrame(columns=['filename', 'food', 'hazard', 'gen_quote', 'impact', 'direction', 'mechanism', 'quote', 'location', 'full_answer']) 

    # specify dwn_dir (formatted: food_hazard)
    dwn_dir = os.path.join(os.getcwd(), f'{food}_{hazard}')
    # look for pds
    pdfs = glob.glob(os.path.join(dwn_dir, '*.pdf'))
    # make print to check what is going on
    print(f'analyzing {food} and {hazard}. Found {str(len(pdfs))} papers', flush=True, end=' ')
    
    
    ########################## Load papers into chatpdf ##########################
    
    ### dummy loop to only upload 12/minute (limit for free account - change if we have paid account)
    uploads = 0
    start_time = time.time()
    for file in pdfs:
        # Check if we've reached the upload limit
        if uploads >= 12:
            elapsed_time = time.time() - start_time
            if elapsed_time < 60:
                # If uploads exceed the limit in less than a minute, wait for the remaining time
                wait_time = 60 - elapsed_time
                print(f'Upload limit reached. Waiting for {wait_time:.2f} seconds...')
                time.sleep(wait_time)
            uploads = 0  # Reset the counter and start a new minute
            start_time = time.time()
        ###

        files = [('file', ('file', open(file, 'rb'), 'application/octet-stream'))]
        headers = {'x-api-key': API_Key}
        response = requests.post('https://api.chatpdf.com/v1/sources/add-file', headers=headers, files=files)
        # Update the upload counter
        uploads += 1
        ## chatpdf jargon; uploading papers
        if response.status_code == 200:
            sID = response.json()['sourceId']
        else:
            print('Status:', response.status_code)
            print('Error:', response.text)
            continue


        ########################## 'Read' papers in chatpdf ##########################
        headers = {
            'x-api-key': API_Key,
            "Content-Type": "application/json"
        }
        # the defualt question isnt really working with current dataframe setup
        if question == 'default':
            question = f'provide a quote from the text about how {hazard} impacts {food}?'
        
        # more chatpdf jargon, input question:
        data = {
            'sourceId': sID,
            'messages': [
                # set up few-shot prompting (i'm not entirely sure if we should have our same questions before this to show it how to answer or if this is sufficient)
                # code from: https://www.chatpdf.com/docs/api/backend
                {
                    'role': "assistant",
                    'content': '1a) "Understanding the responses of animals to acute heat stress can help to reveal and predict the effect of more frequent extreme hot weather episodes on animal populations and ecosystems in the content of global climate change". 2a) Indirect physiology 3a) Negative 4a) The warming negatively impacts Chinese softshell turtle through indirect physiological mechanisms. 5a) "The results suggest that the Chinese soft-shelled turtle has a strong capacity to activate its antioxidant defense system to cope with acute heat stress". 6a) Beijing, China. 7a) The impact of warming on Chinese softshell turtle via indirect physiology is negative. The study was in Beijing, China.'
                },
                # actual question
                {
                    'role': "user",
                    'content': question,
                }
            ]
        }
        response = requests.post(
            'https://api.chatpdf.com/v1/chats/message', headers=headers, json=data)

        # if results are returned
        if response.status_code == 200:
            #print('result:', response.json()['content'])
            results = response.json()['content']
            segments = results.split('a)')  # Split the text using the question format ('a)')
            #segments = results.split('\n')  # could also split it by lines - might work well if a) is bad
            #print(segments)

            # split answer into small segments to put in dataframe
            ### segments for a. 
            if len(segments) > 5:
                gen_quote = segments[1].strip()[:-3]
                impact = segments[2].strip()[:-3]
                direction = segments[3].strip()[:-3]
                mechanism = segments[4].strip()[:-3]
                quote = segments[5].strip()[:-3]
                location = segments[6].strip()[:-3]
                full_answer = segments[7].strip()[:-3]
                
                print(full_answer)
                # Create a row dictionary for this PDF
                row_data = {
                    'filename': os.path.basename(file),
                    'food': food,
                    'hazard': hazard,
                    'gen_quote': gen_quote,
                    'impact': impact,
                    'direction': direction,
                    'mechanism': mechanism,
                    'quote': quote,
                    'location': location,
                    'full_answer': full_answer
                }

                # Append the row data to the results
                all_results = pd.concat([all_results, pd.DataFrame([row_data])], ignore_index=True)

        else: #if it's baaad
            print('Status:', response.status_code)
            print('Error:', response.text)
    # Return dataframe for one pdf
    return all_results 

In [90]:
# BULK TEST with read papers

foods = ['Bay leaf','Bignose unicornfish','Button mushroom','Centropristis philadelphica','Chinese softshell turtle','Corica soborna','Ganges river sprat','Grasshopper','Large yellow croaker','Larimichthys croceus','Leucoraja erinacea','Little skate','mealworm','Naso vlamingii','Orange-spotted grouper','Ordways brotula','Pusa hispida','Ringed seal','Rock sea bass','Tenebrio molitor','Trionyx sinensis','Triticum aestivum','wheat']
hazards = ['atmospheric CO2 increases','ocean acidification','ozone','storms','warming','precipitation','precipitation','fires','warming','warming','ocean acidification','ocean acidification','warming','ocean acidification','natural cover change','floods','heatwaves','Heatwaves','storms','warming','warming','drought','drought']
API_Key = 'sec_uWPhoztZ5o1f4iiBRrudtvdWONGJKPp0' #make a user and get api key from https://www.chatpdf.com/docs/api/backend

# make an inital empty dataframe
dfall = pd.DataFrame(columns=['filename', 'food', 'hazard', 'gen_quote', 'impact', 'direction', 'mechanism', 'quote', 'location', 'full_answer']) 

# loop through food/hazard combo
for i in range(len(foods)):
    food = foods[i]
    hazard = hazards[i]

    print(food, hazard)
    # make the question (food/hazard specific)
    question = (
        'please include a) after each question, like 1a), 7a) '
        f'1a) provide a quote from the text about how {hazard} impacts {food}? '
        f'2a) please classify the impact on {food} as direct physiology, indirect abiotic, indirect biotic, or none. Respond only direct physiology, indirect abiotic, indirect biotic, or no effect'
        f'3a) does {hazard} negatively, positively or neutrally impact {food} (Respond only with negatvie, positive, neutral, or No effect)'
        f'4a) If {food} is impacted by {hazard}, can you explain through what mechanisms? else, reply Na '
        f'5a) Please provide a quote from the text about the mechanism. if there is none reply Na '
        f'6a) select where this study took place. if its a lab this experment, select lab '
        f'7a) Provide an answer in the following format: The impact of {hazard} on {food} via (select: direct physiology, indirect abiotic, '
        f'or indirect biotic) are (select: positive, negative or neutral). The study was in (location). if no effect is found reply: no effect was found'
        )
    #ruuun
    df = upload_analyze_papers(foods[i], hazards[i], API_Key, question=question)
    # combine with master dataframe
    dfall = pd.concat([dfall, df], ignore_index=True)

dfall

Bay leaf atmospheric CO2 increases
analyzing Bay leaf and atmospheric CO2 increases. Found 1 papers result: I apologize, but I cannot find the specific information you requested in the given pages. Could you please provide me with the question again?
Bignose unicornfish ocean acidification
analyzing Bignose unicornfish and ocean acidification. Found 1 papers result: 1a) "For example, the Bignose unicornfish (Naso vlamingii) exhibited a decrease in aerobic scope under elevated CO2 conditions" 
2a) Direct physiology 
3a) Negative 
4a) Ocean acidification impacts Bignose unicornfish through direct physiological mechanisms. 
5a) "The decrease in aerobic scope in Bignose unicornfish under elevated CO2 conditions suggests that the fish may have reduced capacity to perform high-intensity activities or respond to environmental stressors." 
6a) Na 
7a) The impact of ocean acidification on Bignose unicornfish via direct physiology is negative. The study did not specify a location for this partic

Unnamed: 0,filename,food,hazard,gen_quote,impact,direction,mechanism,quote,location,full_answer
0,Physiological implications of ocean acidificat...,Bignose unicornfish,ocean acidification,"""For example, the Bignose unicornfish (Naso vl...",Direct physiology,Negative,Ocean acidification impacts Bignose unicornfis...,"""The decrease in aerobic scope in Bignose unic...",Na,The impact of ocean acidification on Bignose u...
1,"Effect of Applied Ozone Dose, Time of Ozonizat...",Button mushroom,ozone,"""Mushrooms were subjected to gaseous ozone wit...",Indirect abiotic,No effect,Na,Na,Lab,No effect was fou
2,Proximity Effects of Larger Resident Fishes on...,Centropristis philadelphica,storms,The given pages do not contain information abo...,N,N,N,N,N,No effect was fou
3,Temporal distribution of fisheries in Payra Ri...,Corica soborna,precipitation,"""Exceptionally the rainfall out of the four me...",Indirect biotic,No effect,Na,Na,Payra River,The impact of precipitation on Corica soborna ...
4,Temporal distribution of fisheries in Payra Ri...,Ganges river sprat,precipitation,"""Rainfall was the most influencing driving for...",Indirect biotic,No effect mentioned,Na,Na,"Payra River, Bangladesh",No effect was fou
5,Grasshopper (Orthoptera Acrididae) communities...,Grasshopper,fires,"""Each of the three primary drivers of grasslan...",Indirect biotic,No effect,Na,Na,North American tallgrass prairies,No effect was found. The study was in North Am...
6,"Thermal tolerance, safety margins and acclimat...",Large yellow croaker,warming,"""Therefore, in the future, more efforts should...",Indirect abiotic,Negative,Large yellow croaker is impacted by warming th...,"""The results showed that chronic thermal stres...","Ningde, China",The impact of warming on Large yellow croaker ...
7,Ocean acidification and warming affectskeletal...,Leucoraja erinacea,ocean acidification,"""Ocean acidification and warming are known to ...",Indirect physiology,Negative,The impact of ocean acidification on Leucoraja...,Na,Lab,The impact of ocean acidification on Leucoraja...
8,Ocean acidification and warming affectskeletal...,Little skate,ocean acidification,"""Mineralization increased as a consequence of ...",Indirect abiotic,Negative,Ocean acidification impacts Little skate throu...,"""Mineralization affects stiffness and strength...",Lab,The impact of ocean acidification on Little sk...
9,Physiological implications of ocean acidificat...,Naso vlamingii,ocean acidification,"""For example, OA has been shown to negatively ...",Indirect biotic,Negative,Ocean acidification impacts Naso vlamingii thr...,"""OA has been shown to negatively impact the ol...",Not specified in the given pages.,The impact of ocean acidification on Naso vlam...


In [92]:
### SAVE OUTPUT as CSV (or another format)
filename = 'output_ALL.csv'
dfall.to_csv(filename)

In [52]:
#####  Single Test ######

food = 'Button mushroom'
hazard = 'ozone'
API_Key = 'sec_6LIDxgLBHBqmhkVam818PYYXqervcPSX' #make a user and get api key from https://www.chatpdf.com/docs/api/backend
#question  = f'1a) provide a quote from the text about how {hazard} impacts {food}? 2a) in what region is this study 3a) does {hazard} negatively or positively impact {food} (reply only negatvie/positive) 4a) would you classify the impact on {food} as direct physiology, indirect abiotic, or indirect biotic?  5a) can you give a quote providing an example of this?'

question = (
    f'1a) provide a quote from the text about how {hazard} impacts {food}? '
    f'2a) please classify the impact on {food} as direct physiology, indirect abiotic, indirect biotic, or none. Respond only direct physiology, indirect abiotic, indirect biotic, or no effect'
    f'3a) does {hazard} negatively, positively or neutrally impact {food} (Respond only with negatvie, positive, neutral, or No effect)'
    f'4a) If {food} is impacted by {hazard}, can you explain through what mechanisms? else, reply Na '
    f'5a) Please provide a quote from the text about the mechanism. if there is none reply Na '
    f'6a) select where this study took place. if its a lab this experment, select lab ' 
    f'7a) Provide an answer in the following format: The impact of {hazard} on {food} via (select: direct physiology, indirect abiotic, '
    f'or indirect biotic) are (select: positive, negative or neutral). The study was in (location). if no effect is found reply: no effect was found'
    )


all_results = upload_analyze_papers(food, hazard, API_Key, question=question)
all_results


found 1 paper(s)
loading file "Effect of Applied Ozone Dose, Time of Ozonization, and Storage Time on Selected Physicochemical Characteristics of Mushrooms (Agaricus bisporus).pdf" 
analyzing file...
['1', ' According to the text, "Exposure of mushrooms to ozone atmosphere does not cause large changes in quality and physical parameters of Agaricus bisporus."\n2', ' No effect.\n3', ' Neutral.\n4', ' Na.\n5', ' Na.\n6', ' The study was conducted in a lab.\n7', ' The impact of ozone on Button mushroom is no effect. The study was conducted in a lab.']
test1
test2
The impact of ozone on Button mushroom is no effect. The study was conducted in a lab.




To do for prompt test
- load papers
- get comporable output as human df
- make list/dict/df with question and dfID


To do for big tool
- find effective way to combine bibtxt csv and/or pdfs into groupings that make sense
- either append resutls here or on pypaperbot csv
- subset run to 'smaller' batches 
