## high-z arxiv exgal bot

In [2]:

def get_arxiv_exgal():
    '''
    Build the text digest of today's high-z galaxy postings on arXiv.

    Opens https://arxiv.org/list/astro-ph/new in headless Firefox, keeps the
    entries whose subject line mentions astro-ph.GA (in posting order), and
    tags each one by keyword: JWST, simulation, lensing, AGN, dust,
    overdensity, ISM and high-z.  Low-z / local phrases are stripped from the
    abstract before the keyword search so they cannot trigger a high-z match.

    Only entries flagged as high-z are reported, in three sections:
    JWST-related, non-JWST, and simulation papers.

    Returns
    -------
    str
        The full digest (intro legend + sections).  Text is returned rather
        than printed because printing won't work in datalab.

    NOTES:
    > if hidden item, get_attribute() works with "innerHTML", "innerText", "textContent"
    > relies on check_lowz() defined alongside this function
    > needs a selenium-compatible Firefox install -- TODO confirm geckodriver setup
    '''

    # function-scope imports: they are only resolved when the function is called
    from itertools import zip_longest
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.firefox.options import Options
    import pandas as pd

    __author__ = 'Taylor Hutchison'
    __email__ = 'astro.hutchison@gmail.com'

    # Initialize FirefoxOptions
    firefox_options = Options()
    firefox_options.add_argument("--headless")

    # columns of the table of kept papers
    columns = ['order', 'id', 'pri_category', 'mathjax', 'jwst', 'sim',
               'lensing', 'agn', 'dust', 'overdense', 'ism', 'z', 'title']

    # flags for identifying highz work
    highz_flags = [r'z\sim',r'z \sim','z~','z ~',r' z ',r' z,',r'z=',r'z =',r'z \leq',
                   r'z\leq',r'z\geq',r'z \geq',r'z \lesssim',r'z\lesssim',
                   r'z \gtrsim',r'z\gtrsim',r'z_',r'z _',r'z>',r'z >',r'z<',r'z <',
                   'high-z','high z ','high-redshift','high redshift','cosmic dawn',
                   'cosmic noon','High-Redshift']

    # flags for simulations
    sim_flags = ['cosmological simulation',' TNG',' EAGLE',
                 'numerical simulation','macroscopic simulation',
                 'radiative transfer','hydrodynamical equation']

    # flags for gravitational lensing
    grav_flags = ['gravitational lensing','gravitational-lensing','lensing','magnified']

    # flags for AGN
    # (the comma after 'direct collapse' matters: without it Python fuses the
    #  two neighbouring literals into a single keyword that never matches)
    agn_flags = ['AGN','SMBH','active galactic nuclei','quasar','QSO','direct collapse',
                 'radio jet','accretion disk','black hole','Black Hole']

    # flags for dust
    dusty_flags = ['dust grain','dust','polycyclic aromatic hydrocarbons',
                   'submillimeter','submillimetre','ALMA','obscured','attenuated']

    # flags for galaxy clusters
    cluster_flags = ['protocluster','galaxy cluster','overdensity','LSS','large scale structure',
                     'galaxy environment','field counterpart','BCG','virialization']

    # flags for ISM-related things
    ism_flags = ['nebular','density','metallicity','temperature',
                 'ionization','abundance','emission','auroral',
                 'O/H','C/O','N/O','Ar/O']

    # flags for identifying lowz & local work
    # I add more as I find them
    # (order matters: phrases are stripped one after another, so longer
    #  phrases must come before the shorter ones they contain)
    local_things = ['differential radial velocities','differential radial velocity',
                    'radial velocities','radial velocity',' AU.', ' AU,',' AU ',
                    ' RV ','(RV)','(RVs)',' RVs ',' RV,','Galactic','LPV',
                    'Large Magellanic Cloud','LMC','Milky Way',' NGC ','M33',
                    'NGC ','polarisation','Polarisation','SDSS MaNGA','Hubble constant',
                    'Nova Remnant','nova remnant','protoplanetary','exoplanet','ULX']
    lowz_things = ['z < 1 ', 'z < 1.', r'z \sim 1.', 'z < 0.',
                   'z<1 ', 'z<1.', r'z\sim1.', 'z<0.',
                   'low-redshift','low redshift','low-z','low z',
                   'redshift < 1.','redshift < 1 ','redshift < 0.',
                   'redshift = 1.','redshift = 1 ','redshift = 0.',
                   'redshift of 1.','redshift of 1 ','redshift of 0.','redshift range 0.',
                   'Redshift < 1.','Redshift < 1 ','Redshift < 0.',
                   'Redshift = 1.','Redshift = 1 ','Redshift = 0.']

    # ---------------------------- #
    # -- scraping the listing   -- #
    # ---------------------------- #
    rows = []
    driver = webdriver.Firefox(options=firefox_options)
    try:
        # the export.arxiv.org mirror was previously used here
        driver.get("https://arxiv.org/list/astro-ph/new") # until they fix export arxiv

        # just pulling out the date part of the heading; if the heading has
        # no ', ' separator, fall back to the whole heading instead of crashing
        heading = driver.find_element(By.TAG_NAME,'h3').text
        heading_parts = heading.split(', ')
        date = heading_parts[1] if len(heading_parts) > 1 else heading

        # every listing entry is a <dt> (arXiv ID) paired with a <dd> (meta info)
        posts = driver.find_element(By.ID,'articles')
        items = posts.find_elements(By.TAG_NAME,'dt')
        meta_infos = posts.find_elements(By.TAG_NAME,'dd')

        for i,(item,meta_block) in enumerate(zip(items,meta_infos)):
            meta_info = meta_block.find_element(By.CLASS_NAME,'meta')

            # category, could be >1
            # if Galaxies isn't mentioned at all, we skip this entry
            categories = meta_info.find_element(By.CLASS_NAME,'list-subjects')
            if 'astro-ph.GA' not in categories.text:
                continue
            primary_category = categories.find_element(By.CLASS_NAME,'primary-subject').text

            # arXiv number
            arxiv_id = str(item.text.split('arXiv:')[1].split(' [')[0])

            # title
            title = meta_info.find_element(By.CLASS_NAME,'list-title.mathjax').text
            title = title.replace('\n',' ') # just in case there's "new lines"

            # abstract: the visible text is split up where the mathjax is missing
            abstract_loc = meta_info.find_element(By.TAG_NAME,'p')
            abstract_split = abstract_loc.text.split('\n')
            new_abstract = abstract_split[0] # if no mathjax, this is full abstract
            mathjax_flag = len(abstract_split) > 1

            if mathjax_flag:
                # manually adding the pre-mathified text back in, because
                # slack won't convert it anyway
                mathjax_locs = abstract_loc.find_elements(By.TAG_NAME,'script')
                mathjax = [m.get_attribute("textContent") for m in mathjax_locs]
                # zip_longest keeps whichever side is longer, so a mismatch
                # between script nodes and text pieces neither raises nor
                # drops the tail of the abstract
                for math_text,piece in zip_longest(mathjax,abstract_split[1:]):
                    new_abstract += ' ' + (math_text or '') + (piece or '')

            # checking for JWST in title & abstract (before anything is stripped)
            jwst_flag = 'jwst' in title.lower() or 'jwst' in new_abstract.lower()

            # removing local & z < 1 language from the abstract, so that what
            # is left can be searched for high-z language
            for thing in local_things + lowz_things:
                _,new_abstract = check_lowz(new_abstract,thing)

            # after removing the lowz language, checking if a z is still there
            redshift = any(flag in new_abstract for flag in highz_flags)

            # checking if simulation paper, and not just an abstract that
            # mentions comparisons to sims: a flag in the title counts on its
            # own, a flag in the abstract needs "simulation" at least twice
            sims = any(flag in title for flag in sim_flags) or (
                any(flag in new_abstract for flag in sim_flags)
                and new_abstract.count('simulation') >= 2)

            # topical flags, all searched for in the (stripped) abstract
            lensing = any(flag in new_abstract for flag in grav_flags)
            agn = any(flag in new_abstract for flag in agn_flags)
            dust = any(flag in new_abstract for flag in dusty_flags)
            overdensity = any(flag in new_abstract for flag in cluster_flags)
            ism = any(flag in new_abstract for flag in ism_flags)

            # same order as `columns`
            rows.append([i, arxiv_id, primary_category, mathjax_flag,
                         jwst_flag, sims, lensing, agn, dust, overdensity, ism,
                         redshift, title])
    finally:
        # always shut the browser down, even if the scrape failed part-way
        driver.quit()

    # ------------------------ #
    # -- creating dataframe -- #
    # ------------------------ #
    df = pd.DataFrame(rows, columns=columns)
    is_jwst = df['jwst'].astype(bool)
    is_sim = df['sim'].astype(bool)
    is_highz = df['z'].astype(bool)

    jwst_high_z = df[is_highz & ~is_sim & is_jwst]
    high_z = df[is_highz & ~is_sim & ~is_jwst]
    high_z_sim = df[is_highz & is_sim]

    # BIG OUTPUT TEXT
    # (because printing won't work in datalab)
    output_parts = [f'Pulling high-z + galaxy arXiv postings for {date}.\n\n'
                    + 'Lensing papers marked with ***\n'
                    + 'AGN papers marked with ooo\n'
                    + 'Dusty papers marked with @@@\n'
                    + 'Overdensity papers marked with {{{\n'
                    + 'ISM-related papers marked with !!!'
                    + '\n\n\n']

    # headers are reproduced exactly as before (including trailing spaces)
    sections = [
        ('=======================\n'
         'JWST-related high-z papers:\n'
         '=======================\n', jwst_high_z),
        ('\n'
         '====================\n'
         'non-JWST high-z papers:\n'
         '====================\n', high_z),
        ('\n'
         '====================   \n'
         'high-z simulation papers:\n'
         '====================\n', high_z_sim),
    ]
    for header,papers in sections:
        if not papers.empty:
            output_parts.append(_format_section(header,papers))

    # returning the "big output" string
    return ''.join(output_parts)


def _format_section(header, papers):
    '''
    Render one section of the digest: the header, then two lines per paper
    (topic markers + title, and the arxiv.org/abs link), then a blank line.

    `papers` is a DataFrame with at least the columns lensing, agn, dust,
    overdense, ism, title and id.
    '''
    marker_columns = (('lensing','***'), ('agn','ooo'), ('dust','@@@'),
                      ('overdense','{{{'), ('ism','!!!'))
    parts = [header]
    for paper in papers.to_dict('records'):
        markers = [symbol for column,symbol in marker_columns if paper[column]]
        if markers:
            # markers are space separated, then a '/' separator before the title
            parts.append(' '.join(markers) + ' / ')
        parts.append(f"{paper['title']}\n")
        parts.append(f"arxiv.org/abs/{paper['id']}\n\n")
    return ''.join(parts)



# checking if lowz flags are in there
def check_lowz(abstract,key):
    '''
    Look for one low-z / local keyword in an abstract and blank it out.

    Parameters
    ----------
    abstract : str
        Text to search.  Anything that is not a string is passed through
        untouched, so callers can always unpack the result.
    key : str
        Exact (case-sensitive) phrase to look for.  An empty key counts as
        "not found".

    Returns
    -------
    (int, str)
        1 if `key` occurs in `abstract` (else 0), and the abstract with every
        occurrence of `key` replaced by a single space.
    '''
    if not isinstance(abstract, str) or not key:
        return 0, abstract
    return int(key in abstract), abstract.replace(key, ' ')
        

Error: Failed to execute this cell, please try again.

## the code for Slack

In [3]:
%%capture
!pip install selenium

In [4]:
import os
from slack_sdk import WebClient
from datetime import datetime as dt
from zoneinfo import ZoneInfo
tzinfo = ZoneInfo("America/New_York") # set to new york for day check

# arxiv holidays (no posting)
# need to update some of these dates yearly!
holidays = ['Jan 01','Jun 19','Jan 19','Dec 25']


# ARXIV DAILY POSTING
# --------------------
# one timestamp for both values, so the weekday and the date can never come
# from opposite sides of midnight
now = dt.now(tz=tzinfo)
day = now.strftime('%A')
date = now.strftime('%b %d')

# if it's Friday or Saturday (New York time), or a listed holiday, no posting
if day not in ('Friday', 'Saturday') and date not in holidays:
    message = "Here's the exgal arxiv postings for today!\n\n"
    message += get_arxiv_exgal()

    # Set up a WebClient with the Slack OAuth token
    # (read from the SLACK_BOT_TOKEN environment variable; KeyError if unset)
    client = WebClient(token=os.environ["SLACK_BOT_TOKEN"])

    # Send a message
    client.chat_postMessage(
        channel="astro-ph",
        # channel="testing-arxiv", # when testing
        text=message,
        username="astro-ph-exgal"
    )

In [5]:
# import os, subprocess
# from slack_bolt import App
# from slack_bolt.adapter.socket_mode import SocketModeHandler

# # Initializes your app with your bot token and socket mode handler
# app = App(token=os.environ.get("SLACK_BOT_TOKEN"))

# # Listens to incoming messages that contain "hello"
# # To learn available listener arguments,
# # visit https://docs.slack.dev/tools/bolt-python/reference/kwargs_injection/args.html
# @app.message("hello")
# def message_hello(message, say):
#     # say() sends a message to the channel where the event was triggered
#     say(f"Hey there <@{message['user']}>!")


# @app.event("message")
# def handle_message_events(body, logger):
#     logger.info(body)


# # running the arxiv call
# @app.event("app_mention")
# def call_arxiv(say):
#     say(f"Here's the arxiv posting for today!")

#     # result = subprocess.check_output('python3 access-arxiv.py', shell=True, text=True)
#     result = get_arxiv_exgal() # a function instead of a separate script
#     say(result)


    

# # Start your app
# if __name__ == "__main__":
#     SocketModeHandler(app, os.environ["SLACK_APP_TOKEN"]).start()