In [1]:
# --- Environment setup: load every library the notebook relies on ---
print('Setting up environment...')

# standard library
import os

# third-party
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# --- Environment ready ---
print('Done!')

Setting up environment...
Done!


In [2]:
# --- Display configuration: controls how pandas renders output in this notebook ---
print('Setting display options...')

import warnings

# Suppress all warnings for the remainder of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings that may
# point at real problems; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Render floats without decimals.
# NOTE(review): any fractional value is rounded in the displayed output
# (e.g. 0.4 shows as 0) -- confirm this is intended for the cost columns.
pd.set_option('display.float_format', lambda x: '%.0f' % x)

# One explicit cap for rows, columns and column width. Each option is set
# exactly once so there is no earlier value that gets silently overwritten.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 999)

# --- Display configuration applied ---
print('Done!')

Setting display options...
Done!


In [3]:
# --- Database connection: prompt for credentials and build the engine ---
print('Establishing database connection...')

# Both modules ship with the Python standard library, so no installation step
# or ImportError fallback is needed (a swallowed ImportError would only have
# deferred the failure to a NameError at the getpass() call below).
from getpass import getpass
from urllib.parse import quote

# Connection target (host, port, database name) for the PostgreSQL engine.
DB_HOST = 'localhost'
DB_PORT = 7005
DB_NAME = 'drc'

# Insert your username into the textbox and hit enter
username = input("Please enter your username: ")

# Insert your password into the textbox and hit enter
password = getpass("Please enter your password: ")

# Percent-encode the credentials before embedding them in the URL: characters
# such as '@', ':', '/' or '%' in a password would otherwise be parsed as URL
# delimiters and corrupt the connection string.
db_postgres = create_engine(
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# --- Engine created (create_engine is lazy; no connection is opened yet) ---
print('Done!')

Establishing database connection...


Please enter your username:  
Please enter your password:  ········


Done!


In [4]:
# Load the spend report rows whose date falls inside the reporting window
# (BETWEEN is inclusive on both ends).
print("Querying database...")

from datetime import datetime
from sqlalchemy import text

# Reporting window, as ISO dates (YYYY-MM-DD).
start_rpt_date = '2024-03-01'
end_rpt_date = '2024-03-29'

# Fail fast on a mistyped or reversed window instead of sending it to the
# database and silently getting an error or an empty result back.
_window_start = datetime.strptime(start_rpt_date, '%Y-%m-%d')
_window_end = datetime.strptime(end_rpt_date, '%Y-%m-%d')
if _window_start > _window_end:
    raise ValueError(
        f"start_rpt_date ({start_rpt_date}) is after end_rpt_date ({end_rpt_date})"
    )

# The dates are passed as bound parameters rather than interpolated into the
# SQL text, so the statement itself never changes with the inputs.
query = '''
SELECT * FROM nih_dm.v_workspace_spend_report s
    WHERE s.date between :start_rpt_date and :end_rpt_date
'''

df = pd.read_sql(
    text(query),
    db_postgres,
    params={'start_rpt_date': start_rpt_date, 'end_rpt_date': end_rpt_date},
)

print('Done!')

Querying database...
Done!


In [5]:
# Sanity check of the query result: report the row count, then show the
# first rows via the cell's rich display.
print(f"{df.shape[0]} rows returned.", "Data preview:", sep="\n")
df.head()

421822 rows returned.
Data preview:


Unnamed: 0,date,data_use_agreement_date,workspace_id,numerical_id,daily_cost,users_access_tier,active_researcher_registration_status,dua_user,dua_institution,inst_name,inst_category,inst_diversity,user_role,creation_time,usr_workspace_count
0,2024-03-04,2023-09-12 17:35:12,3664,21408063124554001373,0,"controlled,registered",Controlled Tier,2023-09-12 17:35:12,Complete,Bryant University,Academic Institution,,Mid Career,2022-06-09 15:47:50,Created >= 1 Workspaces
1,2024-03-03,2024-03-01 02:34:20,7209,45422598259695748910,0,"controlled,registered",Controlled Tier,2024-03-01 02:34:20,,"University of Tennessee Health Science Center, Memphis",Other,,Early Career,NaT,
2,2024-03-02,2023-11-02 16:44:18,12759,59898552867503916742,0,"controlled,registered",Controlled Tier,2023-11-02 16:44:18,,University of Virginia,Other,,Early Career,NaT,
3,2024-03-05,2022-09-25 13:56:37,1825,10385878304934508873,0,,,2022-09-25 13:56:37,,Georgia Institute of Technology,Other,,Trainee,NaT,
4,2024-03-02,2020-04-28 20:00:54,2389,81491810404224884430,0,"controlled,registered",Controlled Tier,NaT,,All of Us Program Operational Use,Other,,Other,2021-12-18 03:21:17,Created >= 1 Workspaces


In [6]:
# Largest daily_cost for each (date, numerical_id) pair, returned as a flat
# table with the aggregated column named max_spend.
# NOTE(review): the grouping includes the date, so this is a per-day maximum
# for each numerical_id, not one overall maximum per ID -- confirm that this
# is the intended meaning.
max_spend = (
    df.groupby(['date', 'numerical_id'])['daily_cost']
    .max()
    .rename('max_spend')
    .reset_index()
)
max_spend

Unnamed: 0,date,numerical_id,max_spend
0,2024-03-01,10001415875980445692,0
1,2024-03-01,10002456310510382955,0
2,2024-03-01,10003113056703111809,0
3,2024-03-01,10008166313049040267,0
4,2024-03-01,10008735167713850186,0
...,...,...,...
168445,2024-03-29,99924648688265641191,0
168446,2024-03-29,99931217469499534299,0
168447,2024-03-29,99933784816784945458,0
168448,2024-03-29,99934250630208249684,0


In [7]:
# Write both tables into one workbook, one sheet per table, without the
# DataFrame index column.
# NOTE(review): the numerical_id values shown in the preview have 20 digits;
# Excel keeps only 15 significant digits for numeric cells, so confirm these
# IDs are exported as text rather than numbers.
export_sheets = (
    ("All Data", df),
    ("Individual Max Spending", max_spend),
)
with pd.ExcelWriter('rwb_spend_data.xlsx') as writer:
    for sheet_title, sheet_frame in export_sheets:
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)

In [8]:
# Completion marker: printed once the preceding cells (including the Excel
# export) have finished when the notebook is run top to bottom.
print('done')

done
