### This helper script is used to rename IQC cdb report files.  

### Why am I doing this?  Because many of the AMC & LSCCMC files are identical.  There is no need to run/download them for each cohort since the same report holds all Vizient member hospitals in the output.  Also, the CCMC and Community files are identical in many cases.  Since the Python Selenium bot requires many hours to run/download all these reports, sometimes I will just run the reports for AMC & CCMC only.  Then, copy the AMC files and rename them LSCCMC.  Same thing for CCMC and Community.  Just download CCMC, copy the CCMC files and rename them as Community.  

### This is true because the AMC & LSCCMC cohorts both use the same risk model (AMC).  The CCMC and Community cohorts both use the same risk model (Community).  

In [99]:
import os
#libraries for interacting with dataframes in python
import pandas as pd
import numpy as np
#library for manipulating Excel files.
from openpyxl import load_workbook
import openpyxl

In [120]:
#Assign the filepath of the IQC cdb report folder to read.  Start at the 'Vizient Q&A Files' directory.
#filepath is the cohort folder; each of its sub-folders is scanned for report files.
filepath = r'P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\fy24 scraped cdb data\period_3_community_ccmc_reports\Vizient Q&A Files\Complex Care Medical Center'
#filepath_output is the folder where the combined Excel summary is written.
filepath_output = r'P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\fy24 scraped cdb data\period_3_community_ccmc_reports\Vizient Q&A Files'

In [129]:
#Report type to process.  Leave exactly one filetype_var line uncommented.
#The value is matched (case-insensitively) against sub-folder names and also
#decides which report columns are kept.
#filetype_var = 'dcost'
#filetype_var = 'edac'
#filetype_var = 'los'
filetype_var = 'mort'
#filetype_var = 'readm'
#Hospitals to pull from each report; values are compared against the 'Hospital' column.
proxy_hospital_list = ['140286 NORTHWESTERN_KISH','149916 NORTHWESTERN_HUNTLEY']

In [122]:
def find_first_ws_col(worksheet_var):
    """Return the 1-based number of the first column with a value in row 1.

    Columns 1 through ``worksheet_var.max_column`` are scanned left to right.
    Returns ``None`` when every cell in row 1 is empty.
    """
    candidate_cols = range(1, worksheet_var.max_column + 1)
    #lazy generator: scanning stops at the first populated cell
    populated = (
        col_no
        for col_no in candidate_cols
        if worksheet_var.cell(row=1, column=col_no).value is not None
    )
    return next(populated, None)

In [123]:
def find_ws_header_row(worksheet_var, first_val_col,
                       header_labels=('Hospital', 'AHRQ Safety', 'Encounter Month', 'Hospital/ Hospital System')):
    """Return the 1-based number of the worksheet's header row.

    Rows 1 through ``worksheet_var.max_row`` are scanned top to bottom, looking
    only at column ``first_val_col``.  The first row whose cell value equals
    one of ``header_labels`` is the header row.

    Parameters:
        worksheet_var: worksheet object exposing ``max_row`` and
            ``cell(row=..., column=...)``.
        first_val_col: 1-based column number to inspect in each row.
        header_labels: cell values that mark a header row.  The default keeps
            the four labels this script has always recognised.

    Returns:
        The matching row number, or ``None`` when no row matches.
    """
    for row_no in range(1, worksheet_var.max_row + 1):
        #fetch the cell once per row instead of once per candidate label
        cell_value = worksheet_var.cell(row=row_no, column=first_val_col).value
        if cell_value in header_labels:
            return row_no
    return None

In [124]:
def find_ws_populated_cols(worksheet_var,header_row):
    """List the 0-based positions of columns that have a value in the header row.

    Worksheet columns are numbered from 1, so each populated column number is
    shifted down by one in the returned list.
    """
    return [
        zero_based
        for zero_based in range(worksheet_var.max_column)
        #worksheet columns are 1-based, hence the + 1 on lookup
        if worksheet_var.cell(row=header_row, column=zero_based + 1).value is not None
    ]

In [125]:
def open_excel_file(path_obj, file_list_obj):
    """Read the first sheet of an Excel report into a pandas dataframe.

    The workbook is opened with openpyxl to locate the header row and the
    populated columns, then parsed with pandas starting at that header row.

    Parameters:
        path_obj: folder containing the workbook.
        file_list_obj: name of the workbook file inside ``path_obj``.

    Returns:
        A dataframe whose columns are the populated header-row columns.

    Raises:
        ValueError: if the file name starts with '~' (a 'ghost file'), if
            row 1 of the sheet is empty, or if no header row is found.
    """
    #check for 'ghost file' with ~ in front of it.
    #The previous guard called .remove() on the name, which fails on a string.
    if file_list_obj.startswith('~'):
        raise ValueError("'{}' is a temporary '~' ghost file, not a report workbook".format(file_list_obj))
    #Create filename path in order to open the excel file
    file_loc = os.path.join(os.path.abspath(path_obj), file_list_obj)
    #save the excel workbook object in a variable
    wb = openpyxl.load_workbook(file_loc)
    #take the first sheet in the workbook
    first_sheet = wb.sheetnames[0]
    ws = wb[first_sheet]
    #find the first column number with a value in it
    first_col = find_first_ws_col(ws)
    if first_col is None:
        raise ValueError("no value found in row 1 of '{}'".format(file_loc))
    #find the first row number with 'Hospital' or 'AHRQ Safety' indicating the header row of the Vizient files
    header_row = find_ws_header_row(ws, first_col)
    if header_row is None:
        #without this check the failure is an opaque TypeError on header_row-1
        raise ValueError("no header row found in '{}'".format(file_loc))
    #create a list of all columns which have values
    populated_columns = find_ws_populated_cols(ws, header_row)
    #parse the Excel spreadsheet to create a pandas dataframe
    #(read_excel already returns a DataFrame for a single sheet name)
    xlsx_file = pd.read_excel(
        file_loc,
        sheet_name=first_sheet,
        skiprows=header_row - 1,
        usecols=populated_columns,
        engine='openpyxl',
    )
    return xlsx_file

In [130]:
#Columns to keep for each report type.  A filetype_var that is not listed
#here keeps every column, as before.
columns_by_filetype = {
    'dcost': ["Hospital","Direct Cost Index","Mean Direct Cost (Obs)","Cases"],
    'edac': ["Hospital","Excess Days Per 100 Index Encounters","Excess Days","Total Index Encounters"],
    'los': ["Hospital","LOS Index","Mean LOS (Obs)","Cases"],
    'mort': ["Hospital","Mortality Index","Deaths (Obs)","Cases"],
    'readm': ["Hospital","PCT HWR Inpatient","Revisit Inpatient Cases","Total Index Encounters"],
}

df_container = []
for item in os.listdir(filepath):
    folder_path = os.path.join(filepath, item)
    #check for filetype_var in folder name BEFORE listing the folder, and
    #skip plain files, so unrelated entries are never opened
    if filetype_var.upper() not in item.upper() or not os.path.isdir(folder_path):
        continue
    for item2 in os.listdir(folder_path):
        #skip 'ghost files' with ~ in front of them; they are not report workbooks
        if item2.startswith('~'):
            continue
        #read the excel file
        excel_file_data = open_excel_file(folder_path, item2)
        #conditionally filter data
        keep_cols = columns_by_filetype.get(filetype_var)
        if keep_cols is not None:
            excel_file_data = excel_file_data[keep_cols]

        #keep only the proxy hospitals; .copy() gives an independent frame so
        #adding the file_name column below does not warn about a slice
        excel_file_data = excel_file_data[excel_file_data['Hospital'].isin(proxy_hospital_list)].copy()

        #check for missing hospitals by name rather than by row count, so a
        #hospital with duplicate rows cannot mask one that is absent
        present_hosp = set(excel_file_data['Hospital'])
        missing_hosp = [x for x in dict.fromkeys(proxy_hospital_list) if x not in present_hosp]
        if missing_hosp:
            #create new dataframe from missing hospital list
            missing_df = pd.DataFrame(missing_hosp, columns =['Hospital'], dtype = str)
            #union missing data to existing data
            excel_file_data = pd.concat([excel_file_data,missing_df])
            #add file_name column
            excel_file_data['file_name'] = item2
            #replace NaN with "missing"
            excel_file_data = excel_file_data.fillna('Missing')
        else:
            #if not missing, just add the file_name column
            excel_file_data['file_name'] = item2
        df_container.append(excel_file_data)

#pd.concat fails with an unhelpful message on an empty list, so say what went wrong
if not df_container:
    raise ValueError("no report files found for filetype '{}' under '{}'".format(filetype_var, filepath))

#union all datasets
final_df = pd.concat(df_container)
final_df = final_df.sort_values(by=['file_name','Hospital'],ascending=True)

In [131]:
final_df

Unnamed: 0,Hospital,Mortality Index,Deaths (Obs),Cases,file_name
204,140286 NORTHWESTERN_KISH,1.40977,1,9,CCMC_MORT_CT_CUSTOM.xlsx
208,149916 NORTHWESTERN_HUNTLEY,0,0,3,CCMC_MORT_CT_CUSTOM.xlsx
0,140286 NORTHWESTERN_KISH,Missing,Missing,Missing,CCMC_MORT_NSURG_CUSTOM.xlsx
186,149916 NORTHWESTERN_HUNTLEY,0,0,1,CCMC_MORT_NSURG_CUSTOM.xlsx
256,140286 NORTHWESTERN_KISH,0.505438,7,179,CCMC_MORT_ONC_CUSTOM.xlsx
263,149916 NORTHWESTERN_HUNTLEY,0.171678,2,148,CCMC_MORT_ONC_CUSTOM.xlsx
248,140286 NORTHWESTERN_KISH,0.558902,13,63,CCMC_MORT_PULM_CUSTOM.xlsx
252,149916 NORTHWESTERN_HUNTLEY,1.04666,28,83,CCMC_MORT_PULM_CUSTOM.xlsx
262,140286 NORTHWESTERN_KISH,0,0,84,CCMC_MORT_TRAUMA_CUSTOM.xlsx
271,149916 NORTHWESTERN_HUNTLEY,0.471441,4,266,CCMC_MORT_TRAUMA_CUSTOM.xlsx


In [132]:
#name the output workbook after the report type, then place it in the output folder
output_name = filetype_var+"_df"+".xlsx"
output_path = os.path.join(filepath_output, output_name)
#write the combined dataframe without its index column
final_df.to_excel(
    output_path,
    index=False,
    engine="openpyxl",
)