In [14]:
import pandas as pd
import pyodbc
import openpyxl
import os
import re
import math
import numpy as np

In [15]:
# Open a workbook from a single string holding the full path and file name.
def get_wb_object2(path_and_file):
    """Load an Excel workbook with openpyxl and return the workbook object.

    Parameters
    ----------
    path_and_file : str
        Full path, including the file name, of the workbook to open.

    Returns
    -------
    The object returned by ``openpyxl.load_workbook``. ``data_only=True`` is
    passed so formula cells yield their stored values rather than formula text.
    """
    return openpyxl.load_workbook(filename=path_and_file, data_only=True)

In [16]:
# List the worksheet names of a workbook given its full path and file name.
def grab_worksheet_list2(path_and_file):
    """Return the list of worksheet names in the workbook at ``path_and_file``.

    Only the sheet names are needed, so the workbook is opened in openpyxl's
    read-only mode (worksheet contents are not parsed up front) and is closed
    explicitly once the names have been copied out.

    Parameters
    ----------
    path_and_file : str
        Full path, including the file name, of the workbook to inspect.

    Returns
    -------
    list of str
        Worksheet names in workbook order.
    """
    wb = openpyxl.load_workbook(path_and_file, read_only=True, data_only=True)
    try:
        return list(wb.sheetnames)
    finally:
        # Read-only workbooks keep the underlying file handle open until closed.
        wb.close()

### Recursively walk the original calculator folder structure, read each workbook, load its cumulative rank worksheet into a dataframe, and clean it.  Concatenate the ranking data from all files and export the result to an Excel workbook ###

In [17]:
def find_year_name(f_name):
    """Extract the four-digit year (2xxx) from a calculator file name.

    The file name is taken as the text after the last backslash of ``f_name``,
    upper-cased and stripped. The regex is greedy, so when several 2xxx runs
    exist the LAST one is returned.

    Parameters
    ----------
    f_name : str
        File name or backslash-separated path.

    Returns
    -------
    tuple (str, int)
        The year text and the index at which that matched year starts inside
        the upper-cased file name.

    Raises
    ------
    ValueError
        If the file name contains no 2xxx year.
    """
    file_nm = f_name.split('\\')[-1].upper().strip()
    print(file_nm)
    year_match = re.search(r'.*([2][0-9]{3})', file_nm)
    if year_match is None:
        raise ValueError('No four-digit year (2xxx) found in file name: ' + file_nm)
    # Use the position of the match itself. str.index() would return the FIRST
    # occurrence of the year text, which can be an earlier, unrelated digit run.
    return (year_match.group(1), year_match.start(1))

In [18]:
def find_period_name(f_name, year_index):
    """Extract the period label (e.g. 'PERIOD2' or 'ANNUAL') from a file name.

    The file name is the text after the last backslash of ``f_name``,
    upper-cased and stripped. The label is the slice from the start of the
    keyword up to ``year_index``, with spaces and underscores removed.
    'PERIOD' takes precedence over 'ANNUAL' when both are present.

    Parameters
    ----------
    f_name : str
        File name or backslash-separated path.
    year_index : int
        Index in the upper-cased file name where the year starts (the second
        value returned by ``find_year_name``).

    Returns
    -------
    tuple (str, int)
        The cleaned period label and the index where the keyword starts.

    Raises
    ------
    ValueError
        If the file name contains neither 'PERIOD' nor 'ANNUAL'.
    """
    file_nm = f_name.split('\\')[-1].upper().strip()
    for keyword in ('PERIOD', 'ANNUAL'):
        period_index = file_nm.find(keyword)
        if period_index != -1:
            period_nm = file_nm[period_index:year_index].replace(' ', '').replace('_', '')
            return (period_nm, period_index)
    # Previously this fell through to an UnboundLocalError on the return line.
    raise ValueError("Neither 'PERIOD' nor 'ANNUAL' found in file name: " + file_nm)

In [19]:
def extract_cumulative_ranks(file_name):
    """Read the 'CurrentQA Cumulative-Rank' sheet of a calculator workbook.

    The first sheet row becomes the header, rows without a parent id are
    dropped, and the result is reduced to a fixed set of score/rank columns.
    Workbooks that have no 'eq_rank' column get 'eq_score', 'eq_rank' and
    'eq_wt_score' added as all-None columns so every file yields the same
    column layout.

    Parameters
    ----------
    file_name : str
        Full path of the calculator workbook.

    Returns
    -------
    pandas.DataFrame
        An independent frame (safe for the caller to add columns to).

    Raises
    ------
    IndexError
        If no column name starts with 'PARENT' and ends with 'ID'
        (case-insensitive).
    KeyError
        If the sheet or one of the expected columns is missing.
    """
    calc_wb = get_wb_object2(file_name)
    df = pd.DataFrame(calc_wb['CurrentQA Cumulative-Rank'].values)
    # openpyxl yields plain rows, so promote the first row to column names.
    df = df.rename(columns=df.iloc[0])
    df = df.drop([0])

    # The parent id header is sometimes all caps and sometimes not.
    parent_id_matches = [
        col for col in df.columns
        if isinstance(col, str)
        and col.upper().startswith('PARENT')
        and col.upper().endswith('ID')
    ]
    if not parent_id_matches:
        raise IndexError("No PARENT...ID column found in 'CurrentQA Cumulative-Rank' of " + str(file_name))
    parent_id_col = parent_id_matches[0]

    # Keep only rows that carry a hospital (parent id) value.
    df = df[df[parent_id_col].notnull()]

    equity_cols = ['eq_score', 'eq_rank', 'eq_wt_score']
    output_cols = [
        parent_id_col, 'HCO_SHORT_NAME',
        'Mort_score', 'mort_Rank', 'Eff_Score', 'eff_Rank',
        'Safety_score', 'safety_Rank', 'effect_score', 'effect_Rank',
        'patct_score', 'Patct_Rank', 'eq_score', 'eq_rank',
        'final_score', 'final_Rank',
        'mort_wt_score', 'eff_wt_score', 'safety_wt_score',
        'effect_wt_score', 'patct_wt_score', 'eq_wt_score',
    ]

    if 'eq_rank' not in df.columns:
        # Copy before adding columns so we never assign onto a slice.
        df = df[[col for col in output_cols if col not in equity_cols]].copy()
        for col in equity_cols:
            df[col] = None

    return df[output_cols].copy()

In [20]:
def extract_metric_ranks(file_name):
    """Read the 'CurrentQA Cumulative-Metric' sheet of a calculator workbook.

    The first sheet row becomes the header, rows without a parent id are
    dropped, and the frame is reduced to the metric name, the parent id and
    three value columns. The value columns are named differently across
    workbooks, so the known header layouts are tried in order.

    Parameters
    ----------
    file_name : str
        Full path of the calculator workbook.

    Returns
    -------
    pandas.DataFrame
        Five columns: metric name, parent id, rate, overall weight, z.

    Raises
    ------
    IndexError
        If the parent id or metric name column cannot be found.
    KeyError
        If the sheet is missing or none of the known layouts matches.
    """
    calc_wb = get_wb_object2(file_name)
    df = pd.DataFrame(calc_wb['CurrentQA Cumulative-Metric'].values)
    # openpyxl yields plain rows, so promote the first row to column names.
    df = df.rename(columns=df.iloc[0])
    df = df.drop([0])
    print(df.columns)

    # NOTE(review): '592 occura' is presumably a broken 'Metric' header found
    # in some workbooks — confirm which files need this.
    if '592 occura' in df.columns:
        df = df.rename(columns={'592 occura': "Metric"})

    # The parent id header is sometimes all caps and sometimes not.
    parent_id_col = [
        col for col in df.columns
        if isinstance(col, str)
        and col.upper().startswith('PARENT')
        and col.upper().endswith('ID')
    ][0]
    metric_nm_col = [
        col for col in df.columns
        if isinstance(col, str) and col.upper() == 'METRIC'
    ][0]

    # Keep only rows that carry a hospital (parent id) value.
    df = df[df[parent_id_col].notnull()]

    known_layouts = (
        ['rate', 'overall_weight', 'z'],
        ['rate/result2', '% overall/wt_metric_score', 'z'],
        ['rate', '% Overall/Overall Weight', 'z'],
    )
    for value_cols in known_layouts:
        try:
            return df[[metric_nm_col, parent_id_col] + value_cols].copy()
        except KeyError:
            # Only a missing column means "try the next layout"; any other
            # error is a real problem and must propagate.
            continue
    raise KeyError(
        'None of the known metric column layouts matched the headers of '
        + str(file_name) + ': ' + str(list(df.columns))
    )

In [21]:
def find_ws_measure_weight_xls_header_row(xl_file_obj):
    """Locate the header row of the 'Metric Weights' worksheet.

    The sheet is read without a header, and the index of the first row whose
    first-column value equals 'Metric' is returned. An IndexError is raised
    when no such row exists.
    """
    raw_sheet = pd.read_excel(
        xl_file_obj,
        sheet_name="Metric Weights",
        header=None,
        engine='openpyxl',
    )
    leading_column = raw_sheet[raw_sheet.columns[0]]
    header_hits = raw_sheet.index[leading_column.eq('Metric')].tolist()
    return header_hits[0]

In [22]:
def open_calc_measure_weight(file_obj):
    """Read the metric and domain columns of the 'Metric Weights' worksheet.

    The header row is located first (it is not always the first row), the
    sheet is re-read skipping the rows above it, and only the columns named
    'Metric' or 'Domain' (any letter case) are kept. Every case variant of the
    metric header is renamed to 'Metric'; the domain header keeps its original
    spelling. Rows that are entirely empty are dropped.

    Parameters
    ----------
    file_obj : str
        Path of the calculator workbook; it is made absolute before reading.

    Returns
    -------
    pandas.DataFrame
        The kept metric/domain columns.
    """
    file_loc2 = os.path.abspath(file_obj)
    # Index of the header row tells read_excel how many leading rows to skip.
    measure_weight_header_row = find_ws_measure_weight_xls_header_row(file_loc2)
    opened_excel_file = pd.read_excel(
        file_loc2,
        sheet_name="Metric Weights",
        skiprows=measure_weight_header_row,
        engine='openpyxl',
    )

    # Header cells may be non-strings (numbers, dates); skip those instead of
    # failing on .upper().
    keep_cols = [
        col for col in opened_excel_file.columns
        if isinstance(col, str) and col.upper() in ('METRIC', 'DOMAIN')
    ]
    opened_excel_file = opened_excel_file[keep_cols]

    # Normalise 'metric', 'METRIC' and any other spelling to 'Metric'.
    metric_renames = {
        col: 'Metric' for col in keep_cols
        if col.upper() == 'METRIC' and col != 'Metric'
    }
    opened_excel_file = opened_excel_file.rename(columns=metric_renames)

    return opened_excel_file.dropna(how='all')

In [23]:
#test_path = r'P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\original\2019\period2_original'
#test_file = r'QACalculator_140242_Period 2_2019.xlsm'

In [24]:
#t = open_calc_measure_weight(os.path.join(test_path,test_file))

# SPECIAL SECTION:  2021 Period 3 Baseline & 2021 Period 4 Annual calculators required re-cohorting
# Delnor, Kish and Huntley were moved to the Community cohort, but NM wants them evaluated in the Complex Care Medical Center (CCMC) cohort.  Therefore, a proxy Period 3 CCMC calculator is created for those 3 hospitals, and a special function manually extracts those values and appends them to the all_period_rankings dataset.

# As of 2023, this is only done for Kish and Huntley.

## BEGIN ADHOC function section ##

In [25]:
def grab_calc_worksheet_calc_sheet(path_obj,file_obj):
    """Open a calculator workbook and return its 'Calculator' worksheet.

    ``path_obj`` (folder) and ``file_obj`` (file name) are joined and made
    absolute before the workbook is loaded with ``data_only=True``.
    """
    workbook_path = os.path.abspath(os.path.join(path_obj, file_obj))
    workbook = openpyxl.load_workbook(workbook_path, data_only=True)
    return workbook['Calculator']

In [26]:
def find_calc_phrase_cell_coords(worksheet_obj,phrase):
    """Find the first cell whose value equals ``phrase``.

    Cells are visited row by row in the order ``iter_rows()`` yields them.
    Returns ``[row, column, phrase]`` for the first match, or ``None`` when no
    cell matches.
    """
    cells_in_order = (
        cell
        for sheet_row in worksheet_obj.iter_rows()
        for cell in sheet_row
    )
    first_hit = next((cell for cell in cells_in_order if cell.value == phrase), None)
    if first_hit is None:
        return None
    return [first_hit.row, first_hit.column, phrase]

In [27]:
def parse_calculator_and_return_clean_dataframe(path_obj, file_obj):
    """Turn the 'Calculator' worksheet of a workbook into a tidy dataframe.

    Steps:
      1. Locate the 'Domain' header cell; its row is the header row and its
         column is the first column to keep.
      2. Drop everything left of / above that cell and use the header row as
         column names.
      3. Drop all-empty columns and rows.
      4. Forward-fill the per-domain columns, whose values appear on only one
         row per domain group.
      5. Keep the first two columns plus the block that starts four columns
         before ' Domain Rank Result' and ends at 'Target Domain Ranking'.
      6. Remove the copyright row.

    Parameters
    ----------
    path_obj : str
        Folder containing the workbook.
    file_obj : str
        Workbook file name.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the worksheet has no cell equal to 'Domain'.
    KeyError
        If one of the expected header names is missing.
    """
    ws = grab_calc_worksheet_calc_sheet(path_obj, file_obj)

    domain_coord = find_calc_phrase_cell_coords(ws, 'Domain')
    if domain_coord is None:
        # Fail with a clear message instead of a TypeError on None below.
        raise ValueError("No 'Domain' header cell found in the 'Calculator' sheet of " + str(file_obj))
    header_row, header_col = domain_coord[0], domain_coord[1]  # 1-based sheet coordinates

    df = pd.DataFrame(ws.values)

    # Drop all columns to the left of the 'Domain' header.
    df = df.drop(list(range(header_col - 1)), axis=1)

    # Use the 'Domain' header row as the column names ...
    df = df.rename(columns=df.iloc[header_row - 1])

    # ... then drop that row and everything above it.
    df = df.drop(list(range(header_row)))

    df = df.dropna(axis='columns', how="all")
    df = df.dropna(how="all")

    # These values appear once per domain group; repeat them on every row.
    # (.ffill() replaces the deprecated fillna(method='ffill').)
    for fill_col in ['Domain', ' Domain Rank Result', 'Target Domain Ranking', 'Current Domain Ranking']:
        df[fill_col] = df[fill_col].ffill()

    # "Metric Value" occurs several times, so the block to keep is located
    # relative to ' Domain Rank Result' (four columns before it) instead.
    first_kept = df.columns.get_loc(' Domain Rank Result') - 4
    last_kept = df.columns.get_loc('Target Domain Ranking')
    # Positions 0 and 1 are the domain and measure columns.
    final_keep_cols = [0, 1] + list(range(first_kept, last_kept + 1))
    df = df.iloc[:, final_keep_cols]

    # Remove the copyright row at the bottom.
    df = df[~df['Domain'].str.contains('Copyright', na=False)]

    # Rows marked 'No Rank' are intentionally NOT removed here; the caller
    # filters them when needed.
    return df

In [28]:
def create_2021_proxy_ccmc_hosp_rows(modified_calc_file_path,modified_calc_list, empty_df_container,ccmc_proxy_year_var,ccmc_proxy_period_var,item_final_rank_var,hco_short_name_list_local,parent_id_list_local,cohort_nm=None):
    """Build one ranking row per proxy calculator workbook.

    For each workbook in ``modified_calc_list`` the 'Calculator' sheet is
    parsed, unused rows are removed ('No Rank' domains, 'LV' / '-' / blank
    metric values), and the per-domain weighted score (sum of
    '% of Overall Score') and domain rank are collected for the six domains.

    Parameters
    ----------
    modified_calc_file_path : str
        Folder holding the proxy calculator workbooks.
    modified_calc_list : list of str
        Workbook file names. The three ``*_local`` / ``*_var`` lists below are
        read by the same position, so entry i of each must describe the same
        hospital.
    empty_df_container : pandas.DataFrame
        Frame the new rows are added to; it is not modified.
    ccmc_proxy_year_var, ccmc_proxy_period_var
        Values written to the 'YEAR' and 'PERIOD' columns of every row.
    item_final_rank_var : list
        Final rank per workbook.
    hco_short_name_list_local : list of str
        Hospital short name per workbook.
    parent_id_list_local : list of str
        Parent id per workbook.
    cohort_nm : str, optional
        Value for the 'COHORT' column. When omitted, the module-level
        ``ccmc_proxy_cohort_nm`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        ``empty_df_container`` followed by one new row per workbook.

    Raises
    ------
    IndexError
        If one of the six domains is missing from a workbook.
    """
    if cohort_nm is None:
        cohort_nm = ccmc_proxy_cohort_nm

    # Domain name -> (rank column, weighted score column) in the output row.
    # The column spellings are irregular on purpose; they match the rankings file.
    domain_output_keys = {
        'Effectiveness': ('effect_Rank', 'effect_wt_score'),
        'Efficiency': ('eff_Rank', 'eff_wt_score'),
        'Equity': ('eq_rank', 'eq_wt_score'),
        'Mortality': ('mort_Rank', 'mort_wt_score'),
        'Patient Centeredness': ('Patct_Rank', 'patct_wt_score'),
        'Safety': ('safety_Rank', 'safety_wt_score'),
    }

    new_rows = []
    for i, item in enumerate(modified_calc_list):
        clean_df = parse_calculator_and_return_clean_dataframe(modified_calc_file_path, item)

        # Domains marked 'No Rank' are not used.
        clean_df = clean_df[clean_df[' Domain Rank Result'] != 'No Rank']

        # Remove rows with the LV marker, the '-' marker, or no metric value.
        clean_df = clean_df[clean_df['Metric Value'] != 'LV']
        clean_df = clean_df[clean_df['Metric Value'] != '-']
        clean_df = clean_df[pd.notnull(clean_df['Metric Value'])]

        overall_score = clean_df['% of Overall Score'].sum()
        domain_sums = clean_df.groupby(['Domain'])['% of Overall Score'].sum().reset_index()
        domain_ranks = clean_df[['Domain',' Domain Rank Result']].drop_duplicates()

        print()

        row = {
            'COHORT': cohort_nm,
            'HCO_SHORT_NAME': hco_short_name_list_local[i],
            'PERIOD': ccmc_proxy_period_var,
            'Parent_ID': parent_id_list_local[i],
            'YEAR': ccmc_proxy_year_var,
            'final_Rank': item_final_rank_var[i],
            'final_score': overall_score,
            # Raw domain scores are not available from the calculator sheet.
            'Eff_Score': None, 'Mort_score': None, 'Safety_score': None,
            'effect_score': None, 'eq_score': None, 'patct_score': None,
        }
        for domain_nm, (rank_key, wt_key) in domain_output_keys.items():
            row[wt_key] = domain_sums[domain_sums['Domain'] == domain_nm]['% of Overall Score'].iloc[0]
            row[rank_key] = domain_ranks[domain_ranks['Domain'] == domain_nm][' Domain Rank Result'].iloc[0]
        new_rows.append(row)

        print('Row added:',hco_short_name_list_local[i])
        print('Overall Score:',overall_score)
        print(domain_sums)
        print('--------------------------------------------')
        print(domain_ranks)

    if not new_rows:
        return empty_df_container

    # DataFrame.append was removed in pandas 2.0; build the rows once and
    # combine them with the container in a single step.
    new_rows_df = pd.DataFrame(new_rows)
    if empty_df_container.empty:
        # Keep the container's column order, followed by any extra columns.
        container_cols = list(empty_df_container.columns)
        extra_cols = [col for col in new_rows_df.columns if col not in container_cols]
        return new_rows_df.reindex(columns=container_cols + extra_cols)
    return pd.concat([empty_df_container, new_rows_df], ignore_index=True)

In [29]:
# Proxy CCMC inputs for 2021 Period 3 (Delnor, Huntley, Kish).
# These values are passed to create_2021_proxy_ccmc_hosp_rows when the rankings file is built.
# The item, final-rank, parent-id and short-name lists are read by position there, so entry i
# of each list must describe the same hospital.
ccmc_proxy_calc_file_path_p32021 = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\fy22\DELNOR_KISH_HUNTLEY_Period3_CCMC_PROXY\calcs_modified"
ccmc_proxy_item_list_p32021 = ['QACalculator_DELNOR_Period3_2021.xlsm','QACalculator_HH_Period3_2021.xlsm','QACalculator_KISH_Period3_2021.xlsm']
# NOTE(review): final ranks are hard-coded; their source is not shown in this notebook — confirm.
item_final_rank_p32021 = [12,15,2]
# NOTE(review): the year is an int here, while years parsed from file names are strings — confirm downstream handling.
ccmc_proxy_year_p32021 = 2021
ccmc_proxy_period_p32021 = 'PERIOD3'
parent_id_list_p32021 = ['140211','149916','140286']
hco_short_name_list_p32021 = ['NORTHWESTERN_DELNOR_PROXY','NORTHWESTERN_HUNTLEY_PROXY','NORTHWESTERN_KISH_PROXY']
# Read as a module-level global inside create_2021_proxy_ccmc_hosp_rows; the same value is
# re-assigned in every proxy configuration cell.
ccmc_proxy_cohort_nm = 'Complex Care Medical Center'

In [30]:
# Proxy CCMC inputs for 2021 Period 4 / annual (Delnor, Huntley, Kish).
# Passed to create_2021_proxy_ccmc_hosp_rows; the four lists are read by position, so entry i
# of each list must describe the same hospital.
ccmc_proxy_calc_file_path_p42021 = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\fy22\DELNOR_KISH_HUNTLEY_Period4_CCMC_PROXY\calcs_modified"
ccmc_proxy_item_list_p42021 = ['QACalculator_DELNOR_annual_2021.xlsm','QACalculator_HH_annual_2021.xlsm','QACalculator_KISH_annual_2021.xlsm']
# NOTE(review): final ranks are hard-coded; their source is not shown in this notebook — confirm.
item_final_rank_p42021 = [8,7,2]
ccmc_proxy_year_p42021 = 2021
# The period label is 'ANNUAL' (not 'PERIOD4') for this calculator.
ccmc_proxy_period_p42021 = 'ANNUAL'
parent_id_list_p42021 = ['140211','149916','140286']
hco_short_name_list_p42021 = ['NORTHWESTERN_DELNOR_PROXY','NORTHWESTERN_HUNTLEY_PROXY','NORTHWESTERN_KISH_PROXY']
# Read as a module-level global inside create_2021_proxy_ccmc_hosp_rows.
ccmc_proxy_cohort_nm = 'Complex Care Medical Center'

In [31]:
# Proxy CCMC inputs for 2022 Period 2 (Huntley, Kish).
# Passed to create_2021_proxy_ccmc_hosp_rows; the four lists are read by position, so entry i
# of each list must describe the same hospital.
ccmc_proxy_calc_file_path_p22022 = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\fy23\KISH_HUNTLEY_Period2_CCMC_PROXY\calcs_modified"
ccmc_proxy_item_list_p22022 = ['QACalculator_149916_Period2_2022.xlsm','QACalculator_140286_Period2_2022.xlsm']
# NOTE(review): final ranks are hard-coded; their source is not shown in this notebook — confirm.
item_final_rank_p22022 = [16,16]
ccmc_proxy_year_p22022 = 2022
ccmc_proxy_period_p22022 = 'PERIOD2'
parent_id_list_p22022 = ['149916','140286']
hco_short_name_list_p22022 = ['NORTHWESTERN_HUNTLEY_PROXY','NORTHWESTERN_KISH_PROXY']
# Read as a module-level global inside create_2021_proxy_ccmc_hosp_rows.
ccmc_proxy_cohort_nm = 'Complex Care Medical Center'

In [32]:
# Proxy CCMC inputs for 2022 Period 3 (Huntley, Kish).
# Passed to create_2021_proxy_ccmc_hosp_rows; the four lists are read by position, so entry i
# of each list must describe the same hospital.
ccmc_proxy_calc_file_path_p32022 = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\fy23\KISH_HUNTLEY_Period3_CCMC_PROXY\calcs_modified"
ccmc_proxy_item_list_p32022 = ['QACalculator_149916_Period3_2022.xlsm','QACalculator_140286_Period3_2022.xlsm']
# NOTE(review): final ranks are hard-coded; their source is not shown in this notebook — confirm.
item_final_rank_p32022 = [13,14]
ccmc_proxy_year_p32022 = 2022
ccmc_proxy_period_p32022 = 'PERIOD3'
parent_id_list_p32022 = ['149916','140286']
hco_short_name_list_p32022 = ['NORTHWESTERN_HUNTLEY_PROXY','NORTHWESTERN_KISH_PROXY']
# Read as a module-level global inside create_2021_proxy_ccmc_hosp_rows.
ccmc_proxy_cohort_nm = 'Complex Care Medical Center'

In [33]:
# Proxy CCMC inputs for 2023 Period 2 (Huntley, Kish).
# Passed to create_2021_proxy_ccmc_hosp_rows; the four lists are read by position, so entry i
# of each list must describe the same hospital.
ccmc_proxy_calc_file_path_p22023 = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\fy24\KISH_HUNTLEY_Period2_CCMC_PROXY\calcs_modified"
ccmc_proxy_item_list_p22023 = ['QACalculator_149916_Period2_2023.xlsm','QACalculator_140286_Period2_2023.xlsm']
# NOTE(review): final ranks are hard-coded; their source is not shown in this notebook — confirm.
item_final_rank_p22023 = [21,8]
ccmc_proxy_year_p22023 = 2023
ccmc_proxy_period_p22023 = 'PERIOD2'
parent_id_list_p22023 = ['149916','140286']
hco_short_name_list_p22023 = ['NORTHWESTERN_HUNTLEY_PROXY','NORTHWESTERN_KISH_PROXY']
# Read as a module-level global inside create_2021_proxy_ccmc_hosp_rows.
ccmc_proxy_cohort_nm = 'Complex Care Medical Center'

In [35]:
# Proxy CCMC inputs for 2023 Period 3 (Huntley, Kish).
# Passed to create_2021_proxy_ccmc_hosp_rows; the four lists are read by position, so entry i
# of each list must describe the same hospital.
ccmc_proxy_calc_file_path_p32023 = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\fy24\KISH_HUNTLEY_Period3_CCMC_PROXY\calcs_modified"
ccmc_proxy_item_list_p32023= ['QACalculator_149916_Period3_2023.xlsm','QACalculator_140286_Period3_2023.xlsm']
# NOTE(review): final ranks are hard-coded; their source is not shown in this notebook — confirm.
item_final_rank_p32023 = [11,6]
ccmc_proxy_year_p32023 = 2023
ccmc_proxy_period_p32023 = 'PERIOD3'
parent_id_list_p32023 = ['149916','140286']
hco_short_name_list_p32023 = ['NORTHWESTERN_HUNTLEY_PROXY','NORTHWESTERN_KISH_PROXY']
# Read as a module-level global inside create_2021_proxy_ccmc_hosp_rows.
ccmc_proxy_cohort_nm = 'Complex Care Medical Center'

In [36]:
# Empty dataframe whose columns define the layout of the CCMC proxy rows.
# It is passed as the starting container to every create_2021_proxy_ccmc_hosp_rows call,
# and its column names match the keys of the row that function builds.
ccmc_proxy_df = pd.DataFrame(columns = ['COHORT','Eff_Score','HCO_SHORT_NAME','Mort_score','PERIOD','Parent_ID','Patct_Rank','Safety_score','YEAR','eff_Rank','eff_wt_score','effect_Rank','effect_score','effect_wt_score','eq_rank','eq_score','eq_wt_score','final_Rank','final_score','mort_Rank','mort_wt_score','patct_score','patct_wt_score','safety_Rank','safety_wt_score'])

## END adhoc function section

# CODE to create all_period_vizient ranking file

In [37]:
# Build the all-period Vizient rankings file:
#   1. walk the calculator folder and read the cumulative rank sheet of every tracked workbook,
#   2. add the CCMC proxy rows,
#   3. compute top-decile / top-quartile rank cut-offs per cohort, period and year,
#   4. flag hospitals at or above those cut-offs and export everything to Excel.
path = os.path.abspath(input('Enter path of original Vizient Calculator folders.'))

files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        if '.xlsm' in file:
            files.append(os.path.join(r, file))
file_counter = 0

# A workbook is processed only when its full path contains one of these tags.
# NOTE(review): '140211' (Delnor) is not listed, so Delnor workbooks are only reached when
# their path also contains another tag — confirm this is intended.
tracked_file_tags = ('140130', '140281', '140242', '141340', 'GreatState', '149916')
# Years in which Delnor (140211) is treated as Community instead of CCMC.
recent_years = ['2021', '2022', '2023']
rank_group_cols = ['COHORT', 'PERIOD', 'YEAR']

dataframe_list = []
for f in files:
    if not any(tag in f for tag in tracked_file_tags):
        continue

    # year and period label are parsed from the file name
    year, year_index = find_year_name(f)
    period_nm, period_index = find_period_name(f, year_index)
    is_recent_year = year in recent_years

    # First matching tag wins; only Delnor's cohort depends on the year.
    if 'Oncology' in f:
        cohort_nm = 'Oncology'
    elif '141340' in f:
        cohort_nm = 'Critical Access'
    elif '140281' in f:
        cohort_nm = 'Comprehensive Academic Medical Center'
    elif '140242' in f:
        cohort_nm = 'Large Specialized Complex Care Medical Center'
    elif '140130' in f:
        cohort_nm = 'Complex Care Medical Center'
    elif '140211' in f:
        cohort_nm = 'Community' if is_recent_year else 'Complex Care Medical Center'
    elif '149916' in f:
        cohort_nm = 'Community'
    elif 'GreatState' in f:
        cohort_nm = 'Community'

    if is_recent_year:
        # Period 0 is the previous year's annual calculator under the new risk model;
        # Delnor had not switched cohorts yet, so that file is skipped.
        skip_file = ('140211' in f) and period_nm == 'PERIOD0'
    else:
        skip_file = ('140211' in f) or ('149916' in f)

    if skip_file:
        print(f)
        print(year)
        print('Delnor passed')
        continue

    # .copy() so the columns added below never write onto a slice.
    df = extract_cumulative_ranks(f).copy()
    file_counter += 1
    df['YEAR'] = year
    df['PERIOD'] = period_nm
    df['COHORT'] = cohort_nm

    # The parent id header is sometimes all caps and sometimes not; normalise every
    # spelling to 'Parent_ID' (previously only 'parent_id' was renamed, so an
    # all-caps header raised KeyError below).
    parent_id_col = [x for x in df.columns if isinstance(x, str) and x.upper().startswith('PARENT') and x.upper().endswith('ID')][0]
    df = df.rename(columns={parent_id_col: "Parent_ID"})

    print(df.shape)

    # keep parent ids as text
    df['Parent_ID'] = df['Parent_ID'].astype(str)

    dataframe_list.append(df)

ccmc_proxy_2021_period_3_vals = create_2021_proxy_ccmc_hosp_rows(ccmc_proxy_calc_file_path_p32021,ccmc_proxy_item_list_p32021,ccmc_proxy_df,ccmc_proxy_year_p32021,ccmc_proxy_period_p32021,item_final_rank_p32021,hco_short_name_list_p32021,parent_id_list_p32021)

ccmc_proxy_2021_period_4_vals = create_2021_proxy_ccmc_hosp_rows(ccmc_proxy_calc_file_path_p42021,ccmc_proxy_item_list_p42021,ccmc_proxy_df,ccmc_proxy_year_p42021,ccmc_proxy_period_p42021,item_final_rank_p42021,hco_short_name_list_p42021,parent_id_list_p42021)

ccmc_proxy_2022_period_2_vals = create_2021_proxy_ccmc_hosp_rows(ccmc_proxy_calc_file_path_p22022,ccmc_proxy_item_list_p22022,ccmc_proxy_df,ccmc_proxy_year_p22022,ccmc_proxy_period_p22022,item_final_rank_p22022,hco_short_name_list_p22022,parent_id_list_p22022)

ccmc_proxy_2022_period_3_vals = create_2021_proxy_ccmc_hosp_rows(ccmc_proxy_calc_file_path_p32022,ccmc_proxy_item_list_p32022,ccmc_proxy_df,ccmc_proxy_year_p32022,ccmc_proxy_period_p32022,item_final_rank_p32022,hco_short_name_list_p32022,parent_id_list_p32022)

ccmc_proxy_2023_period_2_vals = create_2021_proxy_ccmc_hosp_rows(ccmc_proxy_calc_file_path_p22023,ccmc_proxy_item_list_p22023,ccmc_proxy_df,ccmc_proxy_year_p22023,ccmc_proxy_period_p22023,item_final_rank_p22023,hco_short_name_list_p22023,parent_id_list_p22023)

ccmc_proxy_2023_period_3_vals = create_2021_proxy_ccmc_hosp_rows(ccmc_proxy_calc_file_path_p32023,ccmc_proxy_item_list_p32023,ccmc_proxy_df,ccmc_proxy_year_p32023,ccmc_proxy_period_p32023,item_final_rank_p32023,hco_short_name_list_p32023,parent_id_list_p32023)

# Years parsed from file names are strings, while the proxy configuration uses ints.
# Cast the proxy years to text so proxy rows fall into the SAME cohort/period/year
# group as the real hospitals instead of forming separate groups of their own.
proxy_frames = [
    proxy_frame.assign(YEAR=proxy_frame['YEAR'].astype(str))
    for proxy_frame in (
        ccmc_proxy_2021_period_3_vals,
        ccmc_proxy_2021_period_4_vals,
        ccmc_proxy_2022_period_2_vals,
        ccmc_proxy_2022_period_3_vals,
        ccmc_proxy_2023_period_2_vals,
        ccmc_proxy_2023_period_3_vals,
    )
]
dataframe_list.extend(proxy_frames)

result_df = pd.concat(dataframe_list,sort=True)

############################################################################
#calculate top decile and top quartile per cohort per calculator period/year
#then join back to the final result_df and export.

decile_df = result_df.groupby(rank_group_cols).size().reset_index(name='cohort_size')

# Proxy hospitals were added to their cohorts above but must not count towards the
# cohort size used for the decile/quartile cut-offs. Subtract the number of proxy rows
# from every group that received any (previously only 2021 Period 3 was corrected).
proxy_counts = pd.concat(proxy_frames).groupby(rank_group_cols).size().reset_index(name='proxy_count')
decile_df = pd.merge(decile_df, proxy_counts, how="left", on=rank_group_cols)
decile_df['cohort_size'] = decile_df['cohort_size'] - decile_df['proxy_count'].fillna(0)

#calculate quantile cut-off ranks
decile_df['top_decile_rank'] = (decile_df['cohort_size'] * .10).round()
decile_df['top_quartile_rank'] = (decile_df['cohort_size'] * .25).round()

#drop helper columns
decile_df = decile_df.drop(['cohort_size','proxy_count'],axis=1)
print(decile_df.head())
#left join the cut-offs to the original ranks table
final_merged_df = pd.merge(result_df, decile_df, how="left", on=rank_group_cols)
#create binary columns to flag top decile and top quartile hospitals for each cohort and time period.
# Ranks come from spreadsheets and may be stored as objects; compare numerically and
# treat unparseable ranks as "not in the top group".
final_rank_numeric = pd.to_numeric(final_merged_df['final_Rank'], errors='coerce')
final_merged_df['top_decile_hosp'] = np.where((final_rank_numeric <= final_merged_df['top_decile_rank']), 1, 0)
final_merged_df['top_quartile_hosp'] = np.where((final_rank_numeric <= final_merged_df['top_quartile_rank']), 1, 0)

############################################################################
# `path` (the calculator folder) is no longer overwritten with the results folder.
result_path = os.path.abspath(input('Enter path of folder to store final results.'))
# NOTE(review): this changes the working directory for every later cell.
os.chdir(os.path.abspath(result_path))
#result_df.to_csv('all_period_vizient_rankings_up_to_period2_2021.csv')
final_merged_df.to_excel('all_period_vizient_rankings_07.11.2023.xlsx',sheet_name = 'all_period_vizient_rankings_up_', index=False)
print(file_counter)

Enter path of original Vizient Calculator folders.P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\original
QACALCULATOR_140130_PERIOD 1_2019.XLSM
(115, 25)
QACALCULATOR_140242_PERIOD 1_2019.XLSM
(105, 25)
QACALCULATOR_140281_PERIOD 1_2019.XLSM
(101, 25)
QACALCULATOR_140130_PERIOD 2_2019.XLSM


KeyboardInterrupt: 

### Recursively walk down the original calculator folder structure, read in the workbook, grab the cumulative METRIC worksheet into a dataframe, clean it. Concatenate all ranking file data together and export to csv

# Top-level copy of the steps in extract_metric_ranks, for a single workbook.
# NOTE(review): `file_name` is not assigned anywhere in the visible notebook; it must be
# defined before this runs — confirm whether this cell is still needed.
calc_wb = get_wb_object2(file_name)
# load the cumulative metric worksheet into a pandas dataframe
df = pd.DataFrame(calc_wb['CurrentQA Cumulative-Metric'].values)
# openpyxl yields plain rows, so promote the first row to column names
df = df.rename(columns=df.iloc[0])
# drop the first row because it is now the header
df = df.drop([0])
print(df.columns)
# retrieve the name of the parent id column.  Sometimes this column name is all caps, sometimes not.
parent_id_col = [x for x in df.columns if type(x) == str and x is not None and x.upper().startswith('PARENT') == True and x.upper().endswith('ID') == True][0]
metric_nm_col = [x for x in df.columns if type(x) == str and x is not None and x.upper() == 'METRIC'][0]
# drop all rows that do not have a hospital value
df_all_row_indices =  df[parent_id_col].notnull()
df = df[df_all_row_indices]

# The value columns are named differently across workbooks; try each known layout.
# Only a missing column (KeyError) moves on to the next layout, so unrelated errors
# are no longer swallowed.
try:
    df = df[[metric_nm_col,parent_id_col,'rate','overall_weight','z']]
except KeyError:
    try:
        df = df[[metric_nm_col,parent_id_col,'rate/result2','% overall/wt_metric_score','z']]
    except KeyError:
        df = df[[metric_nm_col,parent_id_col,'rate','% Overall/Overall Weight','z']]

In [48]:
path = os.path.abspath(input('Enter path of original Vizient Calculator folders.'))

# Collect every macro-enabled calculator workbook found under `path` (searched recursively).
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        # Match on the real extension (case-insensitively) instead of any '.xlsm'
        # substring, and skip the '~$...' lock files Excel leaves next to an open
        # workbook: they are not readable workbooks.
        if file.lower().endswith('.xlsm') and not file.startswith('~$'):
            files.append(os.path.join(r, file))
# Number of workbooks actually processed by the loop that follows.
file_counter = 0

# One cleaned dataframe per processed workbook.
dataframe_list = []
# Path substrings that identify the calculator files to process.
CALCULATOR_TOKENS = ('140130', '140281', '140242', '141340', 'GreatState', '140211')

# Cohort assigned per path token. Tokens are checked in this order and the first
# one found in the path wins. The two tables differ only for 140211 (and the
# extra 149916 entry in 2021).
COHORTS_2021 = (
    ('141340', 'Critical Access'),
    ('140281', 'Comprehensive Academic Medical Center'),
    ('140242', 'Large Specialized Complex Care Medical Center'),
    ('140130', 'Complex Care Medical Center'),
    ('140211', 'Community'),
    ('149916', 'Community'),
    ('GreatState', 'Community'),
)
COHORTS_OTHER_YEARS = (
    ('141340', 'Critical Access'),
    ('140281', 'Comprehensive Academic Medical Center'),
    ('140242', 'Large Specialized Complex Care Medical Center'),
    ('140130', 'Complex Care Medical Center'),
    ('140211', 'Complex Care Medical Center'),
    ('GreatState', 'Community'),
)

# Header variants seen across calculator versions -> standard output names.
METRIC_COLUMN_RENAMES = {
    'metric': 'Metric',
    '592 occura': 'Metric',
    'parent_id': 'Parent_ID',
    'rate/result2': 'metric_value',
    'rate': 'metric_value',
    '% Overall/Overall Weight': 'metric_score',
    '% overall/wt_metric_score': 'metric_score',
    'overall_weight': 'metric_score',
    'z_score': 'z',
    'Z': 'z',
}
METRIC_OUTPUT_COLUMNS = ['COHORT', 'Metric', 'PERIOD', 'Parent_ID', 'YEAR', 'metric_score', 'metric_value', 'z']


def _cohort_for_file(calc_path, year):
    """Return the cohort name for a calculator path, or None if no known token is in it."""
    table = COHORTS_2021 if year == '2021' else COHORTS_OTHER_YEARS
    for token, cohort in table:
        if token in calc_path:
            return cohort
    return None


def _is_skipped_140211_file(calc_path, year, period_nm):
    """Return True when a 140211 (Delnor) calculator must be left out.

    In 2021 only PERIOD0 is skipped: period 0 is just the previous year's annual
    calculator using the new risk model, and Delnor had not switched cohorts yet.
    In every other year all 140211 files are skipped.
    """
    if '140211' not in calc_path:
        return False
    if year == '2021':
        return period_nm == 'PERIOD0'
    return True


def _standardize_metric_frame(df):
    """Rename header variants, clean Parent_ID and Metric, and keep the output columns.

    Raises KeyError if no column whose name starts with 'PARENT' and ends with
    'ID' (case-insensitive) exists, or if an expected output column is missing.
    """
    parent_id_candidates = [
        c for c in df.columns
        if isinstance(c, str) and c.upper().startswith('PARENT') and c.upper().endswith('ID')
    ]
    if not parent_id_candidates:
        raise KeyError('No parent id column found in: {}'.format(list(df.columns)))

    # Rename whichever spelling of the parent id column was detected, so the
    # 'Parent_ID' lookups below cannot miss on capitalisation.
    renames = dict(METRIC_COLUMN_RENAMES)
    renames[parent_id_candidates[0]] = 'Parent_ID'
    df = df.rename(columns=renames)

    # Convert Parent_ID to string to preserve leading zeros.
    df['Parent_ID'] = df['Parent_ID'].astype(str)

    # Keep only the metric name: drop everything after the last '-' (the parent
    # id suffix), then normalise case and whitespace. `n` is passed by keyword
    # because pandas 2.x no longer accepts it positionally, and `.str[0]` also
    # copes with values that contain no '-'.
    df['Metric'] = df['Metric'].str.rsplit('-', n=1).str[0].str.upper().str.strip()

    return df[METRIC_OUTPUT_COLUMNS]


for f in files:
    if not any(token in f for token in CALCULATOR_TOKENS):
        continue

    # Extract year and year index from the file name.
    year, year_index = find_year_name(f)
    # Extract period name and period name index from the file name.
    period_nm, period_index = find_period_name(f, year_index)

    if _is_skipped_140211_file(f, year, period_nm):
        print(f)
        print(year)
        print('Delnor passed')
        continue

    cohort_nm = _cohort_for_file(f, year)

    df = extract_metric_ranks(f)
    file_counter += 1
    # Tag every row with the calculator year, period and cohort.
    df = df.assign(YEAR=year, PERIOD=period_nm, COHORT=cohort_nm)
    print(df.shape)

    dataframe_list.append(_standardize_metric_frame(df))
  
# Second pass over the calculators: collect each file's metric/domain table,
# stack them and keep the distinct rows.
metric_domain_list = []
for f in files:
    if not any(tok in f for tok in ('140130', '140281', '140242', '141340', 'GreatState', '140211')):
        continue
    metric_and_domain = open_calc_measure_weight(f)
    metric_and_domain['Metric'] = metric_and_domain['Metric'].str.upper().str.strip()
    metric_domain_list.append(metric_and_domain)

metric_and_domain_result_df = pd.concat(metric_domain_list).drop_duplicates()

# Stack the per-file metric frames and left join the metric/domain lookup on Metric.
result_df = pd.concat(dataframe_list, sort=True).merge(
    metric_and_domain_result_df,
    how="left",
    on=["Metric"],
)
result_df = result_df[[
    'COHORT', 'Metric', 'Domain', 'PERIOD', 'Parent_ID',
    'YEAR', 'metric_score', 'metric_value', 'z',
]]

# Ask for the output folder, make it the working directory and write the
# workbook there. Note that `path` is rebound to the output folder too.
result_path = os.path.abspath(input('Enter path of folder to store final results.'))
path = result_path
os.chdir(result_path)
result_df.to_excel(
    'all_period_vizient_metric_values_02.06.2023.xlsx',
    sheet_name='all_period_vizient_metric_value',
    index=False,
)
print(file_counter)

Enter path of original Vizient Calculator folders.P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\original
QACALCULATOR_140130_PERIOD 1_2019.XLSM
Index([        'Metric',      'Parent_ID',           'rate',              'N',
               'Eq_Den',              'z',          'score',              'p',
       'overall_weight',             None,             None,             None,
                   None],
      dtype='object')
(6785, 8)
QACALCULATOR_140211_PERIOD 1_2019.XLSM
P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\original\2019\period1_original\QACalculator_140211_Period 1_2019.xlsm
2019
Delnor passed
QACALCULATOR_140242_PERIOD 1_2019.XLSM
Index([        'Metric',      'Parent_ID',           'rate',              'N',
               'Eq_Den',              'z',          'score',              'p',
       'overall_weight',             None,             None,             None,
                   None],
      dtype='object')

Index([                  'Metric',                'Parent_ID',
                           'rate',                        'z',
                 '% Domain/score',                   'Eq_Den',
                              'p', '% Overall/Overall Weight',
                 'transformation',              'shift_value',
                        'keyword',                       None,
                 'Equity p Value'],
      dtype='object')
(14016, 8)
CRITICAL_ACCESS_QACALCULATOR_141340_PERIOD1_2020.XLSM
Index([                   'Metric',                 'Parent_ID',
                    'rate/result2',            'Interpretation',
                           'blank',                         'z',
                        '% Domain',                     'blank',
                          '_score', '% overall/wt_metric_score',
                   'Metric_Weight',              '%Domain Calc',
                  'Equity p Value',                        None,
                              None,         

Index([                  'Metric',                'Parent_ID',
                           'rate',                        'z',
                 '% Domain/score',                   'Eq_Den',
                              'p', '% Overall/Overall Weight',
                 'transformation',              'shift_value',
                        'keyword',                       None,
                 'Equity p Value'],
      dtype='object')
(14600, 8)
CRITICALACCESSQACALCULATOR_141340_PERIOD1_2021.XLSM
Index([        'metric',      'parent_id',           'rate',              'z',
                'score', 'overall_weight', 'transformation',    'shift_value',
       'interpretation',        'keyword',             None,             None,
                   None,             None,             None,             None,
                   None,             None,             None,             None,
                   None,             None,             None,             None,
                   None,   

CRITICALACCESSQACALCULATOR_141340_ANNUAL_2021.XLSM
Index([        'metric',      'parent_id',           'rate',              'z',
                'score', 'overall_weight', 'transformation',    'shift_value',
       'interpretation',        'keyword',             None,             None,
                    nan,             None,             None,             None,
                   None,             None,             None,             None,
                   None,             None,             None,             None,
                   None,             None,             None,             None,
                   None,             None,             None,             None],
      dtype='object')
(1919, 8)
QACALCULATOR_140130_ANNUAL_2021.XLSM
Index([                  'Metric',                'Parent_ID',
                           'rate',                        'z',
                 '% Domain/score',                   'Eq_Den',
                              'p', '% Overall/Overall Weig

Index([                  'Metric',                'Parent_ID',
                           'rate',                        'z',
                 '% Domain/score',                   'Eq_Den',
                              'p', '% Overall/Overall Weight',
                 'transformation',              'shift_value',
                        'keyword',                       None,
                 'Equity p Value'],
      dtype='object')
(18081, 8)
QACALCULATOR_140281_PERIOD2_2022.XLSM
Index([                  'Metric',                'Parent_ID',
                           'rate',                        'z',
                 '% Domain/score',                   'Eq_Den',
                              'p', '% Overall/Overall Weight',
                 'transformation',              'shift_value',
                        'keyword',                       None,
                 'Equity p Value'],
      dtype='object')
(14994, 8)
CRITICALACCESSQACALCULATOR_141340_PERIOD3_2022.XLSM
Index([    '592 

74


# Join the Hospital Ranks Dataset with the Hospital Metrics Dataset 

In [81]:
# Locations of the two workbooks to join: hospital-level rankings and metric-level values.
# NOTE(review): these file names differ from the ones written by the export cells in this
# notebook ('all_period_vizient_rankings_07.11.2023.xlsx' and
# 'all_period_vizient_metric_values_02.06.2023.xlsx') - confirm they point at the intended versions.
path_hospital_ranks = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\original_unlocked\all_period_rankings\all_period_vizient_rankings.xlsx"
path_metric_values = r"P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\original_unlocked\all_period_metric_values\all_period_vizient_metric_values_08.04.2021.xlsx"

In [82]:
# Load both workbooks. Parent_ID is converted with str so it stays text
# (elsewhere this notebook keeps it as a string to preserve leading zeros).
df_hosp_ranks = pd.read_excel(
    path_hospital_ranks,
    sheet_name='all_period_vizient_rankings_up_',
    engine='openpyxl',
    converters={'Parent_ID': str},
)
df_metric_vals = pd.read_excel(
    path_metric_values,
    sheet_name='all_period_vizient_metric_value',
    engine='openpyxl',
    converters={'Parent_ID': str},
)

In [83]:
# Preview the first rows of the hospital rankings data.
df_hosp_ranks.head()

Unnamed: 0,COHORT,Eff_Score,HCO_SHORT_NAME,Mort_score,PERIOD,Parent_ID,Patct_Rank,Safety_score,YEAR,eff_Rank,...,mort_Rank,mort_wt_score,patct_score,patct_wt_score,safety_Rank,safety_wt_score,top_decile_rank,top_quartile_rank,top_decile_hosp,top_quartile_hosp
0,Complex Care Medical Center,0.628621,BANNERHEALTH_SOUTH,0.631739,PERIOD1,30111,82.0,0.47198,2019,36.0,...,58.0,0.166147,0.301269,0.047601,94,0.124131,12,29,0,0
1,Complex Care Medical Center,0.025043,LA_CO_OLIVEVIEW,0.592626,PERIOD1,50040,43.0,0.400093,2019,113.0,...,67.0,0.155861,0.543275,0.085837,100,0.105224,12,29,0,0
2,Complex Care Medical Center,0.636605,UCHEALTH_POUDREVALLEY,0.614588,PERIOD1,60010,40.0,0.684623,2019,34.0,...,62.0,0.161637,0.576086,0.091022,27,0.180056,12,29,0,0
3,Complex Care Medical Center,0.981948,UCHEALTH_YAMPA-VALLEY,0.619545,PERIOD1,60049,,0.515273,2019,2.0,...,61.0,0.16294,0.0,0.0,85,0.135517,12,29,0,0
4,Complex Care Medical Center,0.696609,UCHEALTH_LONGSPEAK,0.937814,PERIOD1,60128,,0.952567,2019,27.0,...,12.0,0.246645,0.0,0.0,1,0.250525,12,29,1,1


In [84]:
# Preview the first rows of the metric values data.
df_metric_vals.head()

Unnamed: 0,COHORT,Metric,Domain,PERIOD,Parent_ID,YEAR,metric_score,metric_value,z
0,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,30111,2019,0.00177992,58.54166667,-1.27368
1,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,50040,2019,0.00203199,59.06666667,-1.19652
2,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,60010,2019,0.00942951,67.84166667,0.0931903
3,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,60049,2019,0.0,missing,
4,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,60128,2019,0.0,missing,


In [85]:
# Left join: keep every metric row and attach that hospital's rankings for the same period and year.
metrics_joined_with_ranks_df = df_metric_vals.merge(
    df_hosp_ranks,
    how="left",
    on=["Parent_ID", "PERIOD", "YEAR"],
)

In [86]:
# COHORT exists in both inputs, so the merge suffixed it; discard the rankings-side copy.
metrics_joined_with_ranks_df = metrics_joined_with_ranks_df.drop(columns=['COHORT_y'])

In [87]:
# Restore the plain COHORT name on the metrics-side column.
metrics_joined_with_ranks_df = metrics_joined_with_ranks_df.rename(
    {'COHORT_x': "COHORT"},
    axis=1,
)

In [88]:
# Turn the placeholder strings found in metric_value into NaN.
metrics_joined_with_ranks_df['metric_value'] = metrics_joined_with_ranks_df['metric_value'].replace(
    dict.fromkeys(['missing', 'LV', '-', '.'], np.nan)
)

In [89]:
# Turn the placeholder strings found in metric_score into NaN.
metrics_joined_with_ranks_df['metric_score'] = metrics_joined_with_ranks_df['metric_score'].replace(
    dict.fromkeys(['missing', 'LV', '-', '.'], np.nan)
)

In [90]:
# Preview the joined data after the placeholder clean-up.
metrics_joined_with_ranks_df.head()

Unnamed: 0,COHORT,Metric,Domain,PERIOD,Parent_ID,YEAR,metric_score,metric_value,z,Eff_Score,...,mort_Rank,mort_wt_score,patct_score,patct_wt_score,safety_Rank,safety_wt_score,top_decile_rank,top_quartile_rank,top_decile_hosp,top_quartile_hosp
0,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,30111,2019,0.00178,58.54166667,-1.27368,0.628621,...,58.0,0.166147,0.301269,0.047601,94,0.124131,12,29,0,0
1,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,50040,2019,0.002032,59.06666667,-1.19652,0.025043,...,67.0,0.155861,0.543275,0.085837,100,0.105224,12,29,0,0
2,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,60010,2019,0.00943,67.84166667,0.0931903,0.636605,...,62.0,0.161637,0.576086,0.091022,27,0.180056,12,29,0,0
3,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,60049,2019,0.0,,,0.981948,...,61.0,0.16294,0.0,0.0,85,0.135517,12,29,0,0
4,Complex Care Medical Center,CLEANQUIET,Patient Centeredness,PERIOD1,60128,2019,0.0,,,0.696609,...,12.0,0.246645,0.0,0.0,1,0.250525,12,29,1,1


In [91]:
# Replace a lone '.' in metric_score with NaN.
# NOTE(review): the earlier metric_score replacement mapping in this notebook already
# includes '.', so this cell looks redundant - confirm before removing.
metrics_joined_with_ranks_df['metric_score'] = metrics_joined_with_ranks_df['metric_score'].replace('.', np.nan)

In [92]:
# Cast metric_value to a numeric dtype; any remaining non-numeric text raises here.
metrics_joined_with_ranks_df['metric_value'] = (
    metrics_joined_with_ranks_df['metric_value'].pipe(pd.to_numeric)
)

In [93]:
# Cast metric_score to a numeric dtype; any remaining non-numeric text raises here.
metrics_joined_with_ranks_df['metric_score'] = (
    metrics_joined_with_ranks_df['metric_score'].pipe(pd.to_numeric)
)

In [95]:
# Prompt for the folder the joined workbook is written to. The reply is used as typed
# (no abspath/normalisation is applied here, unlike the earlier export cells).
path_to_store_joined_file = input('Where to boss?')

Where to boss?P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\calculator data\original_unlocked\all_metrics_joined_with_rankings


In [97]:
# Write the joined metrics + rankings data to the 'data' sheet of the output workbook.
# NOTE(review): unlike the earlier exports in this notebook, index=False is not passed here,
# so the dataframe index is presumably written as an extra leading column - confirm that is wanted.
metrics_joined_with_ranks_df.to_excel(os.path.join(path_to_store_joined_file,'all_metrics_joined_with_rankings.xlsx'),sheet_name = 'data')