In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions
import math

#libraries for interacting with dataframes in python
import pandas as pd
import numpy as np
#library for manipulating Excel files.
from openpyxl import load_workbook
import openpyxl
import pyodbc
import os
from itertools import islice
import glob
import time
import shutil
import re
import urllib

### Updating this set of functions to handle new FY21 metrics and Critical Access calculators

#### include required functions

In [2]:
def _strip_cohort_punctuation(frame):
    """Remove literal commas and periods from the 'Cohort' column of `frame` and return it."""
    # regex=False makes the '.' removal explicitly literal regardless of the
    # pandas version's default for `regex`.
    frame['Cohort'] = (frame['Cohort']
                       .str.replace(',', '', regex=False)
                       .str.replace('.', '', regex=False))
    return frame


def _cohort_frame_from_hospital_sheet(ws):
    """Build a ['Hospital', 'Cohort'] frame from a sheet whose first column is the hospital label.

    The first worksheet row is used as the header; the first column of every
    remaining row becomes both the index and the 'Hospital' column.
    """
    rows = ws.values
    cols = next(rows)[1:]
    rows = list(rows)
    idx = [r[0] for r in rows]
    df = pd.DataFrame([r[1:] for r in rows], index=idx, columns=cols)
    df['Hospital'] = df.index
    return pd.DataFrame(df, columns=['Hospital', 'Cohort'])


def _frame_from_id_and_name_sheet(ws, leading_rows_to_skip=0):
    """Build a frame from a sheet whose first two columns are concatenated into 'Hospital'.

    `leading_rows_to_skip` rows are discarded before the header row is read.
    The 'Hospital' value is str(col0) + ' ' + str(col1) and is also used as the index.
    All original sheet columns are kept after the new 'Hospital' column.
    """
    rows = ws.values
    for _ in range(leading_rows_to_skip):
        next(rows)
    cols = ('Hospital',) + tuple(next(rows))
    records = [(str(r[0]) + ' ' + str(r[1]),) + tuple(r) for r in rows]
    idx = [rec[0] for rec in records]
    df = pd.DataFrame(records, index=idx, columns=cols)
    df['Hospital'] = df.index
    return df


def gather_cohort_data():
    """Prompt for the Vizient cohort folder and return a Hospital/Cohort dataframe.

    The user is asked for a folder path.  The working directory is changed to
    that folder, and the folder must contain exactly one file whose name
    contains both 'QA_' and '_Cohorts.xlsx' (files starting with '~' are
    ignored).  Worksheets whose name contains 'Q&A' and 'Cohorts', or equals
    'Crit Acc. & Small Comm', are parsed:

    * one matching sheet, or a cohort sheet with a 'Hospital' header: the first
      column is taken as the hospital label;
    * otherwise: the first two columns of the cohort sheet are concatenated
      into 'Hospital', and the critical access sheet (first row skipped, no
      cohort column) is appended with Cohort set to
      'Critical Access & Small Community'.

    Returns:
        pandas.DataFrame with columns ['Hospital', 'Cohort'] (commas and periods
        removed from Cohort), or None after printing a message when anything
        goes wrong.
    """
    try:
        wd = input(
            'Enter file path for Vizient cohort file (File Should be like this: QA_Calculator_..._Cohorts.xlsx).')
        # Normalise once and reuse the absolute path everywhere; listing the raw
        # input after chdir would resolve a relative path against the new cwd.
        cohort_dir = os.path.abspath(wd)
        os.chdir(cohort_dir)
    except Exception as err:
        print('Something is wrong with cohort file path.', err)
        return

    try:
        # find the cohort excel file.  The file name should have QA_ and _Cohorts.xlsx in the name string.
        cohort_file_list = [file for file in os.listdir(cohort_dir) if
                            ('QA_' in file and '_Cohorts.xlsx' in file) and not (file.startswith('~'))]
    except OSError as err:
        print('No cohort file found.', err)
        return

    # There should be exactly one cohort file in the directory
    if len(cohort_file_list) > 1:
        print('More than one cohort list file.  There should only be one.')
        return
    if len(cohort_file_list) == 0:
        print('No cohort list files in this directory.  Check for empty directory or check file name.')
        return

    try:
        wb = openpyxl.load_workbook(filename=os.path.join(cohort_dir, cohort_file_list[0]), data_only=True)
    except Exception as err:
        print('Could not load workbook', err)
        return

    # The sheet names change each period and the critical access cohort lives on
    # a separate sheet, so the worksheets have to be discovered by name pattern.
    cohort_sheet_list = [sheet for sheet in wb.sheetnames if
                         ('Q&A' in sheet and 'Cohorts' in sheet) or (sheet == 'Crit Acc. & Small Comm')]

    if len(cohort_sheet_list) == 0:
        print('Did not find any worksheets.')
        return
    if len(cohort_sheet_list) > 2:
        # Previously this case fell through and returned None without any message.
        print('Found more than two cohort worksheets; expected at most two:', cohort_sheet_list)
        return

    try:
        # if there is only one sheet, it likely lacks the critical access sheet.
        if len(cohort_sheet_list) == 1:
            return _strip_cohort_punctuation(_cohort_frame_from_hospital_sheet(wb[cohort_sheet_list[0]]))

        # Two sheets: work out which one is the period cohort sheet and which is
        # the critical access sheet (both default to the first entry).
        cohort_sheet_name = cohort_sheet_list[0]
        crit_sheet_name = cohort_sheet_list[0]
        for sheet_name in cohort_sheet_list:
            if 'Q&A' in sheet_name and 'Cohorts' in sheet_name:
                cohort_sheet_name = sheet_name
            if sheet_name == 'Crit Acc. & Small Comm':
                crit_sheet_name = sheet_name

        ws = wb[cohort_sheet_name]
        header = list(next(ws.values))

        # A 'Hospital' header means the hospital label is already one column.
        if 'Hospital' in header:
            return _strip_cohort_punctuation(_cohort_frame_from_hospital_sheet(ws))

        # Otherwise there is a Medicare ID column and a short name column
        # instead of a concatenated 'Hospital' column.
        print('must by FY2020')
        df = pd.DataFrame(_frame_from_id_and_name_sheet(ws), columns=['Hospital', 'Cohort'])

        # The critical access sheet has one extra row above its header and no
        # cohort column, so the cohort name is filled in as a constant.
        df_crit = _frame_from_id_and_name_sheet(wb[crit_sheet_name], leading_rows_to_skip=1)
        df_crit['Cohort'] = 'Critical Access & Small Community'
        df_crit = pd.DataFrame(df_crit, columns=['Hospital', 'Cohort'])

        return _strip_cohort_punctuation(pd.concat([df, df_crit]))
    except Exception as err:
        print('Something went wrong reading the cohort worksheets:', err)
        return

In [3]:
def gather_hyperlink_files():
    """Prompt for the hyperlink folder and list its '_links.xlsx' workbooks.

    The working directory is changed to the entered folder (later reads open
    the files by bare name, so this side effect is kept).

    Returns:
        (files, wd) where `files` is the sorted list of file names containing
        '_links.xlsx' and `wd` is the path exactly as the user typed it;
        None (after printing a message) if the folder cannot be entered.
    """
    try:
        wd = input('Enter file path for hospital hyperlink folder (template_hyperlinks).')
        # change directory to directory with file.  abspath function normalizes the directory path.
        os.chdir(os.path.abspath(wd))
    except Exception as err:
        print('Something is wrong with Vizient hyperlink excel file path.', err)
        return
    # Keep only '_links.xlsx' workbooks.  Names starting with '~' are skipped so
    # that Excel lock files (e.g. '~$cdh_links.xlsx') left by an open workbook
    # are not picked up, matching the cohort-file filter used in this notebook.
    files = sorted(name for name in os.listdir(os.curdir)
                   if '_links.xlsx' in name and not name.startswith('~'))
    return(files,wd)

In [4]:
def get_report_template_links_orig():
    """Load every hospital hyperlink workbook and return the unique report templates.

    Prompts (via gather_hyperlink_files) for the hyperlink folder, reads
    "Sheet1" of each '_links.xlsx' file, drops rows whose 'Hyperlink' is 0 or
    null, casts JobStoreID/ReportID to int, de-duplicates, and upper-cases
    'Keyword/Metric'.

    Returns:
        (hyperlinks, folder) where `hyperlinks` is the combined dataframe and
        `folder` is the path returned by gather_hyperlink_files; None (after
        printing a message) when the folder could not be read or holds no
        hyperlink files.
    """
    #find the folder with Vizient calculator template hyperlinks and put file names in a list
    try:
        file_names = gather_hyperlink_files()
    except Exception as err:
        print('Problem gathering hyperlink files.', err)
        return
    # gather_hyperlink_files reports failure by returning None rather than raising.
    if file_names is None:
        print('Problem gathering hyperlink files.')
        return
    link_files, link_folder = file_names
    if not link_files:
        print('No _links.xlsx files found in the hyperlink folder.')
        return

    template_columns = ['Hospital','Formal Name','Hyperlink','JobStoreID','ReportID','AdjustmentModel','AHRQ Version','Keyword/Metric','Domain']
    #read measure name & hyperlink from each file, then concatenate once instead of growing a frame in the loop.
    per_file_frames = [
        pd.DataFrame(pd.read_excel(item,sheet_name="Sheet1",engine='openpyxl'),columns=template_columns)
        for item in link_files
    ]
    hyperlinks = pd.concat(per_file_frames)
    #Remove any zero rows or null rows.  These are measures without a report template link.
    hyperlinks = hyperlinks[hyperlinks['Hyperlink'] != 0]
    # .copy() so the column assignments below act on an independent frame.
    hyperlinks = hyperlinks[hyperlinks['Hyperlink'].notnull()].copy()
    #Convert JobStoreID and ReportID back to integer to remove decimal point.
    hyperlinks[["JobStoreID", "ReportID"]] = hyperlinks[["JobStoreID", "ReportID"]].astype(int)
    #Remove duplicates.  Only require a unique list of report templates for all hospitals
    hyperlinks = hyperlinks.drop_duplicates()

    #UL003
    #Keyword/Metric is not all caps in every file, so normalise the column to upper case.
    hyperlinks['Keyword/Metric'] = hyperlinks['Keyword/Metric'].str.upper()
    return(hyperlinks,link_folder)

In [5]:
def get_report_template_links():
    """Load every hospital hyperlink workbook and return the unique report templates.

    Prompts (via gather_hyperlink_files) for the hyperlink folder, reads
    "Sheet1" of each '_links.xlsx' file, drops rows whose 'Hyperlink' is 0 or
    null, casts JobStoreID/ReportID to int, de-duplicates, and upper-cases
    'Keyword/Metric'.

    Returns:
        The combined dataframe, or None (after printing a message) when the
        folder could not be read or holds no hyperlink files.
    """
    #find the folder with Vizient calculator template hyperlinks and put file names in a list
    try:
        file_names = gather_hyperlink_files()
    except Exception as err:
        print('Problem gathering hyperlink files.', err)
        return
    # gather_hyperlink_files reports failure by returning None rather than raising.
    if file_names is None:
        print('Problem gathering hyperlink files.')
        return
    print(os.getcwd())
    print('file names:',file_names)
    link_files = file_names[0]
    if not link_files:
        print('No _links.xlsx files found in the hyperlink folder.')
        return

    template_columns = ['Hospital','Formal Name','Hyperlink','JobStoreID','ReportID','AdjustmentModel','AHRQ Version','Keyword/Metric','Domain']
    #read measure name & hyperlink from each file, then concatenate once instead of growing a frame in the loop.
    per_file_frames = [
        pd.DataFrame(pd.read_excel(item,sheet_name="Sheet1",engine='openpyxl'),columns=template_columns)
        for item in link_files
    ]
    hyperlinks = pd.concat(per_file_frames)
    #Remove any zero rows or null rows.  These are measures without a report template link.
    hyperlinks = hyperlinks[hyperlinks['Hyperlink'] != 0]
    # .copy() so the column assignments below act on an independent frame.
    hyperlinks = hyperlinks[hyperlinks['Hyperlink'].notnull()].copy()
    #Convert JobStoreID and ReportID back to integer to remove decimal point.
    hyperlinks[["JobStoreID", "ReportID"]] = hyperlinks[["JobStoreID", "ReportID"]].astype(int)
    hyperlinks = hyperlinks.drop_duplicates()

    #UL003
    #Keyword/Metric is not all caps in every file, so normalise the column to upper case.
    hyperlinks['Keyword/Metric'] = hyperlinks['Keyword/Metric'].str.upper()

    return(hyperlinks)

In [6]:
def create_hyperlink_dict_wrangle(merged_df):
    """Build a {(Cohort, Keyword/Metric): (Cohort, Domain, Formal Name, Keyword/Metric)} lookup.

    Only the four columns named above are read from `merged_df`; duplicate
    rows over those four columns are dropped before the dictionary is built.
    """
    wanted_columns = ['Cohort','Formal Name','Keyword/Metric','Domain']
    unique_rows = pd.DataFrame(merged_df,columns=wanted_columns).drop_duplicates()

    cohort = unique_rows['Cohort']
    domain = unique_rows['Domain']
    formal_name = unique_rows['Formal Name']
    keyword = unique_rows['Keyword/Metric']

    # Object arrays of tuples: one payload tuple and one key tuple per row.
    payloads = pd.Series(list(zip(cohort,domain,formal_name,keyword)),dtype=object).values
    lookup_keys = pd.Series(list(zip(cohort,keyword)),dtype=object).values

    return(pd.Series(payloads,index=lookup_keys).to_dict())

### First part of vizient_data_folder_walker_and_prep_for_db_inserts()

In [7]:
# step 1:  Ask for the folder to parse and the folder that receives the final files.
wd = input('Enter the path of the folder you want to parse.')
wd = os.path.abspath(wd)
# Set the path of the folder structure where we want the final files
wd_dest = input('Enter the path of the folder you want to final files to go in.')
wd_dest = os.path.abspath(wd_dest)

# step 2:  Import cohort data from the Vizient documentation file.
cohort_helper_df = gather_cohort_data()
# The helper prints a message and returns None on failure; stop here with a
# clear error instead of letting pd.merge fail on a None argument.
if cohort_helper_df is None:
    raise RuntimeError('Cohort data could not be loaded; fix the cohort folder and re-run this cell.')

# step 3:  Import the Vizient template hyperlink file and generate a helper dictionary from it.
hyperlinks_helper_df = get_report_template_links()
if hyperlinks_helper_df is None:
    raise RuntimeError('Hyperlink data could not be loaded; fix the hyperlink folder and re-run this cell.')

merged_hyperlink_helper_df = pd.merge(cohort_helper_df, hyperlinks_helper_df, on='Hospital')
wrangled_hyperlinks_dict = create_hyperlink_dict_wrangle(merged_hyperlink_helper_df)

# Check which type of sum the client wants for edac and readmission values
edac_readm_sum_option = 'not answered'

Enter the path of the folder you want to parse.P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\fy23 scraped cdb data\010_june_fytd\OP Revisits with Denom
Enter the path of the folder you want to final files to go in.P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\file_parsing_output
Enter file path for Vizient cohort file (File Should be like this: QA_Calculator_..._Cohorts.xlsx).P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\cohort data\2022\calc_3_cohorts


  warn(msg)


must by FY2020
Enter file path for hospital hyperlink folder (template_hyperlinks).P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\hyperlink data\2022\period3_template_hyperlinks
P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\hyperlink data\2022\period3_template_hyperlinks
file names: (['cdh_links.xlsx', 'dch_links.xlsx', 'hh_links.xlsx', 'kish_links.xlsx', 'lfh_links.xlsx', 'mch_links.xlsx', 'nmh_links.xlsx', 'palos_links.xlsx', 'vwh_links.xlsx'], 'P:\\Datastore02\\Analytics\\230 Inpatient Quality Composite\\data\\hyperlink data\\2022\\period3_template_hyperlinks')


In [8]:
# Inspect the (Cohort, Keyword/Metric) -> (Cohort, Domain, Formal Name, Keyword/Metric) lookup.
wrangled_hyperlinks_dict

{('Comprehensive Academic Medical Center',
  'DCOST_CARD'): ('Comprehensive Academic Medical Center', 'Efficiency', 'DCOST O/E - Cardiology', 'DCOST_CARD'),
 ('Comprehensive Academic Medical Center',
  'DCOST_CT'): ('Comprehensive Academic Medical Center', 'Efficiency', 'DCOST O/E - CT Surgery', 'DCOST_CT'),
 ('Comprehensive Academic Medical Center',
  'DCOST_ENT'): ('Comprehensive Academic Medical Center', 'Efficiency', 'DCOST O/E - Otolaryngology', 'DCOST_ENT'),
 ('Comprehensive Academic Medical Center',
  'DCOST_GASTRO'): ('Comprehensive Academic Medical Center', 'Efficiency', 'DCOST O/E - Gastroenterology', 'DCOST_GASTRO'),
 ('Comprehensive Academic Medical Center',
  'DCOST_GYN'): ('Comprehensive Academic Medical Center', 'Efficiency', 'DCOST O/E - Gynecology', 'DCOST_GYN'),
 ('Comprehensive Academic Medical Center',
  'DCOST_MED'): ('Comprehensive Academic Medical Center', 'Efficiency', 'DCOST O/E - Medicine General', 'DCOST_MED'),
 ('Comprehensive Academic Medical Center',
  'DC

In [9]:
# Check which type of sum the client wants for edac and readmission values
edac_readm_sum_option = 'not answered'
# Always (re)define the month list so it exists, and is never stale from an
# earlier run, even when option 1 is chosen.
edac_readm_month_list = []
while edac_readm_sum_option not in [1, 2]:
    try:
        edac_readm_sum_option = int(input(
            "EDAC and Readmission values were pulled by quarter.  Do you want the sum of all months (choose 1) or do you want the sum of specific months? (choose 2)"))
    except ValueError:
        # Only a non-integer answer is retried; a bare except here would also
        # swallow KeyboardInterrupt and make the loop impossible to interrupt.
        print('Not an integer.')
    if edac_readm_sum_option not in [1, 2]:
        print('Please choose 1 or 2')

# if choice 2, then get a list of months to sum up.
if edac_readm_sum_option == 2:
    print(
        'You chose "sum of specific months" (choice 2).  Please add all months in the quarter date range you would like to sum for EDAC and READMISSIONS.')
    # Raw string avoids the invalid '\d' escape; fullmatch rejects trailing
    # characters and the month group rejects values outside 01-12.
    month_pattern = re.compile(r'\d{4}-(0[1-9]|1[0-2])')
    while True:
        month = input(
            'ADD Month with "YYYY-MM" format.  Type "STOP" to stop adding months to custom month list.').strip()
        if month.upper() == 'STOP':
            print('Stopping...')
            print(edac_readm_month_list)
            break
        if month_pattern.fullmatch(month) is not None:
            print('Matches Format.  Added to month list')
            edac_readm_month_list.append(month)
        else:
            print('That does not match "YYYY-MM" format.  Try again.')

EDAC and Readmission values were pulled by quarter.  Do you want the sum of all months (choose 1) or do you want the sum of specific months? (choose 2)2
You chose "sum of specific months" (choice 2).  Please add all months in the quarter date range you would like to sum for EDAC and READMISSIONS.
ADD Month with "YYYY-MM" format.  Type "STOP" to stop adding months to custom month list.2022-07
Matches Format.  Added to month list
ADD Month with "YYYY-MM" format.  Type "STOP" to stop adding months to custom month list.2022-08
Matches Format.  Added to month list
ADD Month with "YYYY-MM" format.  Type "STOP" to stop adding months to custom month list.2022-09
Matches Format.  Added to month list
ADD Month with "YYYY-MM" format.  Type "STOP" to stop adding months to custom month list.2022-10
Matches Format.  Added to month list
ADD Month with "YYYY-MM" format.  Type "STOP" to stop adding months to custom month list.2022-11
Matches Format.  Added to month list
ADD Month with "YYYY-MM" format.

In [10]:
# Show the months collected for the custom EDAC / readmission sum.
edac_readm_month_list

['2022-07',
 '2022-08',
 '2022-09',
 '2022-10',
 '2022-11',
 '2022-12',
 '2023-01',
 '2023-02',
 '2023-03',
 '2023-04']

In [11]:
# Set counters which will act as proxy indexes in order to append rows to pandas dataframes
CAMC_counter = 0
LSCCMC_counter = 0
CCMC_counter = 0
COMM_counter = 0
#UL003
CASC_counter= 0

CAMC_events_counter = 0
LSCCMC_events_counter = 0
CCMC_events_counter = 0
COMM_events_counter = 0
#UL003
CASC_events_counter = 0

CAMC_denom_counter = 0
LSCCMC_denom_counter = 0
CCMC_denom_counter = 0
COMM_denom_counter = 0
#UL003
CASC_denom_counter = 0

# Create empty dataframes to store final values.  First dataframe stores the unsorted values, the _final dataframe
# will store the sorted values and will be written to a csv file
CAMC_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'Metric Value'])
LSCCMC_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'Metric Value'])
CCMC_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'Metric Value'])
COMM_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'Metric Value'])
#UL003
CASC_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'Metric Value'])

CAMC_events_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'N Events', 'event_type_nm'])
LSCCMC_events_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'N Events', 'event_type_nm'])
CCMC_events_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'N Events', 'event_type_nm'])
COMM_events_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'N Events', 'event_type_nm'])
#UL003
CASC_events_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'N Events', 'event_type_nm'])

CAMC_denom_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'denominator', 'denominator_event_type_nm'])
LSCCMC_denom_df = pd.DataFrame(
    columns=['Hospital', 'Domain', 'Measure', 'denominator', 'denominator_event_type_nm'])
CCMC_denom_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'denominator', 'denominator_event_type_nm'])
COMM_denom_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'denominator', 'denominator_event_type_nm'])
#UL003
CASC_denom_df = pd.DataFrame(columns=['Hospital', 'Domain', 'Measure', 'denominator', 'denominator_event_type_nm'])


In [12]:
# Open a trusted connection to the SQL Server instance.  The server part is a
# raw string because it contains a backslash ('\E' is not a valid escape
# sequence in a normal string literal and triggers a warning).
conn = pyodbc.connect('Driver={SQL Server};'
                      r'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                      'Database=clarity;'
                      'Trusted_Connection=yes;')

In [13]:
current_cohort_calc_id = 15 #2019 Q&A calculator Period 3
# NOTE(review): the recorded inputs earlier in this notebook point at 2022 period 3
# cohort/hyperlink folders -- confirm that calc_id 15 is still the intended calculator.

# One query template for every cohort: only the cohort id (and its label, kept
# as a SQL comment) and the calculator id vary between cohorts.
_COHORT_HOSPITAL_QUERY_TEMPLATE = """
SELECT
distinct
concat(hospital_medicare_id,' ',hospital_name) as hospital_name
FROM
NM_Analytics_prototype.vizient_qa.hospitals as h
join NM_Analytics_Prototype.vizient_qa.calc_hospital_cohort as chc
on chc.hospital_id = h.hospital_id
where
chc.hospital_cohort_id = %d  --%s
and
chc.calc_id = %d
"""


def _cohort_hospital_query(cohort_id, cohort_label, calc_id):
    """Return the SQL listing '<medicare id> <name>' for every hospital in one cohort."""
    # %d forces both ids to be integers before they are placed in the SQL text.
    return _COHORT_HOSPITAL_QUERY_TEMPLATE % (cohort_id, cohort_label, calc_id)


# Query the NM_Analytics.vizient_qa database to grab all hospitals within each Vizient cohort.
camc_cohort_hosp = _cohort_hospital_query(1, 'Comprehensive Academic Medical Center', current_cohort_calc_id)
lsccmc_cohort_hosp = _cohort_hospital_query(2, 'Large Specialized Complex Care Medical Center', current_cohort_calc_id)
ccmc_cohort_hosp = _cohort_hospital_query(3, 'Complex Care Medical Center', current_cohort_calc_id)
comm_cohort_hosp = _cohort_hospital_query(4, 'Community', current_cohort_calc_id)
#UL003
casc_cohort_hosp = _cohort_hospital_query(5, 'Critical Access & Small Community', current_cohort_calc_id)

# Execute the queries and store the values in pandas dataframes.  The
# connection is closed even if one of the reads fails.
try:
    camc_cohort = pd.read_sql(camc_cohort_hosp, conn)
    lsccmc_cohort = pd.read_sql(lsccmc_cohort_hosp, conn)
    ccmc_cohort = pd.read_sql(ccmc_cohort_hosp, conn)
    comm_cohort = pd.read_sql(comm_cohort_hosp, conn)
    #UL003
    casc_cohort = pd.read_sql(casc_cohort_hosp, conn)
finally:
    conn.close()

# Convert the dataframes into lists in order to use the list in the dataframe filter functions
camc_cohort_list = camc_cohort['hospital_name'].values.tolist()
lsccmc_cohort_list = lsccmc_cohort['hospital_name'].values.tolist()
ccmc_cohort_list = ccmc_cohort['hospital_name'].values.tolist()
comm_cohort_list = comm_cohort['hospital_name'].values.tolist()
#UL003
casc_cohort_list = casc_cohort['hospital_name'].values.tolist()

### Query database and get OP Procedure Revisit measure IDs

In [14]:
# Look up the id and name of the four OP Procedure Revisit measures.
measure_ids = '''
SELECT
m.measure_id
,m.measure_name
from
NM_Analytics_Prototype.vizient_qa.measure as m
where
measure_id
IN
(
145	--OP Procedure Revisits - Arthroscopy
,146	--OP Procedure Revisits - Biliary
,147	--OP Procedure Revisits - Colonoscopy
,148	--OP Procedure Revisits - Urological
)
'''

# The server part is a raw string because it contains a backslash ('\E' is not
# a valid escape sequence in a normal string literal).
conn = pyodbc.connect('Driver={SQL Server};'
                      r'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                      'Database=clarity;'
                      'Trusted_Connection=yes;')
try:
    op_revisit_meas = pd.read_sql(measure_ids, conn)
finally:
    # Close the connection even when the query fails.
    conn.close()

In [15]:
# Show the measure ids / names returned by the OP Procedure Revisit query.
op_revisit_meas

Unnamed: 0,measure_id,measure_name
0,145,OP Procedure Revisits - Arthroscopy
1,146,OP Procedure Revisits - Biliary
2,147,OP Procedure Revisits - Colonoscopy
3,148,OP Procedure Revisits - Urological


### Query db and get all hospitals

In [16]:
# Fetch every distinct (hospital_id, hospital_medicare_id) pair.
all_hosp_query = """
SELECT
distinct
hospital_id
,hospital_medicare_id
FROM
NM_Analytics_prototype.vizient_qa.hospitals as h

"""

# The server part is a raw string because it contains a backslash ('\E' is not
# a valid escape sequence in a normal string literal).
conn = pyodbc.connect('Driver={SQL Server};'
                      r'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                      'Database=clarity;'
                      'Trusted_Connection=yes;')
try:
    all_hospitals = pd.read_sql(all_hosp_query, conn)
finally:
    # Close the connection even when the query fails.
    conn.close()

### Part 2:  Looping

In [17]:
def open_excel_file(path_obj, file_list_obj):
    """Read the first real workbook in `file_list_obj` into a dataframe.

    Args:
        path_obj: folder that contains the files.
        file_list_obj: list of file names in that folder.  Leading entries that
            start with '~' are removed from this list IN PLACE, so after the
            call `file_list_obj[0]` is the file that was actually opened.

    Returns:
        pandas.DataFrame parsed from the first worksheet, starting at the
        detected header row and limited to the columns populated in that row.

    Raises:
        ValueError: if no usable file remains, or no first column / header row
            can be located in the worksheet.
    """
    #drop 'ghost files' with ~ in front of them (in place; see docstring).
    while file_list_obj and file_list_obj[0].startswith('~'):
        del file_list_obj[0]
    if not file_list_obj:
        raise ValueError('No Excel file left to open in %s' % path_obj)
    #Create filename path in order to open the excel file
    file_loc = os.path.join(os.path.abspath(path_obj),file_list_obj[0])
    #save the excel workbook object in a variable
    wb = openpyxl.load_workbook(file_loc)
    #save the excel worksheet names in a variable
    wb_sheetnames = wb.sheetnames
    #take the first sheet in the workbook
    ws = wb[wb_sheetnames[0]]
    #find the first column number with a value in it
    first_col = find_first_ws_col(ws)
    print('first_col:',first_col)
    if first_col is None:
        raise ValueError('No populated column found in the first row of %s' % file_loc)
    #find the first row number holding one of the known header labels of the Vizient files
    header_row = find_ws_header_row(ws,first_col)
    print('header_row:',header_row)
    if header_row is None:
        raise ValueError('No header row found in %s' % file_loc)
    #create a list of all columns which have values
    populated_columns = find_ws_populated_cols(ws,header_row)
    print('pop columns',populated_columns)
    #parse the Excel spreadsheet to create a pandas dataframe
    xlsx_file = pd.read_excel(file_loc,sheet_name=wb_sheetnames[0],skiprows=header_row-1,usecols=populated_columns,engine='openpyxl')
    return(xlsx_file)

In [18]:
def find_first_ws_col(worksheet_var):
    """Return the 1-based number of the first column whose row-1 cell is not None.

    Returns None when every cell in row 1 is None.
    """
    candidate_columns = range(1, worksheet_var.max_column + 1)
    populated = (
        col_number
        for col_number in candidate_columns
        if worksheet_var.cell(row=1, column=col_number).value is not None
    )
    return next(populated, None)

In [19]:
def find_ws_header_row(worksheet_var, first_val_col):
    """Return the 1-based number of the first row whose cell in `first_val_col` is a header label.

    The recognised labels are 'Hospital', 'AHRQ Safety', 'Encounter Month' and
    'Hospital/ Hospital System'.  Returns None when no row matches.
    """
    header_labels = ('Hospital', 'AHRQ Safety', 'Encounter Month', 'Hospital/ Hospital System')
    last_row = worksheet_var.max_row
    row_number = 1
    while row_number <= last_row:
        cell_value = worksheet_var.cell(row=row_number, column=first_val_col).value
        if cell_value in header_labels:
            return row_number
        row_number += 1
    return None

In [20]:
def find_ws_populated_cols(worksheet_var,header_row):
    """Return the 0-based positions of the columns whose cell in `header_row` is not None."""
    return [
        col_number - 1
        for col_number in range(1, worksheet_var.max_column + 1)
        if worksheet_var.cell(row=header_row, column=col_number).value is not None
    ]

### TEST ###

In [21]:
# Combine the cohort hospital lists into two groups and de-duplicate each with set().
# A set does not preserve order, so the order of the resulting lists is arbitrary.
amc_risk_hospitals = list(set(camc_cohort_list + lsccmc_cohort_list))
comm_risk_hospitals = list(set(ccmc_cohort_list + comm_cohort_list + casc_cohort_list))

In [22]:
# Wrap each hospital list in a single-column ('hospital') dataframe.
amc_risk_df = pd.DataFrame(amc_risk_hospitals,columns=['hospital']) 
comm_risk_df = pd.DataFrame(comm_risk_hospitals,columns=['hospital']) 

In [23]:
# Preview: the text before the first space of each 'hospital' value.
amc_risk_df["hospital"].str.split(" ", n = 1, expand = True)[0]

0      390174
1      459984
2      050373
3      330306
4      220110
        ...  
226    050696
227    450820
228    520087
229    410007
230    110198
Name: 0, Length: 231, dtype: object

In [24]:
# Add a 'medicare_id' column: the text before the first space of 'hospital', stripped of whitespace.
# It is the join key used by inner_join_hospital_df.
amc_risk_df["medicare_id"]= amc_risk_df["hospital"].str.split(" ", n = 1, expand = True)[0].str.strip()
comm_risk_df["medicare_id"]= comm_risk_df["hospital"].str.split(" ", n = 1, expand = True)[0].str.strip()

In [25]:
def inner_join_hospital_df(input_df,amc_or_comm):
    """Keep only the rows of ``input_df`` whose medicare_id is on a risk-model roster.

    ``amc_or_comm`` equal to 'AMC' selects the module-level ``amc_risk_df``;
    any other value selects ``comm_risk_df``.  The result is the inner merge
    on 'medicare_id', so the roster's columns are appended to each kept row.
    """
    roster_df = amc_risk_df if amc_or_comm == 'AMC' else comm_risk_df
    return pd.merge(input_df, roster_df, how='inner', on='medicare_id')

In [26]:
# Echo the directory that the os.walk loop in the next cell traverses.
wd

'P:\\Datastore02\\Analytics\\230 Inpatient Quality Composite\\data\\fy23 scraped cdb data\\010_june_fytd\\OP Revisits with Denom'

In [27]:
# Walk every folder under `wd` and load each OP-revisit report into its own
# module-level dataframe named <measure>_<cohort>_<kind>, for example
# uri_amc_denom or chol_comm_num:
#   measure: uri / colon / arthro / chol   (token in the file name)
#   cohort : amc when 'AMC_' is in the file name, otherwise comm
#   kind   : denom when '_DENOM' is in the file name, otherwise num
# Later cells reference these sixteen names directly, so they are published
# through globals() rather than collected in a dict.

# File-name token -> variable prefix.  Order matters: the first token found wins.
_MEASURE_PREFIXES = (
    ('_URI_', 'uri'),
    ('_COLON_', 'colon'),
    ('_ARTHRO_', 'arthro'),
    ('_CHOL_', 'chol'),
)
_DENOMINATOR_COL = 'Cases'
_NUMERATOR_COL = 'Readmit Rate Num Cases (Readmit Cases)'
# Frames whose first rows are echoed after loading (the same spot checks as before).
_PREVIEWED_FRAMES = ('uri_amc_denom', 'colon_amc_denom', 'colon_amc_num')


def _hospital_value_frame(report_df, value_col, report_name):
    """Return a new frame with columns Hospital, ``value_col`` and medicare_id.

    The hospital column is read from 'Hospital'; when that selection raises
    KeyError the 'Hospital/ Hospital System' column is used instead and renamed
    to 'Hospital' (the file name and the first rows are printed in that case).
    medicare_id is the text before the first space of the trimmed label.
    """
    try:
        # .copy() so adding medicare_id below does not write into a slice of report_df
        trimmed = report_df[['Hospital', value_col]].copy()
        used_alt_header = False
    except KeyError:
        print(report_name)
        trimmed = report_df[['Hospital/ Hospital System', value_col]].copy()
        trimmed = trimmed.rename(columns={'Hospital/ Hospital System': 'Hospital'})
        used_alt_header = True
    # Labels may carry a leading space, e.g. ' 008000 AH_CELEBRATION' as well as
    # '008000 AH_CELEBRATION', hence the strip before splitting.
    trimmed['medicare_id'] = trimmed["Hospital"].str.strip().str.split(" ", n = 1, expand = True)[0].str.strip()
    if used_alt_header:
        print(trimmed.head())
    return trimmed


for path, dirs, files in os.walk(wd):
    print(len(files))
    # Only folders whose first file is a report workbook are processed.
    # NOTE(review): only files[0] is inspected; other files in the folder are
    # not examined here - confirm each folder holds a single report.
    if len(files) == 0 or not files[0].endswith('xlsx') or files[0] == 'op_revisits_file_validation.xlsx':
        continue

    print(path)
    print(files)
    excel_file_data = open_excel_file(path, files)
    report_name = files[0]

    if '_DENOM' in report_name:
        kind, value_col = 'denom', _DENOMINATOR_COL
    else:
        kind, value_col = 'num', _NUMERATOR_COL
    excel_file_data = _hospital_value_frame(excel_file_data, value_col, report_name)

    prefix = next((name for token, name in _MEASURE_PREFIXES if token in report_name), None)
    if prefix is None:
        # file name carries none of the known measure tokens: nothing to store
        continue

    if 'AMC_' in report_name:
        cohort, risk_model = 'amc', 'AMC'
    else:
        cohort, risk_model = 'comm', 'COMM'

    frame_name = '_'.join((prefix, cohort, kind))
    globals()[frame_name] = inner_join_hospital_df(excel_file_data, risk_model)
    if frame_name in _PREVIEWED_FRAMES:
        print(globals()[frame_name].head())
        
        
        

0
0
1
P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\fy23 scraped cdb data\010_june_fytd\OP Revisits with Denom\AMC_RISK_MODEL\READM_ARTHRO_DENOM
['AMC_READM_ARTHRO_CUSTOM_DENOM (4).xlsx']
first_col: 2
header_row: 37
pop columns [1, 2, 3, 4, 5, 6, 7, 8]
1
P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\fy23 scraped cdb data\010_june_fytd\OP Revisits with Denom\AMC_RISK_MODEL\READM_ARTHRO_NUM
['AMC_READM_ARTHRO_CUSTOM (4).xlsx']
first_col: 2
header_row: 43
pop columns [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1
P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\fy23 scraped cdb data\010_june_fytd\OP Revisits with Denom\AMC_RISK_MODEL\READM_CHOL_DENOM
['AMC_READM_CHOL_CUSTOM_DENOM (33).xlsx']
first_col: 2
header_row: 37
pop columns [1, 2, 3, 4, 5, 6, 7, 8]
1
P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\fy23 scraped cdb data\010_june_fytd\OP Revisits with Denom\AMC_RISK_MODEL\READM_CHOL_NUM
['AMC_READM_CHOL_CUSTOM (34).xlsx']
first_co

In [28]:
def left_join_df(left_df,right_df):
    """Left-merge ``right_df`` onto ``left_df`` using the 'medicare_id' column.

    Every row of ``left_df`` is kept; columns from ``right_df`` are empty (NaN)
    where no matching medicare_id exists.
    """
    merge_options = {'how': 'left', 'on': 'medicare_id'}
    return pd.merge(left_df, right_df, **merge_options)

### Left join the numerator df onto the denominator df (every denominator row is kept)

In [29]:
# For each measure, attach the numerator frame to the denominator frame
# (AMC first, then community); the denominator is the left side of every join.
uri_amc_df, uri_comm_df = (
    left_join_df(uri_amc_denom, uri_amc_num),
    left_join_df(uri_comm_denom, uri_comm_num),
)
colon_amc_df, colon_comm_df = (
    left_join_df(colon_amc_denom, colon_amc_num),
    left_join_df(colon_comm_denom, colon_comm_num),
)
arthro_amc_df, arthro_comm_df = (
    left_join_df(arthro_amc_denom, arthro_amc_num),
    left_join_df(arthro_comm_denom, arthro_comm_num),
)
chol_amc_df, chol_comm_df = (
    left_join_df(chol_amc_denom, chol_amc_num),
    left_join_df(chol_comm_denom, chol_comm_num),
)

In [30]:
# Spot check: community arthroscopy rows whose denominator-side label contains NORTHWESTERN.
is_northwestern = arthro_comm_df['Hospital_x'].str.contains("NORTHWESTERN")
arthro_comm_df[is_northwestern]

Unnamed: 0,Hospital_x,Cases,medicare_id,hospital_x,Hospital_y,Readmit Rate Num Cases (Readmit Cases),hospital_y
83,140062 NORTHWESTERN_PALOS,263,140062,140062 NORTHWESTERN_PALOS,140062 NORTHWESTERN_PALOS,3,140062 NORTHWESTERN_PALOS
86,140116 NORTHWESTERN_MCHENRY,399,140116,140116 NORTHWESTERN_MCHENRY,140116 NORTHWESTERN_MCHENRY,2,140116 NORTHWESTERN_MCHENRY
88,140130 NORTHWESTERN_LAKEFOREST,432,140130,140130 NORTHWESTERN_LAKEFOREST,140130 NORTHWESTERN_LAKEFOREST,5,140130 NORTHWESTERN_LAKEFOREST
93,140211 NORTHWESTERN_DELNOR,674,140211,140211 NORTHWESTERN_DELNOR,140211 NORTHWESTERN_DELNOR,2,140211 NORTHWESTERN_DELNOR
95,140286 NORTHWESTERN_KISH,202,140286,140286 NORTHWESTERN_KISH,140286 NORTHWESTERN_KISH,1,140286 NORTHWESTERN_KISH
97,141340 NORTHWESTERN_VALLEYW,14,141340,141340 VALLEY_WEST_COMMUNITY_HOSPITAL,141340 NORTHWESTERN_VALLEYW,0,141340 VALLEY_WEST_COMMUNITY_HOSPITAL
99,149916 NORTHWESTERN_HUNTLEY,380,149916,149916 NORTHWESTERN_HUNTLEY,149916 NORTHWESTERN_HUNTLEY,4,149916 NORTHWESTERN_HUNTLEY


In [31]:
# Spot check: AMC urological rows whose denominator-side label contains NORTHWESTERN.
is_northwestern = uri_amc_df['Hospital_x'].str.contains("NORTHWESTERN")
uri_amc_df[is_northwestern]

Unnamed: 0,Hospital_x,Cases,medicare_id,hospital_x,Hospital_y,Readmit Rate Num Cases (Readmit Cases),hospital_y
60,140242 NORTHWESTERN_CDH,406,140242,140242 NORTHWESTERN_CDH,140242 NORTHWESTERN_CDH,5,140242 NORTHWESTERN_CDH
62,140281 NORTHWESTERN_MEMORIAL,1194,140281,140281 NORTHWESTERN_MEMORIAL,140281 NORTHWESTERN_MEMORIAL,42,140281 NORTHWESTERN_MEMORIAL


### Fill NA with 0 in the numerator column.  This works because the numerator was left-joined onto the denominator df: a hospital that has a denominator value but no numerator cases gets a null numerator, which we convert to zero.  A hospital without a denominator has no row at all, so it is excluded from the inserts and receives a '-'.

In [32]:
# A missing numerator after the left join means no revisit cases: store 0.
numerator_col = 'Readmit Rate Num Cases (Readmit Cases)'
for revisit_df in (uri_amc_df, uri_comm_df):
    revisit_df[numerator_col] = revisit_df[numerator_col].fillna(value=0)

In [33]:
# A missing numerator after the left join means no revisit cases: store 0.
numerator_col = 'Readmit Rate Num Cases (Readmit Cases)'
for revisit_df in (colon_amc_df, colon_comm_df):
    revisit_df[numerator_col] = revisit_df[numerator_col].fillna(value=0)

In [34]:
# A missing numerator after the left join means no revisit cases: store 0.
numerator_col = 'Readmit Rate Num Cases (Readmit Cases)'
for revisit_df in (arthro_amc_df, arthro_comm_df, chol_amc_df, chol_comm_df):
    revisit_df[numerator_col] = revisit_df[numerator_col].fillna(value=0)

In [35]:
# Tag every frame with its measure name (AMC and community share the label).
frames_by_measure_name = (
    ('OP Procedure Revisits - Urological', (uri_amc_df, uri_comm_df)),
    ('OP Procedure Revisits - Colonoscopy', (colon_amc_df, colon_comm_df)),
    ('OP Procedure Revisits - Arthroscopy', (arthro_amc_df, arthro_comm_df)),
    ('OP Procedure Revisits - Biliary', (chol_amc_df, chol_comm_df)),
)
for measure_label, measure_frames in frames_by_measure_name:
    for revisit_df in measure_frames:
        revisit_df['measure_nm'] = measure_label

### union datasets together

In [36]:
# Frames to stack, grouped by measure with AMC before community.
df_list = [
    uri_amc_df, uri_comm_df,
    colon_amc_df, colon_comm_df,
    arthro_amc_df, arthro_comm_df,
    chol_amc_df, chol_comm_df,
]

In [37]:
# Stack the eight measure/cohort frames into one.  ignore_index=True gives the
# result a fresh 0..n-1 index; without it each input keeps its own 0-based
# labels, so the combined index would contain repeated values.
op_proc_df = pd.concat(df_list, ignore_index=True)

### Append new id columns to the final dataframe

In [38]:
# List the columns of the stacked frame before the id columns are added.
op_proc_df.columns

Index(['Hospital_x', 'Cases', 'medicare_id', 'hospital_x', 'Hospital_y',
       'Readmit Rate Num Cases (Readmit Cases)', 'hospital_y', 'measure_nm'],
      dtype='object')

# ATTENTION: HARD-CODED SECTION. Update `calc_id` and `period_id` for the current run. Do NOT change the event type ids or the measure value id.

In [39]:
# Constant id columns written with every row.  calc_id and period_id are the
# run-specific values; the event type ids and measure_value_id stay fixed.
op_proc_df = op_proc_df.assign(
    numerator_event_type_id=11,
    denominator_event_type_id=21,
    measure_value_id=1,
    calc_id=15,
    period_id=7514,
)

In [40]:
# Spot-check the first rows after the constant id columns were added.
op_proc_df.head()

Unnamed: 0,Hospital_x,Cases,medicare_id,hospital_x,Hospital_y,Readmit Rate Num Cases (Readmit Cases),hospital_y,measure_nm,numerator_event_type_id,denominator_event_type_id,measure_value_id,calc_id,period_id
0,010033 ALABAMA,1983,10033,010033 ALABAMA,010033 ALABAMA,87.0,010033 ALABAMA,OP Procedure Revisits - Urological,11,21,1,15,7514
1,010087 USA,1,10087,010087 USA,010087 USA,0.0,010087 USA,OP Procedure Revisits - Urological,11,21,1,15,7514
2,030002 BANNERHEALTH_PHOENIX,300,30002,030002 BANNERHEALTH_PHOENIX,030002 BANNERHEALTH_PHOENIX,30.0,030002 BANNERHEALTH_PHOENIX,OP Procedure Revisits - Urological,11,21,1,15,7514
3,030006 TUCSON,409,30006,030006 TUCSON,030006 TUCSON,24.0,030006 TUCSON,OP Procedure Revisits - Urological,11,21,1,15,7514
4,030064 BANNERHEALTH_ARIZONA,313,30064,030064 BANNERHEALTH_ARIZONA,030064 BANNERHEALTH_ARIZONA,17.0,030064 BANNERHEALTH_ARIZONA,OP Procedure Revisits - Urological,11,21,1,15,7514


### Some `Cases` values are the string '<5' rather than a number.  Replace those with 5, then convert the column's datatype to float.

In [41]:
# Swap the '<5' placeholder for '5'; every other value is carried over unchanged.
is_lt5_placeholder = op_proc_df['Cases'] == '<5'
op_proc_df['Cases'] = np.where(is_lt5_placeholder, '5', op_proc_df['Cases'])

In [42]:
# Denominator counts become floats (this also parses the '5' strings written above).
cases_as_float = op_proc_df['Cases'].astype('float')
op_proc_df['Cases'] = cases_as_float

In [43]:
# measure_value = numerator cases as a percentage of denominator cases.
# Rows whose denominator is 0 get 0 instead of the result of dividing by zero.
denominator_cases = op_proc_df['Cases']
revisit_pct = (op_proc_df['Readmit Rate Num Cases (Readmit Cases)'] / denominator_cases) * 100
op_proc_df['measure_value'] = np.where(denominator_cases == 0, 0, revisit_pct)

### Join the measure ids

In [44]:
# Attach the measure lookup columns by matching measure_nm to measure_name;
# rows with no match are kept (left join).
op_proc_df = op_proc_df.merge(
    op_revisit_meas,
    how='left',
    left_on='measure_nm',
    right_on='measure_name',
)

### Join the hospital ids

In [45]:
# Spot-check the first rows after the measure lookup join.
op_proc_df.head()

Unnamed: 0,Hospital_x,Cases,medicare_id,hospital_x,Hospital_y,Readmit Rate Num Cases (Readmit Cases),hospital_y,measure_nm,numerator_event_type_id,denominator_event_type_id,measure_value_id,calc_id,period_id,measure_value,measure_id,measure_name
0,010033 ALABAMA,1983.0,10033,010033 ALABAMA,010033 ALABAMA,87.0,010033 ALABAMA,OP Procedure Revisits - Urological,11,21,1,15,7514,4.387292,148,OP Procedure Revisits - Urological
1,010087 USA,1.0,10087,010087 USA,010087 USA,0.0,010087 USA,OP Procedure Revisits - Urological,11,21,1,15,7514,0.0,148,OP Procedure Revisits - Urological
2,030002 BANNERHEALTH_PHOENIX,300.0,30002,030002 BANNERHEALTH_PHOENIX,030002 BANNERHEALTH_PHOENIX,30.0,030002 BANNERHEALTH_PHOENIX,OP Procedure Revisits - Urological,11,21,1,15,7514,10.0,148,OP Procedure Revisits - Urological
3,030006 TUCSON,409.0,30006,030006 TUCSON,030006 TUCSON,24.0,030006 TUCSON,OP Procedure Revisits - Urological,11,21,1,15,7514,5.867971,148,OP Procedure Revisits - Urological
4,030064 BANNERHEALTH_ARIZONA,313.0,30064,030064 BANNERHEALTH_ARIZONA,030064 BANNERHEALTH_ARIZONA,17.0,030064 BANNERHEALTH_ARIZONA,OP Procedure Revisits - Urological,11,21,1,15,7514,5.43131,148,OP Procedure Revisits - Urological


In [46]:
# Attach the hospital lookup columns by matching medicare_id to
# hospital_medicare_id; rows with no match are kept (left join).
op_proc_df = op_proc_df.merge(
    all_hospitals,
    how='left',
    left_on='medicare_id',
    right_on='hospital_medicare_id',
)

In [47]:
# Display the frame after the hospital lookup join.
op_proc_df

Unnamed: 0,Hospital_x,Cases,medicare_id,hospital_x,Hospital_y,Readmit Rate Num Cases (Readmit Cases),hospital_y,measure_nm,numerator_event_type_id,denominator_event_type_id,measure_value_id,calc_id,period_id,measure_value,measure_id,measure_name,hospital_id,hospital_medicare_id
0,010033 ALABAMA,1983.0,010033,010033 ALABAMA,010033 ALABAMA,87.0,010033 ALABAMA,OP Procedure Revisits - Urological,11,21,1,15,7514,4.387292,148,OP Procedure Revisits - Urological,1,010033
1,010087 USA,1.0,010087,010087 USA,010087 USA,0.0,010087 USA,OP Procedure Revisits - Urological,11,21,1,15,7514,0.000000,148,OP Procedure Revisits - Urological,94,010087
2,030002 BANNERHEALTH_PHOENIX,300.0,030002,030002 BANNERHEALTH_PHOENIX,030002 BANNERHEALTH_PHOENIX,30.0,030002 BANNERHEALTH_PHOENIX,OP Procedure Revisits - Urological,11,21,1,15,7514,10.000000,148,OP Procedure Revisits - Urological,89,030002
3,030006 TUCSON,409.0,030006,030006 TUCSON,030006 TUCSON,24.0,030006 TUCSON,OP Procedure Revisits - Urological,11,21,1,15,7514,5.867971,148,OP Procedure Revisits - Urological,150,030006
4,030064 BANNERHEALTH_ARIZONA,313.0,030064,030064 BANNERHEALTH_ARIZONA,030064 BANNERHEALTH_ARIZONA,17.0,030064 BANNERHEALTH_ARIZONA,OP Procedure Revisits - Urological,11,21,1,15,7514,5.431310,148,OP Procedure Revisits - Urological,2,030064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2593,521344 GUNDERSEN_BOSCOBEL,5.0,521344,521344 BOSCOBEL_AREA_HEALTH_CARE,521344 GUNDERSEN_BOSCOBEL,0.0,521344 BOSCOBEL_AREA_HEALTH_CARE,OP Procedure Revisits - Biliary,11,21,1,15,7514,0.000000,146,OP Procedure Revisits - Biliary,743,521344
2594,521350 ASPIRUS_LANGLADE,37.0,521350,521350 ASPIRUS_LANGLADE_HOSPITAL,521350 ASPIRUS_LANGLADE,1.0,521350 ASPIRUS_LANGLADE_HOSPITAL,OP Procedure Revisits - Biliary,11,21,1,15,7514,2.702703,146,OP Procedure Revisits - Biliary,623,521350
2595,530014 CHEYENNE,94.0,530014,530014 CHEYENNE,530014 CHEYENNE,0.0,530014 CHEYENNE,OP Procedure Revisits - Biliary,11,21,1,15,7514,0.000000,146,OP Procedure Revisits - Biliary,530,530014
2596,670077 METHODIST_WEST_HOUSTON,121.0,670077,670077 METHODIST_WEST_HOUSTON,670077 METHODIST_WEST_HOUSTON,4.0,670077 METHODIST_WEST_HOUSTON,OP Procedure Revisits - Biliary,11,21,1,15,7514,3.305785,146,OP Procedure Revisits - Biliary,208,670077


### Reorder columns

In [48]:
# List the columns available before renaming and reordering.
op_proc_df.columns

Index(['Hospital_x', 'Cases', 'medicare_id', 'hospital_x', 'Hospital_y',
       'Readmit Rate Num Cases (Readmit Cases)', 'hospital_y', 'measure_nm',
       'numerator_event_type_id', 'denominator_event_type_id',
       'measure_value_id', 'calc_id', 'period_id', 'measure_value',
       'measure_id', 'measure_name', 'hospital_id', 'hospital_medicare_id'],
      dtype='object')

In [49]:
# Rename the report count columns to the names used by the insert statement.
count_column_names = {
    "Cases": "denominator",
    "Readmit Rate Num Cases (Readmit Cases)": "numerator",
}
op_proc_df = op_proc_df.rename(columns=count_column_names)

In [50]:
# Keep only the columns that are inserted, in the insert statement's column order.
insert_column_order = [
    'calc_id',
    'hospital_id',
    'measure_id',
    'period_id',
    'measure_value_id',
    'measure_value',
    'numerator',
    'numerator_event_type_id',
    'denominator',
    'denominator_event_type_id',
]
op_proc_df = op_proc_df[insert_column_order]

In [51]:
# Keep the first row for each (calc, hospital, measure, period, measure value)
# key.  Rebinding the result instead of using inplace=True avoids mutating a
# frame that was produced by column selection (SettingWithCopyWarning).
op_proc_df = op_proc_df.drop_duplicates(
    subset=['calc_id', 'hospital_id', 'measure_id', 'period_id', 'measure_value_id'],
    keep='first',
)

### Insert data into database

In [52]:
def insert_measure_values_from_reports_df(df):
    """Insert the rows of ``df`` into NM_Analytics_Prototype.vizient_qa.measure_values.

    Rows whose ``measure_value`` equals the string 'Missing' are skipped.  All
    inserts run in a single transaction that is committed after the last row;
    if any insert raises, the transaction is rolled back and the exception
    propagates, so a failed run leaves no partial load behind.  The cursor and
    the connection are closed in every case.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns calc_id, hospital_id, measure_id, period_id,
        measure_value_id, measure_value, numerator, numerator_event_type_id,
        denominator and denominator_event_type_id.

    Notes
    -----
    The statement is a plain INSERT, so calling this twice with the same frame
    inserts the same rows twice.
    """
    insert_sql = (
        "INSERT INTO NM_Analytics_Prototype.vizient_qa.measure_values("
        "[calc_id],[hospital_id],[measure_id],[period_id],[measure_value_id],"
        "[measure_value],[numerator],[numerator_event_type_id],[denominator],"
        "[denominator_event_type_id]) values (?,?,?,?,?,?,?,?,?,?)"
    )
    # Same order as the column list in insert_sql.
    value_columns = ['calc_id', 'hospital_id', 'measure_id', 'period_id', 'measure_value_id',
                     'measure_value', 'numerator', 'numerator_event_type_id', 'denominator',
                     'denominator_event_type_id']

    # connect to the NM_Analytics database
    # ('\\E' spells the same backslash + E as before without the invalid '\E' escape)
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=edw00pd05wva.corp.nm.org\\EDWIDS1;'
                          'Database=NM_Analytics_Prototype;'
                          'Trusted_Connection=yes;')
    try:
        cursor = conn.cursor()
        try:
            # iterate over df rows and insert each one that has a measure value
            for index, row in df.iterrows():
                if row['measure_value'] != 'Missing':
                    cursor.execute(insert_sql, *[row[column] for column in value_columns])
            conn.commit()
        except Exception:
            # undo the rows inserted so far in this call before re-raising
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()
    print('done inserting rows.')

### final line that actually inserts the data into the database.

In [54]:
# Runs the load.  The function issues a plain INSERT per row, so re-running
# this cell inserts the same rows again.
insert_measure_values_from_reports_df(op_proc_df)

done inserting rows.


In [53]:
# (rows, columns) of the frame handed to the insert.  This cell's execution
# count (53) is lower than the insert cell's (54) even though it appears after it.
op_proc_df.shape

(2598, 10)