In [1]:
import pandas as pd
import numpy as np
import os
import openpyxl
import pyodbc

### With every Period Calculator release, the number of hospitals will change as Vizient adds/loses hospitals.  Also, some hospitals may change cohorts over time as they grow or shrink.  In order to track these changes, we have created a table called calc_hospital_cohort to store the hospital cohort assignments at each Period calculator release time.  This needs to be updated at the very least once a fiscal year.

### Before we can run this script, make sure you have already updated the vizient_qa.hospitals table to add all new hospitals and given them an ID.  Also, make sure you have already updated the cohort table to make sure you have all cohorts/cohort_ids you need.  

In [2]:
def grab_cohort_worksheet_sheet(path_obj,file_obj,sheetname):
    file_loc = os.path.abspath(os.path.join(path_obj,file_obj))
    wb = openpyxl.load_workbook(file_loc, data_only = True)
    ws = wb[sheetname]
    return(ws)

In [3]:
filepath = r'P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\cohort data\2023\calc_3_cohorts'
filename = 'Period3_QA_2023_Cohorts.xlsx'
sheet = '2023 Period3 Q&A Cohorts'
#provide calculator label.  This can be found in vizient_qa.calculator
calculator_period_label = '2023 Q&A calculator Period 3'

In [4]:
cohort_ws = grab_cohort_worksheet_sheet(filepath, filename,sheet)

  warn(msg)


In [5]:
df = pd.DataFrame(cohort_ws.values)

In [6]:
#rename column headers using the 'Domain' header row coordinates
df = df.rename(columns=df.iloc[0])

In [7]:
df = df.drop([0])

In [8]:
df = df[['Medicare ID','Short name', 'Cohort']]

### Same thing for Critical Access Hospitals Worksheet  (when this script was written, the critical access cohort hospitals were listed on a separate sheet.  This may change later.

In [10]:
ca_sheet = 'Crit Acc. & Small Comm'
ca_cohort_ws = grab_cohort_worksheet_sheet(filepath, filename,ca_sheet)

  warn(msg)


In [11]:
ca_df = pd.DataFrame(ca_cohort_ws.values)

In [12]:
#rename column headers using the 'Domain' header row coordinates
ca_df = ca_df.rename(columns=ca_df.iloc[1])

In [13]:
ca_df = ca_df.drop([0,1])

In [14]:
ca_df = ca_df[['Medicareid','Name']]

In [15]:
#rename columns to make sure they match the main sheet column headers
ca_df = ca_df.rename(columns={"Medicareid": "Medicare ID", "Name": "Short name"})

In [16]:
ca_df['Cohort'] = 'Critical Access & Small Community'

### Union the dataframes together

In [17]:
df_list = [df,ca_df]
df_concat = pd.concat(df_list)

In [18]:
df_concat = df_concat.drop_duplicates()

In [19]:
df_concat

Unnamed: 0,Medicare ID,Short name,Cohort
1,010033,ALABAMA,Comprehensive Academic Medical Center
2,030002,BANNERHEALTH_PHOENIX,Comprehensive Academic Medical Center
3,030064,BANNERHEALTH_ARIZONA,Comprehensive Academic Medical Center
4,040016,ARKANSAS,Comprehensive Academic Medical Center
5,050025,UCSD,Comprehensive Academic Medical Center
...,...,...,...
162,521344,Boscobel Area Health Care,Critical Access & Small Community
163,521346,ThedaCare Medical Center Shawano,Critical Access & Small Community
164,521349,Allina River Falls Hospital,Critical Access & Small Community
165,521350,Aspirus Langlade Hospital,Critical Access & Small Community


### Query db.  Get all cohorts in db

In [20]:
#function to query db and get cohort ids and cohort names.
def get_all_cohorts():
    # define the query
    all_cohorts = '''
        SELECT
        *
        FROM
        NM_Analytics_Prototype.vizient_qa.cohort
        '''
    # create the connection to the ms sql db
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                          'Database=clarity;'
                          'Trusted_Connection=yes;')

    # read query results into pandas dataframe
    cohort_df = pd.DataFrame(pd.read_sql(all_cohorts, conn))
    # close the db connection
    conn.close()
    # return the results dataframe of measure ids and names
    return (cohort_df)

### Query db.  Get all hospitals in db.

In [21]:
#function to query db and get hospital ids and hospital names.
def get_all_hospitals():
    # define the query
    all_hospitals = '''
        SELECT
        *
        FROM
        NM_Analytics_Prototype.vizient_qa.hospitals
        '''
    # create the connection to the ms sql db
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                          'Database=clarity;'
                          'Trusted_Connection=yes;')

    # read query results into pandas dataframe
    hospital_df = pd.DataFrame(pd.read_sql(all_hospitals, conn))
    # close the db connection
    conn.close()
    # return the results dataframe of measure ids and names
    return (hospital_df)

### Query db.  Get current Calculator ID

In [22]:
def get_calc_id():
    # define the query
    calc_id = '''
        SELECT
        calc_id
        FROM
        NM_Analytics_Prototype.vizient_qa.calculator
        WHERE
        calc_nm in ('%s')
        ''' % calculator_period_label
    # create the connection to the ms sql db
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                          'Database=clarity;'
                          'Trusted_Connection=yes;')

    # read query results into pandas dataframe
    calc_df = pd.DataFrame(pd.read_sql(calc_id, conn))
    # close the db connection
    conn.close()
    # return the results dataframe of measure ids and names
    return (calc_df.iloc[0][0])

In [23]:
calculator_period_label

'2023 Q&A calculator Period 3'

In [24]:
get_calc_id()

19

In [25]:
cohort_df = get_all_cohorts()

In [26]:
hospital_df = get_all_hospitals()

In [27]:
calc_id = get_calc_id()

### Clean up the dataset

### Conditionally update the cohort names because they are different than what is stored in the database.  Clean up the hospital names

In [28]:
#validate cohort names.  Vizient keeps changing cohort name formatting slightly so I will force the cohorts to match
#what is already in the database
df_concat['Cohort'].unique()

array(['Comprehensive Academic Medical Center',
       'Large, Specialized Medical Center', 'Complex Care Medical Center',
       'Community Medical Center', 'Critical Access & Small Community'],
      dtype=object)

In [29]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Comprehensive Academic Medical Center', 'Comprehensive Academic Medical Center', df_concat['Cohort'])

In [30]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Large, Specialized Complex Care Medical Center', 'Large Specialized Complex Care Medical Center', df_concat['Cohort'])

In [31]:
#period 4 2021 they messed up the lsccmc name...
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Large, Specialized Medical Center', 'Large Specialized Complex Care Medical Center', df_concat['Cohort'])

In [32]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Complex Care Medical Center', 'Complex Care Medical Center', df_concat['Cohort'])

In [33]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Community Medical Center', 'Community', df_concat['Cohort'])

In [34]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Critical Access & Small Community', 'Critical Access & Small Community', df_concat['Cohort'])

In [35]:
df_concat['Cohort'].unique()

array(['Comprehensive Academic Medical Center',
       'Large Specialized Complex Care Medical Center',
       'Complex Care Medical Center', 'Community',
       'Critical Access & Small Community'], dtype=object)

In [36]:
#clean up hospital names
df_concat['Short name'] = df_concat['Short name'].str.upper().str.replace(' ','_').str.replace('-','_').str.replace("'","")
hospital_df['hospital_name'] = hospital_df['hospital_name'].str.upper().str.replace(' ','_').str.replace('-','_').str.replace("'","")

### Left join to the database cohorts

In [37]:
#left join dataframe to datbase lookup table
df_concat_with_cohort_id = pd.merge(df_concat, cohort_df, left_on='Cohort',right_on="cohort_name",how="left")

In [38]:
#check if there are any unjoined rows
df_concat_with_cohort_id[df_concat_with_cohort_id['cohort_name'].isna()]

Unnamed: 0,Medicare ID,Short name,Cohort,cohort_id,cohort_name


### TUHS_JEANES has a new medicare ID but I cannot find it on cms.gov.  I will manually change TUHS medicare ID back to 390080 until I confirm 399927 is correct.

### Update.  399927 is a fake medicare ID created by Vizient because TUHS JEANES has been integrated into a larger hospital system.  They wanted two separate data feeds so Vizient created an internal medicare number for them.

In [86]:
#no longer needed.
#df_concat_with_cohort_id[df_concat_with_cohort_id['Medicare ID'] == '399927'].index[0]

In [87]:
#manually override TUHS_JEANES Medicare ID
#no longer needed
#df_concat_with_cohort_id.at[df_concat_with_cohort_id[df_concat_with_cohort_id['Medicare ID'] == '399927'].index[0],'Medicare ID'] = '390080'

### Left join to the database hospitals

In [39]:
#left join dataframe to database hospital table
df_final = pd.merge(df_concat_with_cohort_id, hospital_df, left_on=['Medicare ID'],right_on=['hospital_medicare_id'],how="left")

In [40]:
df_final[df_final['hospital_name'].isna()]

Unnamed: 0,Medicare ID,Short name,Cohort,cohort_id,cohort_name,hospital_id,hospital_name,hospital_medicare_id,hospital_cohort_id


### Convert to same format as calc_hospital_cohort table

In [41]:
#fill in calc_id column using the calculator period label variable and query results from above
df_final['calc_id'] = calc_id

In [42]:
df_final = df_final[['calc_id','hospital_id','cohort_id']]

In [43]:
#add row for TOP Decile hospital placeholder
df_final.loc[-1] = [calc_id,0,0]  #adding a row
df_final.index = df_final.index + 1  # shifting index
df_final = df_final.sort_index()  # sorting by index

In [44]:
df_final['calc_id'] = df_final['calc_id'].astype(int)
df_final['hospital_id'] = df_final['hospital_id'].astype(int)
df_final['cohort_id'] = df_final['cohort_id'].astype(int)

In [45]:
#check output
df_final.head()

Unnamed: 0,calc_id,hospital_id,cohort_id
0,19,0,0
1,19,1,1
2,19,89,1
3,19,2,1
4,19,3,1


### Re-Cohort Kish and Huntley.  In 2021, 2022, Vizient has Kish and Huntley placed in the Community Cohort.  NM Quality Leadership would like to 're-cohort' those hospitals into CCMC cohort in order to track more measures.  Adding this line to re-assign the cohort back to CCMC.  This line may not always be necessary.  Depends on the current needs of these hospitals.

In [50]:
#df_final[df_final['hospital_id']== 222]
df_final[df_final['hospital_id']== 471]

Unnamed: 0,calc_id,hospital_id,cohort_id
534,19,471,3


In [49]:
df_final.loc[df_final["hospital_id"] == 222, "cohort_id"] = 3
df_final.loc[df_final["hospital_id"] == 471, "cohort_id"] = 3

In [51]:
#check number rows match the cohort file
df_final.shape

(956, 3)

### BELOW HERE:  INSERT data into database table calc_hospital_cohort

In [52]:
# create the connection to the ms sql db

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                      'Database=clarity;'
                      'Trusted_Connection=yes;')

cursor = conn.cursor()
for i, item in enumerate(df_final.iterrows()):
    print(item[1]['calc_id'],item[1]['hospital_id'],item[1]['cohort_id'])
    cursor.execute("insert into NM_Analytics_Prototype.vizient_qa.calc_hospital_cohort values (?,?,?)",str(item[1]['calc_id']),str(item[1]['hospital_id']),str(item[1]['cohort_id']))
    #cursor.execute(query)
    conn.commit()

# close the db connection
conn.close()

19 0 0
19 1 1
19 89 1
19 2 1
19 3 1
19 4 1
19 808 1
19 778 1
19 824 1
19 5 1
19 86 1
19 6 1
19 7 1
19 8 1
19 9 1
19 10 1
19 11 1
19 12 1
19 13 1
19 82 1
19 14 1
19 79 1
19 90 1
19 779 1
19 825 1
19 15 1
19 16 1
19 17 1
19 18 1
19 936 1
19 19 1
19 20 1
19 21 1
19 22 1
19 23 1
19 24 1
19 25 1
19 26 1
19 476 1
19 27 1
19 92 1
19 28 1
19 29 1
19 30 1
19 31 1
19 77 1
19 32 1
19 33 1
19 34 1
19 80 1
19 362 1
19 35 1
19 36 1
19 83 1
19 37 1
19 826 1
19 38 1
19 39 1
19 40 1
19 363 1
19 81 1
19 41 1
19 84 1
19 364 1
19 42 1
19 361 1
19 157 1
19 43 1
19 780 1
19 44 1
19 45 1
19 78 1
19 47 1
19 48 1
19 49 1
19 50 1
19 781 1
19 51 1
19 695 1
19 52 1
19 53 1
19 87 1
19 54 1
19 55 1
19 56 1
19 57 1
19 782 1
19 58 1
19 827 1
19 59 1
19 828 1
19 60 1
19 61 1
19 829 1
19 783 1
19 62 1
19 63 1
19 64 1
19 65 1
19 85 1
19 66 1
19 696 1
19 365 1
19 67 1
19 68 1
19 69 1
19 93 1
19 70 1
19 71 1
19 91 1
19 72 1
19 73 1
19 477 1
19 74 1
19 160 1
19 75 1
19 76 1
19 94 2
19 150 2
19 95 2
19 360 2
19 809 2
19 746

19 447 5
19 448 5
19 992 5
19 993 5
19 994 5
19 995 5
19 996 5
19 997 5
19 998 5
19 616 5
19 449 5
19 740 5
19 617 5
19 741 5
19 618 5
19 619 5
19 742 5
19 620 5
19 470 5
19 450 5
19 621 5
19 693 5
19 451 5
19 622 5
19 743 5
19 452 5
19 913 5
19 623 5
19 453 5
