In [181]:
import pandas as pd
import numpy as np
import os
import openpyxl
import pyodbc

### Jupyter Notebook tool to help update the NM_Analytics_Prototype.vizient_qa.hospitals table for each Vizient Q&A period release.  Sometimes Vizient will add/lose member hospitals, sometimes hospitals change their name, sometimes hospitals change cohort (less likely).  This tool will help you identify which hospitals we need to be UPDATED or INSERTED.

#NOTE ABOUT RE-COHORTING:  IN 2021, 2022, NORTHWESTERN_KISH AND NORTHWESTERN_HUNTLEY WERE OFFICIALLY PUT IN THE VIZIENT COMMUNITY COHORT.
#NORTHWESTERN QUALITY LEADERS WANT TO TRACK CCMC MEASURES.  THEREFORE, WE ARE MANUALLY MOVING
#KISH AND HUNTLEY TO THE CCMC COHORT.  THEREFORE, KISH AND HUNTLEY MAY APPEAR IN THE BELOW OUTPUT.  YOU SHOULD IGNORE IF
#KISH AND HUNTLEY ARE CORRECTLY ASSIGNED PER NM QUALITY.  THIS SITUATION MAY CHANGE IN THE FUTURE BUT, AS OF FY23, IT WAS THE CASE.

In [182]:
def grab_cohort_worksheet_sheet(path_obj,file_obj,sheetname):
    file_loc = os.path.abspath(os.path.join(path_obj,file_obj))
    wb = openpyxl.load_workbook(file_loc, data_only = True)
    ws = wb[sheetname]
    return(ws)

In [183]:
filepath = r'P:\Datastore02\Analytics\230 Inpatient Quality Composite\data\cohort data\2023\calc_3_cohorts'
filename = 'Period3_QA_2023_Cohorts.xlsx'
sheet = '2023 Period3 Q&A Cohorts'

In [184]:
cohort_ws = grab_cohort_worksheet_sheet(filepath, filename,sheet)

  warn(msg)


In [185]:
df = pd.DataFrame(cohort_ws.values)

In [186]:
#rename column headers using the 'Domain' header row coordinates
df = df.rename(columns=df.iloc[0])

In [187]:
df = df.drop([0])

In [188]:
df = df[['Medicare ID','Short name', 'Cohort']]

In [189]:
df.head()

Unnamed: 0,Medicare ID,Short name,Cohort
1,10033,ALABAMA,Comprehensive Academic Medical Center
2,30002,BANNERHEALTH_PHOENIX,Comprehensive Academic Medical Center
3,30064,BANNERHEALTH_ARIZONA,Comprehensive Academic Medical Center
4,40016,ARKANSAS,Comprehensive Academic Medical Center
5,50025,UCSD,Comprehensive Academic Medical Center


### Same thing for Critical Access Hospitals Worksheet  (when this script was written, the critical access cohort hospitals were listed on a separate sheet.  This may change later.

In [190]:
ca_sheet = 'Crit Acc. & Small Comm'
ca_cohort_ws = grab_cohort_worksheet_sheet(filepath, filename,ca_sheet)

  warn(msg)


In [191]:
ca_df = pd.DataFrame(ca_cohort_ws.values)

In [192]:
#rename column headers using the 'Domain' header row coordinates
ca_df = ca_df.rename(columns=ca_df.iloc[1])

In [193]:
ca_df = ca_df.drop([0,1])

In [194]:
ca_df = ca_df[['Medicareid','Name']]

In [195]:
#rename columns to make sure they match the main sheet column headers
ca_df = ca_df.rename(columns={"Medicareid": "Medicare ID", "Name": "Short name"})

In [196]:
ca_df['Cohort'] = 'Critical Access & Small Community'

### Union the dataframes together

In [197]:
df_list = [df,ca_df]
df_concat = pd.concat(df_list)

In [198]:
df_concat = df_concat.drop_duplicates()

In [199]:
df_concat

Unnamed: 0,Medicare ID,Short name,Cohort
1,010033,ALABAMA,Comprehensive Academic Medical Center
2,030002,BANNERHEALTH_PHOENIX,Comprehensive Academic Medical Center
3,030064,BANNERHEALTH_ARIZONA,Comprehensive Academic Medical Center
4,040016,ARKANSAS,Comprehensive Academic Medical Center
5,050025,UCSD,Comprehensive Academic Medical Center
...,...,...,...
162,521344,Boscobel Area Health Care,Critical Access & Small Community
163,521346,ThedaCare Medical Center Shawano,Critical Access & Small Community
164,521349,Allina River Falls Hospital,Critical Access & Small Community
165,521350,Aspirus Langlade Hospital,Critical Access & Small Community


In [200]:
df_concat.groupby(['Cohort'])['Cohort'].count()

Cohort
Community Medical Center                 334
Complex Care Medical Center              188
Comprehensive Academic Medical Center    116
Critical Access & Small Community        165
Large, Specialized Medical Center        152
Name: Cohort, dtype: int64

### Query db.  Get all cohorts in db

In [201]:
#function to query db and get cohort ids and cohort names.
def get_all_cohorts():
    # define the query
    all_cohorts = '''
        SELECT
        *
        FROM
        NM_Analytics_Prototype.vizient_qa.cohort
        '''
    # create the connection to the ms sql db
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                          'Database=clarity;'
                          'Trusted_Connection=yes;')

    # read query results into pandas dataframe
    cohort_df = pd.DataFrame(pd.read_sql(all_cohorts, conn))
    # close the db connection
    conn.close()
    # return the results dataframe of measure ids and names
    return (cohort_df)

### Query db.  Get all hospitals in db.

In [202]:
#function to query db and get hospital ids and hospital names.
def get_all_hospitals():
    # define the query
    all_hospitals = '''
        SELECT
        *
        FROM
        NM_Analytics_Prototype.vizient_qa.hospitals
        '''
    # create the connection to the ms sql db
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                          'Database=clarity;'
                          'Trusted_Connection=yes;')

    # read query results into pandas dataframe
    hospital_df = pd.DataFrame(pd.read_sql(all_hospitals, conn))
    # close the db connection
    conn.close()
    # return the results dataframe of measure ids and names
    return (hospital_df)

In [203]:
cohort_df = get_all_cohorts()

In [204]:
hospital_df = get_all_hospitals()

### Conditionally update the cohort names because they are different than what is stored in the database.  Clean up the hospital names

In [205]:
#validate cohort names.  Vizient keeps changing cohort name formatting slightly so I will force the cohorts to match
#what is already in the database
df_concat['Cohort'].unique()

array(['Comprehensive Academic Medical Center',
       'Large, Specialized Medical Center', 'Complex Care Medical Center',
       'Community Medical Center', 'Critical Access & Small Community'],
      dtype=object)

In [206]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Comprehensive Academic Medical Center', 'Comprehensive Academic Medical Center', df_concat['Cohort'])

In [207]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Large, Specialized Complex Care Medical Center', 'Large Specialized Complex Care Medical Center', df_concat['Cohort'])

In [208]:
#period 4 2021 they messed up the lsccmc name...
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Large, Specialized Medical Center', 'Large Specialized Complex Care Medical Center', df_concat['Cohort'])

In [209]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Complex Care Medical Center', 'Complex Care Medical Center', df_concat['Cohort'])

In [210]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Community Medical Center', 'Community', df_concat['Cohort'])

In [211]:
df_concat['Cohort'] = np.where(df_concat['Cohort']=='Critical Access & Small Community', 'Critical Access & Small Community', df_concat['Cohort'])

In [212]:
df_concat['Cohort'].unique()

array(['Comprehensive Academic Medical Center',
       'Large Specialized Complex Care Medical Center',
       'Complex Care Medical Center', 'Community',
       'Critical Access & Small Community'], dtype=object)

In [213]:
#clean up hospital names
df_concat['Short name'] = df_concat['Short name'].str.upper().str.replace(' ','_').str.replace('-','_').str.replace("'","")
hospital_df['hospital_name'] = hospital_df['hospital_name'].str.upper().str.replace(' ','_').str.replace('-','_').str.replace("'","")

### Left join to the database cohorts

In [214]:
#left join dataframe to datbase lookup table
df_concat_with_cohort_id = pd.merge(df_concat, cohort_df, left_on='Cohort',right_on="cohort_name",how="left")

In [215]:
#check if there are any unjoined rows
df_concat_with_cohort_id[df_concat_with_cohort_id['cohort_name'].isna()]

Unnamed: 0,Medicare ID,Short name,Cohort,cohort_id,cohort_name


In [216]:
df_concat_with_cohort_id.head()

Unnamed: 0,Medicare ID,Short name,Cohort,cohort_id,cohort_name
0,10033,ALABAMA,Comprehensive Academic Medical Center,1,Comprehensive Academic Medical Center
1,30002,BANNERHEALTH_PHOENIX,Comprehensive Academic Medical Center,1,Comprehensive Academic Medical Center
2,30064,BANNERHEALTH_ARIZONA,Comprehensive Academic Medical Center,1,Comprehensive Academic Medical Center
3,40016,ARKANSAS,Comprehensive Academic Medical Center,1,Comprehensive Academic Medical Center
4,50025,UCSD,Comprehensive Academic Medical Center,1,Comprehensive Academic Medical Center


### Left join to the database hospitals

In [217]:
#left join dataframe to database hospital table
df_final = pd.merge(df_concat_with_cohort_id, hospital_df, left_on=['Short name','cohort_id'],right_on=["hospital_name",'hospital_cohort_id'],how="left")

In [218]:
df_final_nas = df_final[df_final['hospital_name'].isna()]

### Write the output to an excel file for further exploration and validation.  Need to check whether failed left join hospitals are because of Name change, Cohort Change or if it is a brand new Vizient member hospital.  Most likely the cause will be because of name change or because the hospital is new.  

In [219]:
#df_final_nas.to_excel("qa_hospitals_check.xlsx",index=False)

### Left join to database hospitals on medicare ID

In [220]:
df_final_med_id = pd.merge(df_concat_with_cohort_id, hospital_df, left_on=['Medicare ID'],right_on=['hospital_medicare_id'],how="left")

In [221]:
df_med_id_final_nas = df_final_med_id[df_final_med_id['hospital_name'].isna()]

In [222]:
df_med_id_final_nas.shape

(0, 9)

In [223]:
df_final_nas.shape

(0, 9)

In [224]:
name_or_cohort_changed_hospitals = [i for i in list(df_final_nas['Medicare ID']) if i not in list(df_med_id_final_nas['Medicare ID'])]

In [225]:
new_hospitals = list(df_med_id_final_nas['Medicare ID'])

In [226]:
len(list(set(new_hospitals)))

0

In [227]:
new_hosp_df_final_nas = df_final_nas[df_final_nas['Medicare ID'].isin(new_hospitals)]

In [228]:
name_or_cohort_change_df_final_nas = df_final_nas[df_final_nas['Medicare ID'].isin(name_or_cohort_changed_hospitals)]

In [229]:
new_hosp_df_final_nas.shape

(0, 9)

In [230]:
new_hosp_df_final_nas.head()

Unnamed: 0,Medicare ID,Short name,Cohort,cohort_id,cohort_name,hospital_id,hospital_name,hospital_medicare_id,hospital_cohort_id


In [231]:
name_or_cohort_change_df_final_nas.shape

(0, 9)

In [232]:
#NOTE ABOUT RE-COHORTING:  IN 2021, 2022, NORTHWESTERN_KISH AND NORTHWESTERN_HUNTLEY WERE OFFICIALLY PUT IN THE VIZIENT COMMUNITY COHRT.
#NORTHWESTERN QUALITY LEADERS WANT TO TRACK CCMC MEASURES.  THEREFORE, WE ARE MANUALLY MOVING
#KISH AND HUNTLEY TO THE CCMC COHORT.  THEREFORE, KISH AND HUNTLEY MAY APPEAR IN THE BELOW OUTPUT.  YOU SHOULD IGNORE IF
#KISH AND HUNTLEY ARE CORRECTLY ASSIGNED PER NM QUALITY.  THIS SITUATION MAY CHANGE IN THE FUTURE BUT, AS OF FY23, IT WAS THE CASE.
name_or_cohort_change_df_final_nas

Unnamed: 0,Medicare ID,Short name,Cohort,cohort_id,cohort_name,hospital_id,hospital_name,hospital_medicare_id,hospital_cohort_id


### export just the hospitals that already have a medicare ID in the database.  This means the hospital already has a hospital_id but something about the hospital name or cohort ID changed.  Export to Excel and use Excel to create update statements to change all these.

In [52]:
name_or_cohort_change_df_final_nas.to_excel("named_or_cohort_changed_hospitals_check.xlsx",index=False)

## BELOW HERE:  Once dataframe has been narrowed down to just new hospitals, perform inserts into database

In [169]:
#isolate just medicare id, hospital name and cohort id
new_hosp_df_final_nas = new_hosp_df_final_nas[['Medicare ID','Short name','cohort_id']]

In [170]:
new_hosp_df_final_nas.head()

Unnamed: 0,Medicare ID,Short name,cohort_id
28,120001,QMC_MANAMANA,1
135,60009,IH_LUTHERAN,2
139,60023,IH_STMARYGRAND,2
176,210003,UM_PRINCE_GEORGES,2
197,270049,IH_STVINCENT,2


### get max hospital_id in the the hospitals table

In [171]:
#function to query db and get measure ids and measure names.
def get_max_hosp_id():
    # define the query
    max_hosp_id = '''
        SELECT
        max(hospital_id)
        FROM
        NM_Analytics_Prototype.vizient_qa.hospitals
        '''
    # create the connection to the ms sql db
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                          'Database=clarity;'
                          'Trusted_Connection=yes;')

    # read query results into pandas dataframe
    max_df = pd.DataFrame(pd.read_sql(max_hosp_id, conn))
    # close the db connection
    conn.close()
    # return the results dataframe of measure ids and names
    return (max_df)

In [172]:
max_id = get_max_hosp_id()

In [173]:
max_id = max_id.iloc[0][0]

In [174]:
np.arange(len(new_hosp_df_final_nas)) + 1 + max_id

array([936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
       949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
       962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
       975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
       988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998])

In [175]:
#create new hospitals ids
new_hosp_df_final_nas['hospital_id'] = np.arange(len(new_hosp_df_final_nas)) + 1 + max_id

In [176]:
new_hosp_df_final_nas.head()

Unnamed: 0,Medicare ID,Short name,cohort_id,hospital_id
28,120001,QMC_MANAMANA,1,936
135,60009,IH_LUTHERAN,2,937
139,60023,IH_STMARYGRAND,2,938
176,210003,UM_PRINCE_GEORGES,2,939
197,270049,IH_STVINCENT,2,940


In [177]:
#rename columns to match the database
new_hosp_df_final_nas = new_hosp_df_final_nas.rename(columns={"Short name": "hospital_name", "Medicare ID": "hospital_medicare_id", "cohort_id":"hospital_cohort_id"})

In [178]:
#reorder the columns to match the database
new_hosp_df_final_nas = new_hosp_df_final_nas[['hospital_id','hospital_name','hospital_medicare_id','hospital_cohort_id']]

In [179]:
new_hosp_df_final_nas

Unnamed: 0,hospital_id,hospital_name,hospital_medicare_id,hospital_cohort_id
28,936,QMC_MANAMANA,120001,1
135,937,IH_LUTHERAN,060009,2
139,938,IH_STMARYGRAND,060023,2
176,939,UM_PRINCE_GEORGES,210003,2
197,940,IH_STVINCENT,270049,2
...,...,...,...,...
930,994,POTOMAC_VALLEY_HOSPITAL,511315,5
931,995,JEFFERSON_MEDICAL_CENTER,511319,5
932,996,JACKSON_GENERAL_HOSPITAL,511320,5
933,997,SAINT_JOSEPHS_HOSPITAL_OF_BUCKHANNON,511321,5


### Insert into database

In [180]:
# create the connection to the ms sql db

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=edw00pd05wva.corp.nm.org\EDWIDS1;'
                      'Database=clarity;'
                      'Trusted_Connection=yes;')

cursor = conn.cursor()

for i, item in enumerate(new_hosp_df_final_nas.iterrows()):
    #print(item[1]['hospital_id'])
    #print("insert into NM_Analytics_Prototype.vizient_qa.hospitals values (%s,%s,%s,%s)" % (item[1]['hospital_id'], item[1]['hospital_name'],item[1]['hospital_medicare_id'],item[1]['hospital_cohort_id']))
    cursor.execute("insert into NM_Analytics_Prototype.vizient_qa.hospitals values (?,?,?,?)",item[1]['hospital_id'], item[1]['hospital_name'],item[1]['hospital_medicare_id'],item[1]['hospital_cohort_id'])
    #cursor.execute(query)
    conn.commit()

# close the db connection
conn.close()

In [382]:
print("insert into NM_Analytics_Prototype.vizient_qa.hospitals values (%s,%s)" % ('test','test2'))

insert into NM_Analytics_Prototype.vizient_qa.hospitals values (test,test2)
