### **Python Data Engineering Project**
##### *-- Anh Vi Pham --*

### **1. Setup database: HospitalOperation**

#### Connect server

In [1]:
import pyodbc
import json
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)



def connect_database(database, server='localhost,1433', driver='{ODBC Driver 18 for SQL Server}', username='sa', password='rainscales@2024'):
    # establish connecttion
    conn = pyodbc.connect(f'DRIVER={driver};SERVER={server};DATABASE={database};UID={username};PWD={password};TrustServerCertificate=yes')
    print(f"Connection to '{database}' established")
    return conn 

conn_master = connect_database(database='master')
cursor_master = conn_master.cursor()
    

Connection to 'master' established


#### Create new database HospitalOperation

In [2]:
def drop_database_if_exist(database, conn, cursor):
    # drop database
    conn.autocommit = True
        # Drop the database if it exists
    cursor.execute(f"IF EXISTS (SELECT * FROM sys.databases WHERE name = '{database}') DROP DATABASE {database}")
    print(F"Database '{database}' dropped successfully if it existed.")
    
drop_database_if_exist(database='HospitalOperation', conn=conn_master, cursor=cursor_master)

Database 'HospitalOperation' dropped successfully if it existed.


In [3]:
def create_new_database(database, conn, cursor):
    conn.autocommit = True
    cursor.execute(f"CREATE DATABASE {database}")
    print(f"Database '{database}' created successfully.")

    conn.autocommit = False

create_new_database(database='HospitalOperation', conn=conn_master, cursor=cursor_master)

Database 'HospitalOperation' created successfully.


In [4]:
# close connection to "master" database
conn_master.close()

#### Connect to HospitalOperation

In [5]:
conn_hospital = connect_database(database='HospitalOperation')
cursor_hospital = conn_hospital.cursor()

Connection to 'HospitalOperation' established


#### Load database configuration file from table_creation_code.txt

In [6]:
def load_sql_from_txt(file_name: str, mode='r'):
    try:
        commands = []
        with open(file_name, mode) as file:
            imported_commands = file.read().split(';')
            for com in imported_commands:
                commands.append(com.replace('\n', ' ').strip())
        return commands
    except Exception as e:
        return print(f'Failed to read commands from {file_name}  - \n Error: {e} \n Connection closed')

def execute_list_of_sql_commands(command_list, conn, cursor):
    try:
        for command in command_list:
            if command: 
                cursor.execute(command)

        conn.commit()
        return print(f'Commands excuted successfully')

    except Exception as e:
        conn.close()
        return print(f'Failed to execute commands - \n Error: {e} \n Connection closed')


In [7]:
db_file = 'table_creation_code.txt'
table_creation_codes = load_sql_from_txt(db_file)

execute_list_of_sql_commands(command_list=table_creation_codes, conn=conn_hospital, cursor=cursor_hospital)

Commands excuted successfully


#### Load data from sample_dataset.txt

In [8]:
data_file = 'sample_dataset.txt'
data_codes = load_sql_from_txt(data_file)

execute_list_of_sql_commands(command_list=data_codes, cursor=cursor_hospital, conn=conn_hospital)

Commands excuted successfully


#### Change data type: text to varchar(150)

In [9]:
def find_columns_by_data_type(data_type: str, conn):
    sql_code = f"""
    SELECT TABLE_NAME, COLUMN_NAME 
    FROM INFORMATION_SCHEMA.COLUMNS 
    WHERE DATA_TYPE = '{data_type}'
    """
    columns = pd.read_sql(sql_code, conn)
    return columns

data_type = 'text'
text_columns = find_columns_by_data_type(data_type, conn_hospital)
print(f"Columns with {data_type} data type:")
print(text_columns)

Columns with text data type:
      TABLE_NAME  COLUMN_NAME
0      Physician         Name
1      Physician     Position
2     Department         Name
3   NewProcedure         Name
4        Patient         Name
..           ...          ...
10    Medication         Name
11    Medication        Brand
12    Medication  Description
13    Prescribes         Dose
14          Room         Type

[15 rows x 2 columns]


In [10]:
def alter_text_to_varchar(conn, table_name, column_name, varchar_length=150):
    alter_sql = f"ALTER TABLE [{table_name}] ALTER COLUMN [{column_name}] VARCHAR({varchar_length})"
    cursor = conn.cursor()
    cursor.execute(alter_sql)
    conn.commit()
    print(f"Column '{column_name}' in table '{table_name}' changed to VARCHAR({varchar_length})")

# Execute the change for each identified column
for index, row in text_columns.iterrows():
    alter_text_to_varchar(conn_hospital, row['TABLE_NAME'], row['COLUMN_NAME'])


Column 'Name' in table 'Physician' changed to VARCHAR(150)
Column 'Position' in table 'Physician' changed to VARCHAR(150)
Column 'Name' in table 'Department' changed to VARCHAR(150)
Column 'Name' in table 'NewProcedure' changed to VARCHAR(150)
Column 'Name' in table 'Patient' changed to VARCHAR(150)
Column 'Address' in table 'Patient' changed to VARCHAR(150)
Column 'Phone' in table 'Patient' changed to VARCHAR(150)
Column 'Name' in table 'Nurse' changed to VARCHAR(150)
Column 'Position' in table 'Nurse' changed to VARCHAR(150)
Column 'ExaminationRoom' in table 'Appointment' changed to VARCHAR(150)
Column 'Name' in table 'Medication' changed to VARCHAR(150)
Column 'Brand' in table 'Medication' changed to VARCHAR(150)
Column 'Description' in table 'Medication' changed to VARCHAR(150)
Column 'Dose' in table 'Prescribes' changed to VARCHAR(150)
Column 'Type' in table 'Room' changed to VARCHAR(150)


### **2. Transform data: PatientInteraction**

#### Query data

In [11]:
def query(sql_code, engine = conn_hospital):
    df_query = pd.read_sql(sql_code, engine)
    return df_query

In [12]:
df = query(""" 
        SELECT
        -- Patient table
        Patient.SSN AS PatientSSN,
        Patient.Name AS PatientName,
        Patient.Address AS PatientAddress,
        Patient.Phone AS PatientPhone,
        Patient.InsuranceID AS PatientInsuranceID,
        Patient.PCP AS PatientPCP,

        -- Appointment table
        Appointment.AppointmentID,
        Appointment.Patient AS AppointmentPatient,
        Appointment.Start AS AppointmentStart,
        Appointment.[End] AS AppointmentEnd,
        Appointment.ExaminationRoom AS AppointmentExaminationRoom,
        Appointment_Physician.EmployeeID AS AppointmentPhysicianID,
        Appointment_Physician.Name AS AppointmentPhysicianName,
        Appointment_Physician.Position AS AppointmentPhysicianPosition,
        Appointment_Nurse.EmployeeID AS AppointmentNurseID,
        Appointment_Nurse.Name AS AppointmentNurseName,
        Appointment_Nurse.Position AS AppointmentNursePosition,

        -- Prescribes table
        Prescribes.Physician AS PrescribingPhysician,
        Prescribes.Patient AS PrescriptionPatient,
        Prescribes.Medication AS PrescriptionMedication,
        Prescribes.Date AS PrescriptionDate,
        Prescribes.Appointment AS PrescriptionAppointment,
        Prescribes.Dose AS PrescriptionDose,
        Prescribes_Physician.EmployeeID AS PrescribingPhysicianID,
        Prescribes_Physician.Name AS PrescribingPhysicianName,
        Prescribes_Physician.Position AS PrescribingPhysicianPosition,

        -- Medication table
        Medication.Code AS MedicationCode,
        Medication.Name AS MedicationName,
        Medication.Brand AS MedicationBrand,
        Medication.Description AS MedicationDescription,

        -- Undergoes table
        Undergoes.Patient AS UndergoesPatient,
        Undergoes.NewProcedure AS UndergoesNewProcedure,
        Undergoes.Stay AS UndergoesStayID,
        Undergoes.Date AS UndergoesDate,
        Undergoes.Physician AS UndergoesPhysicianID,
        Undergoes.AssistingNurse AS UndergoesNurseID,

        -- NewProcedure table
        NewProcedure.Code AS NewProcedureCode,
        NewProcedure.Name AS NewProcedureName,
        NewProcedure.Cost AS NewProcedureCost,

        -- Stay table
        Stay.StayID AS StayID,
        Stay.Patient AS StayPatient,
        Stay.Room AS StayRoom,
        Stay.Start AS StayStart,
        Stay.[End] AS StayEnd,

        -- Room table
        Room.Number AS RoomNumber,
        Room.Type AS RoomType,
        Room.BlockFloor AS RoomBlockFloor,
        Room.BlockCode AS RoomBlockCode,
        Room.Unavailable AS RoomUnavailable,

        -- Block table
        Block.Floor AS BlockFloor,
        Block.Code AS BlockCode,

        -- On_Call table
        On_Call.Nurse AS OnCallNurse,
        On_Call.BlockFloor AS OnCallBlockFloor,
        On_Call.BlockCode AS OnCallBlockCode,
        On_Call.Start AS OnCallStart,
        On_Call.[End] AS OnCallEnd,
        OnCall_Nurse.EmployeeID AS OnCallNurseID,
        OnCall_Nurse.Name AS OnCallNurseName,
        OnCall_Nurse.Position AS OnCallNursePosition,
        OnCall_Nurse.Registered AS OnCallNurseRegistered
        FROM dbo.Patient
        -- Join with Appointment and related tables
        LEFT JOIN dbo.Appointment
        ON Patient.SSN = Appointment.Patient
        LEFT JOIN dbo.Physician AS Appointment_Physician
        ON Appointment.Physician = Appointment_Physician.EmployeeID
        LEFT JOIN dbo.Nurse AS Appointment_Nurse
        ON Appointment.PrepNurse = Appointment_Nurse.EmployeeID

        -- Join with Prescribes and related tables
        LEFT JOIN dbo.Prescribes
        ON Appointment.AppointmentID = Prescribes.Appointment
        LEFT JOIN dbo.Physician AS Prescribes_Physician
        ON Prescribes.Physician = Prescribes_Physician.EmployeeID
        LEFT JOIN dbo.Medication
        ON Prescribes.Medication = Medication.Code

        -- Join with Undergoes, NewProcedure, and related tables
        LEFT JOIN dbo.Undergoes
        ON Patient.SSN = Undergoes.Patient
        LEFT JOIN dbo.NewProcedure
        ON Undergoes.NewProcedure = NewProcedure.Code
        LEFT JOIN dbo.Stay
        ON Undergoes.Stay = Stay.StayID
        LEFT JOIN dbo.Room
        ON Stay.Room = Room.Number

        -- Join with Block and On_Call tables
        LEFT JOIN dbo.Block
        ON Room.BlockCode = Block.Code
        LEFT JOIN dbo.On_Call
        ON Block.Code = On_Call.BlockCode AND Block.Floor = On_Call.BlockFloor
        LEFT JOIN dbo.Nurse AS OnCall_Nurse
        ON On_Call.Nurse = OnCall_Nurse.EmployeeID;

""")
df.head(3)

Unnamed: 0,PatientSSN,PatientName,PatientAddress,PatientPhone,PatientInsuranceID,PatientPCP,AppointmentID,AppointmentPatient,AppointmentStart,AppointmentEnd,AppointmentExaminationRoom,AppointmentPhysicianID,AppointmentPhysicianName,AppointmentPhysicianPosition,AppointmentNurseID,AppointmentNurseName,AppointmentNursePosition,PrescribingPhysician,PrescriptionPatient,PrescriptionMedication,PrescriptionDate,PrescriptionAppointment,PrescriptionDose,PrescribingPhysicianID,PrescribingPhysicianName,PrescribingPhysicianPosition,MedicationCode,MedicationName,MedicationBrand,MedicationDescription,UndergoesPatient,UndergoesNewProcedure,UndergoesStayID,UndergoesDate,UndergoesPhysicianID,UndergoesNurseID,NewProcedureCode,NewProcedureName,NewProcedureCost,StayID,StayPatient,StayRoom,StayStart,StayEnd,RoomNumber,RoomType,RoomBlockFloor,RoomBlockCode,RoomUnavailable,BlockFloor,BlockCode,OnCallNurse,OnCallBlockFloor,OnCallBlockCode,OnCallStart,OnCallEnd,OnCallNurseID,OnCallNurseName,OnCallNursePosition,OnCallNurseRegistered
0,100000001,John Smith,42 Foobar Lane,555-0256,68476213,1,13216584,100000001,2008-04-24 10:00:00,2008-04-24 11:00:00,A,1,John Dorian,Staff Internist,101.0,Carla Espinosa,Head Nurse,1.0,100000001.0,1.0,2008-04-24 10:47:00,13216584.0,5,1.0,John Dorian,Staff Internist,1.0,Procrastin-X,X,,100000001.0,2.0,3215.0,2008-05-03,7.0,101.0,2.0,Obtuse Pyloric Recombobulation,3750.0,3215.0,100000001.0,111.0,2008-05-01,2008-05-04,111.0,Single,1.0,2.0,False,1.0,2.0,101.0,1.0,2.0,2008-11-04 11:00:00,2008-11-04 19:00:00,101.0,Carla Espinosa,Head Nurse,True
1,100000001,John Smith,42 Foobar Lane,555-0256,68476213,1,13216584,100000001,2008-04-24 10:00:00,2008-04-24 11:00:00,A,1,John Dorian,Staff Internist,101.0,Carla Espinosa,Head Nurse,1.0,100000001.0,1.0,2008-04-24 10:47:00,13216584.0,5,1.0,John Dorian,Staff Internist,1.0,Procrastin-X,X,,100000001.0,2.0,3215.0,2008-05-03,7.0,101.0,2.0,Obtuse Pyloric Recombobulation,3750.0,3215.0,100000001.0,111.0,2008-05-01,2008-05-04,111.0,Single,1.0,2.0,False,1.0,2.0,103.0,1.0,2.0,2008-11-04 19:00:00,2008-11-05 03:00:00,103.0,Paul Flowers,Nurse,False
2,100000001,John Smith,42 Foobar Lane,555-0256,68476213,1,13216584,100000001,2008-04-24 10:00:00,2008-04-24 11:00:00,A,1,John Dorian,Staff Internist,101.0,Carla Espinosa,Head Nurse,1.0,100000001.0,1.0,2008-04-24 10:47:00,13216584.0,5,1.0,John Dorian,Staff Internist,1.0,Procrastin-X,X,,100000001.0,2.0,3215.0,2008-05-03,7.0,101.0,2.0,Obtuse Pyloric Recombobulation,3750.0,3215.0,100000001.0,111.0,2008-05-01,2008-05-04,111.0,Single,1.0,2.0,False,2.0,2.0,,,,NaT,NaT,,,,


#### Explore data

In [13]:
df.shape

(93, 60)

In [14]:
pd.set_option('display.max_rows', 60)
df.dtypes

PatientSSN                               int64
PatientName                             object
PatientAddress                          object
PatientPhone                            object
PatientInsuranceID                       int64
PatientPCP                               int64
AppointmentID                            int64
AppointmentPatient                       int64
AppointmentStart                datetime64[ns]
AppointmentEnd                  datetime64[ns]
AppointmentExaminationRoom              object
AppointmentPhysicianID                   int64
AppointmentPhysicianName                object
AppointmentPhysicianPosition            object
AppointmentNurseID                     float64
AppointmentNurseName                    object
AppointmentNursePosition                object
PrescribingPhysician                   float64
PrescriptionPatient                    float64
PrescriptionMedication                 float64
PrescriptionDate                datetime64[ns]
PrescriptionA

In [15]:
df.duplicated().sum()

0

In [16]:
df.isnull().sum()

PatientSSN                       0
PatientName                      0
PatientAddress                   0
PatientPhone                     0
PatientInsuranceID               0
PatientPCP                       0
AppointmentID                    0
AppointmentPatient               0
AppointmentStart                 0
AppointmentEnd                   0
AppointmentExaminationRoom       0
AppointmentPhysicianID           0
AppointmentPhysicianName         0
AppointmentPhysicianPosition     0
AppointmentNurseID              30
AppointmentNurseName            30
AppointmentNursePosition        30
PrescribingPhysician            63
PrescriptionPatient             63
PrescriptionMedication          63
PrescriptionDate                63
PrescriptionAppointment         63
PrescriptionDose                63
PrescribingPhysicianID          63
PrescribingPhysicianName        63
PrescribingPhysicianPosition    63
MedicationCode                  63
MedicationName                  63
MedicationBrand     

#### Listing problems
- Missing values 

- Duplicated data

- Data type for columns needed to be changed:
    - Some ID columns are represented as "float" instead of "int" since they contains null 
    -> handle missing values and transform into int for consistency
- Create primary key

#### Transform data

##### Missing values

In [18]:
df.isnull().sum()

PatientSSN                       0
PatientName                      0
PatientAddress                   0
PatientPhone                     0
PatientInsuranceID               0
PatientPCP                       0
AppointmentID                    0
AppointmentPatient               0
AppointmentStart                 0
AppointmentEnd                   0
AppointmentExaminationRoom       0
AppointmentPhysicianID           0
AppointmentPhysicianName         0
AppointmentPhysicianPosition     0
AppointmentNurseID              30
AppointmentNurseName            30
AppointmentNursePosition        30
PrescribingPhysician            63
PrescriptionPatient             63
PrescriptionMedication          63
PrescriptionDate                63
PrescriptionAppointment         63
PrescriptionDose                63
PrescribingPhysicianID          63
PrescribingPhysicianName        63
PrescribingPhysicianPosition    63
MedicationCode                  63
MedicationName                  63
MedicationBrand     

In [19]:
# Function to fill numeric columns first
def fill_numeric_columns_with_default(data, numeric_columns, default_value=-1):
    """
    Fills numeric columns with a default value for missing data.
    
    Args:
        data (pd.DataFrame): The DataFrame to process.
        numeric_columns (list): List of numeric columns to fill.
        default_value (int/float): The value to fill missing data.
    
    Returns:
        pd.DataFrame: Updated DataFrame with numeric columns filled.
    """
    for column in numeric_columns:
        if column in data.columns:
            data[column] = data[column].fillna(default_value)
    return data

# Identify numeric columns in your DataFrame
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Fill missing values in numeric columns with -1
df = fill_numeric_columns_with_default(data=df, numeric_columns=numeric_columns, default_value=-1)

In [20]:
def fill_null_without_condition_column(data, replace_columns: list, replace_value):
    data.loc[data[replace_columns].isna().all(axis=1), replace_columns] = replace_value
    return data 


def fill_null_with_condition_column(data, condition_column, replace_columns: list, replace_value, note_for_abnormal: str, drop=True):
    # dataframe to store abnormal data
    data_abnormal = pd.DataFrame()
    for column in replace_columns:
        # fill if missing in condition_column and the condition_column
        data.loc[(data[condition_column].notnull()) & (data[column].isnull()), column] = replace_value
        # extract abnormal data
        abnormal_records = data[(data[condition_column].isnull()) & (data[column].notnull())]

        # add note for abnormal records
        if not abnormal_records.empty:
            abnormal_records['Note'] = note_for_abnormal
        # append abnormal records
        data_abnormal = pd.concat([data_abnormal, abnormal_records])

    # keep or drop abnormal
    if drop:    
        data = data.drop(data_abnormal.index)

    return data, data_abnormal

# store abnormal data
df_abnormal = pd.DataFrame()


# -------------------------------------------------------------- Appointment -> Prescribe -> Medication
# ---------- Appointments
# app_id_column = ['AppointmentID']
# df = fill_null_without_condition_column(data=df, replace_columns=app_id_column, replace_value='No Appointment made')

nurse_columns_app = ['AppointmentNurseName','AppointmentNursePosition']
df = fill_null_without_condition_column(data=df, replace_columns=nurse_columns_app, replace_value='No Appointment made')

# ---------- Appointment -> Prescribe
df, df_abnormal_temp = fill_null_with_condition_column(
    data=df, 
    condition_column='AppointmentID',
    replace_columns=['PrescriptionDose',
       'PrescribingPhysicianName', 'PrescribingPhysicianPosition'],
    replace_value='No prescription issued',
    note_for_abnormal='Prescription issued without Appointment',
)
df_abnormal = pd.concat([df_abnormal, df_abnormal_temp])

# ---------- Prescribe -> Medication
df, df_abnormal_temp = fill_null_with_condition_column(
    data=df, 
    condition_column='PrescribingPhysician',
    replace_columns=['MedicationName', 'MedicationBrand','MedicationDescription'],
    replace_value='No Medication was prescribed',
    note_for_abnormal='Medication involved without Prescription',
)
df_abnormal = pd.concat([df_abnormal, df_abnormal_temp])


# --------------------------------------------------------------   Undergoes -> Stay -> Room -> Block
# ---------- Undergoes
# undegoes_columns = ['UndergoesPatient', 'UndergoesNewProcedure',
#        'UndergoesStayID', 'UndergoesDate', 'UndergoesPhysicianID']
# df = fill_null_without_condition_column(data=df, replace_columns=undegoes_columns, replace_value='Patient had no Undergoes')

# nurse_columns_und = ['UndergoesNurseID']
# df = fill_null_without_condition_column(data=df, replace_columns=nurse_columns_und, replace_value='Undergoes had no Nurse')    

# ---------- Undergoes -> Stay
# df, df_abnormal_temp = fill_null_with_condition_column(
#     data=df, 
#     condition_column='UndergoesStayID',
#     replace_columns=['StayID', 'StayPatient', 'StayRoom', 'StayStart', 'StayEnd'],
#     replace_value='No stay recorded',
#     note_for_abnormal='Stay details present without StayID in Undergoes',
# )
# df_abnormal = pd.concat([df_abnormal, df_abnormal_temp])

# ---------- Stay -> Room
df, df_abnormal_temp = fill_null_with_condition_column(
    data=df, 
    condition_column='StayRoom',
    replace_columns=['RoomType', 'RoomUnavailable'],
    replace_value='No room details',
    note_for_abnormal='Room details present without Room reference in Stay',
)
df_abnormal = pd.concat([df_abnormal, df_abnormal_temp])

# ---------- Room -> Block
# df, df_abnormal_temp = fill_null_with_condition_column(
#     data=df, 
#     condition_column='RoomBlockCode',
#     replace_columns=['BlockFloor', 'BlockCode'],
#     replace_value='No block details',
#     note_for_abnormal='Block details present without Block reference in Room',
# )
# df_abnormal = pd.concat([df_abnormal, df_abnormal_temp])

# --------------------------------------------------------------  OnCall -> Nurse
df, df_abnormal_temp = fill_null_with_condition_column(
    data=df, 
    condition_column='OnCallNurse',
    replace_columns=['OnCallNurseName', 'OnCallNursePosition', 'OnCallNurseRegistered'],
    replace_value='No nurse details',
    note_for_abnormal='Nurse details present without being OnCall',
)
df_abnormal = pd.concat([df_abnormal, df_abnormal_temp])


# -------------------------------------------------------------- The Rest
# ---------- Procedure
procedure_columns = ['NewProcedureName']
df = fill_null_without_condition_column(data=df, replace_columns=procedure_columns, replace_value='No Procedure taken')

# ---------- On Call
oncall_columns = ['OnCallNurse', 'OnCallBlockFloor', 'OnCallBlockCode', 'OnCallStart', 'OnCallEnd',
       'OnCallNurseID', 'OnCallNurseName', 'OnCallNursePosition',
       'OnCallNurseRegistered']
df = fill_null_without_condition_column(data=df, replace_columns=oncall_columns, replace_value='No OnCall')
df.isnull().sum()


PatientSSN                       0
PatientName                      0
PatientAddress                   0
PatientPhone                     0
PatientInsuranceID               0
PatientPCP                       0
AppointmentID                    0
AppointmentPatient               0
AppointmentStart                 0
AppointmentEnd                   0
AppointmentExaminationRoom       0
AppointmentPhysicianID           0
AppointmentPhysicianName         0
AppointmentPhysicianPosition     0
AppointmentNurseID               0
AppointmentNurseName             0
AppointmentNursePosition         0
PrescribingPhysician             0
PrescriptionPatient              0
PrescriptionMedication           0
PrescriptionDate                63
PrescriptionAppointment          0
PrescriptionDose                 0
PrescribingPhysicianID           0
PrescribingPhysicianName         0
PrescribingPhysicianPosition     0
MedicationCode                   0
MedicationName                   0
MedicationBrand     

In [21]:
df_abnormal

Unnamed: 0,PatientSSN,PatientName,PatientAddress,PatientPhone,PatientInsuranceID,PatientPCP,AppointmentID,AppointmentPatient,AppointmentStart,AppointmentEnd,AppointmentExaminationRoom,AppointmentPhysicianID,AppointmentPhysicianName,AppointmentPhysicianPosition,AppointmentNurseID,AppointmentNurseName,AppointmentNursePosition,PrescribingPhysician,PrescriptionPatient,PrescriptionMedication,PrescriptionDate,PrescriptionAppointment,PrescriptionDose,PrescribingPhysicianID,PrescribingPhysicianName,PrescribingPhysicianPosition,MedicationCode,MedicationName,MedicationBrand,MedicationDescription,UndergoesPatient,UndergoesNewProcedure,UndergoesStayID,UndergoesDate,UndergoesPhysicianID,UndergoesNurseID,NewProcedureCode,NewProcedureName,NewProcedureCost,StayID,StayPatient,StayRoom,StayStart,StayEnd,RoomNumber,RoomType,RoomBlockFloor,RoomBlockCode,RoomUnavailable,BlockFloor,BlockCode,OnCallNurse,OnCallBlockFloor,OnCallBlockCode,OnCallStart,OnCallEnd,OnCallNurseID,OnCallNurseName,OnCallNursePosition,OnCallNurseRegistered


##### Duplicated values

In [22]:
def remove_duplicates(data, data_abnormal):
    df_duplicates = data[data.duplicated(keep=False)]

    # remove duplicates from df
    data = data.drop_duplicates(keep=False)

    # concatenate df_duplicates to df_abnormal
    data_abnormal = pd.concat([data_abnormal, df_duplicates], ignore_index=True)
    return data, data_abnormal

df, df_abnormal = remove_duplicates(df, df_abnormal)

In [23]:
df.shape

(93, 60)

In [24]:
df_abnormal

Unnamed: 0,PatientSSN,PatientName,PatientAddress,PatientPhone,PatientInsuranceID,PatientPCP,AppointmentID,AppointmentPatient,AppointmentStart,AppointmentEnd,AppointmentExaminationRoom,AppointmentPhysicianID,AppointmentPhysicianName,AppointmentPhysicianPosition,AppointmentNurseID,AppointmentNurseName,AppointmentNursePosition,PrescribingPhysician,PrescriptionPatient,PrescriptionMedication,PrescriptionDate,PrescriptionAppointment,PrescriptionDose,PrescribingPhysicianID,PrescribingPhysicianName,PrescribingPhysicianPosition,MedicationCode,MedicationName,MedicationBrand,MedicationDescription,UndergoesPatient,UndergoesNewProcedure,UndergoesStayID,UndergoesDate,UndergoesPhysicianID,UndergoesNurseID,NewProcedureCode,NewProcedureName,NewProcedureCost,StayID,StayPatient,StayRoom,StayStart,StayEnd,RoomNumber,RoomType,RoomBlockFloor,RoomBlockCode,RoomUnavailable,BlockFloor,BlockCode,OnCallNurse,OnCallBlockFloor,OnCallBlockCode,OnCallStart,OnCallEnd,OnCallNurseID,OnCallNurseName,OnCallNursePosition,OnCallNurseRegistered


##### Creating primary key

In [25]:
#  creating key
df['InteractionID'] = range(1, len(df) + 1)

#  placing at the first place
df.insert(0, 'InteractionID', df.pop('InteractionID'))
df.head(3)

Unnamed: 0,InteractionID,PatientSSN,PatientName,PatientAddress,PatientPhone,PatientInsuranceID,PatientPCP,AppointmentID,AppointmentPatient,AppointmentStart,AppointmentEnd,AppointmentExaminationRoom,AppointmentPhysicianID,AppointmentPhysicianName,AppointmentPhysicianPosition,AppointmentNurseID,AppointmentNurseName,AppointmentNursePosition,PrescribingPhysician,PrescriptionPatient,PrescriptionMedication,PrescriptionDate,PrescriptionAppointment,PrescriptionDose,PrescribingPhysicianID,PrescribingPhysicianName,PrescribingPhysicianPosition,MedicationCode,MedicationName,MedicationBrand,MedicationDescription,UndergoesPatient,UndergoesNewProcedure,UndergoesStayID,UndergoesDate,UndergoesPhysicianID,UndergoesNurseID,NewProcedureCode,NewProcedureName,NewProcedureCost,StayID,StayPatient,StayRoom,StayStart,StayEnd,RoomNumber,RoomType,RoomBlockFloor,RoomBlockCode,RoomUnavailable,BlockFloor,BlockCode,OnCallNurse,OnCallBlockFloor,OnCallBlockCode,OnCallStart,OnCallEnd,OnCallNurseID,OnCallNurseName,OnCallNursePosition,OnCallNurseRegistered
0,1,100000001,John Smith,42 Foobar Lane,555-0256,68476213,1,13216584,100000001,2008-04-24 10:00:00,2008-04-24 11:00:00,A,1,John Dorian,Staff Internist,101.0,Carla Espinosa,Head Nurse,1.0,100000001.0,1.0,2008-04-24 10:47:00,13216584.0,5,1.0,John Dorian,Staff Internist,1.0,Procrastin-X,X,,100000001.0,2.0,3215.0,2008-05-03,7.0,101.0,2.0,Obtuse Pyloric Recombobulation,3750.0,3215.0,100000001.0,111.0,2008-05-01,2008-05-04,111.0,Single,1.0,2.0,False,1.0,2.0,101.0,1.0,2.0,2008-11-04 11:00:00,2008-11-04 19:00:00,101.0,Carla Espinosa,Head Nurse,True
1,2,100000001,John Smith,42 Foobar Lane,555-0256,68476213,1,13216584,100000001,2008-04-24 10:00:00,2008-04-24 11:00:00,A,1,John Dorian,Staff Internist,101.0,Carla Espinosa,Head Nurse,1.0,100000001.0,1.0,2008-04-24 10:47:00,13216584.0,5,1.0,John Dorian,Staff Internist,1.0,Procrastin-X,X,,100000001.0,2.0,3215.0,2008-05-03,7.0,101.0,2.0,Obtuse Pyloric Recombobulation,3750.0,3215.0,100000001.0,111.0,2008-05-01,2008-05-04,111.0,Single,1.0,2.0,False,1.0,2.0,103.0,1.0,2.0,2008-11-04 19:00:00,2008-11-05 03:00:00,103.0,Paul Flowers,Nurse,False
2,3,100000001,John Smith,42 Foobar Lane,555-0256,68476213,1,13216584,100000001,2008-04-24 10:00:00,2008-04-24 11:00:00,A,1,John Dorian,Staff Internist,101.0,Carla Espinosa,Head Nurse,1.0,100000001.0,1.0,2008-04-24 10:47:00,13216584.0,5,1.0,John Dorian,Staff Internist,1.0,Procrastin-X,X,,100000001.0,2.0,3215.0,2008-05-03,7.0,101.0,2.0,Obtuse Pyloric Recombobulation,3750.0,3215.0,100000001.0,111.0,2008-05-01,2008-05-04,111.0,Single,1.0,2.0,False,2.0,2.0,-1.0,-1.0,-1.0,NaT,NaT,-1.0,No nurse details,No nurse details,No nurse details


#### Close connection with HospitalOperation

In [26]:
conn_hospital.close()

### **3. Load patient data into new database PatientInteraction**

#### Create new database PatientInteraction

In [27]:
# establish connecttion to master database
conn_master = connect_database(database='master')
cursor_master = conn_master.cursor()

drop_database_if_exist(database='PatientInteraction', conn=conn_master, cursor=cursor_master)

Connection to 'master' established
Database 'PatientInteraction' dropped successfully if it existed.


In [28]:
create_new_database('PatientInteraction', conn=conn_master, cursor=cursor_master)

Database 'PatientInteraction' created successfully.


In [29]:
conn_master.close()

#### Connect to PatientInteraction

In [30]:
conn_patient = connect_database('PatientInteraction')
cursor_patient = conn_patient.cursor()

Connection to 'PatientInteraction' established


#### Create table using txt file

In [31]:
code_create_patient_table = load_sql_from_txt('tabel_creation_patient_interaction.txt')
execute_list_of_sql_commands(code_create_patient_table, conn=conn_patient, cursor=cursor_patient)

Commands excuted successfully


#### Push data to PatientInteraction

In [32]:
import pandas as pd
from sqlalchemy import create_engine
import urllib.parse

# User credentials
username = "sa"
password = "rainscales@2024"  # Password with special character
host = "localhost"
port = "1433"
database = "PatientInteraction"
# URL-encode the password
encoded_password = urllib.parse.quote_plus(password)
# Correct connection string
connection_string = f"mssql+pyodbc://{username}:{encoded_password}@{host}:{port}/{database}?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes"

# Function to load data into SQL
def load_data_to_sql(df, table_name, connection_string):
    try:
        # Create an SQLAlchemy engine
        engine = create_engine(connection_string)

        # Load data into the SQL table
        df.to_sql(table_name, con=engine, if_exists='append', index=False)

        print(f"Data loaded successfully into table '{table_name}'.")
    except Exception as e:
        print(f"An error occurred while loading data: {e}")

# Load the data into the SQL table
load_data_to_sql(df, table_name="Interaction", connection_string=connection_string)

Data loaded successfully into table 'Interaction'.


In [33]:
conn_patient.close()

In [34]:
df.to_csv('Patient_Interaction.csv', index=False)