### **Connect to the PatientInteraction database**

In [1]:
import pyodbc
import json
import pandas as pd 
import warnings
import urllib.parse
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and pandas/DB-driver warnings are hidden too.
warnings.filterwarnings('ignore')
# Display all columns of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Display at most 65 rows before pandas truncates the output.
pd.set_option('display.max_rows', 65)


def connect_database(database, server='localhost,1433', driver='{ODBC Driver 18 for SQL Server}', username='sa', password='rainscales@2024'):
    """Open a pyodbc connection to a SQL Server database and return it.

    Args:
        database (str): Name of the database, placed in the DATABASE field.
        server (str): Server address, placed in the SERVER field.
        driver (str): ODBC driver name, placed in the DRIVER field.
        username (str): Login, placed in the UID field.
        password (str): Password, placed in the PWD field.

    Returns:
        The connection object returned by ``pyodbc.connect``.
    """
    # NOTE(review): the default login and password are hardcoded in the
    # signature; consider supplying them from the environment instead.
    # TrustServerCertificate=yes is always appended to the connection string.
    connection_string = (
        f'DRIVER={driver};SERVER={server};DATABASE={database};'
        f'UID={username};PWD={password};TrustServerCertificate=yes'
    )
    connection = pyodbc.connect(connection_string)
    print(f"Connection to '{database}' established")
    return connection

# Open the notebook-wide connection to the PatientInteraction database
# and create a cursor on it.
conn_patient = connect_database('PatientInteraction')
cursor_patient = conn_patient.cursor()

Connection to 'PatientInteraction' established


### **1. Update data in Transforming Stage**

#### Get data in Transforming Stage

In [2]:
# Load the interaction extract from a CSV file in the working directory.
df = pd.read_csv('Patient_Interaction.csv')
# Show how often each PrescribingPhysicianID value occurs (this is the cell's displayed output).
df['PrescribingPhysicianID'].value_counts()

PrescribingPhysicianID
-1.0    63
 1.0    15
 9.0    15
Name: count, dtype: int64

#### Update missing data in ID columns from -1 to 0

In [3]:
def update_id_columns(df):
    """Replace the -1 placeholder with 0 in every ID column of ``df``.

    A column is treated as an ID column when the case-sensitive membership
    test ``'ID' in label`` is true for its label. Labels that do not support
    that test (for example integer labels) are skipped instead of raising
    TypeError.

    Args:
        df (pd.DataFrame): Frame whose ID columns should be cleaned.

    Returns:
        pd.DataFrame: The same ``df`` object; its ID columns are reassigned
        in place, so the caller's frame is modified as well.
    """
    def _is_id_label(label):
        # Non-container labels such as ints cannot be searched for 'ID'.
        try:
            return 'ID' in label
        except TypeError:
            return False

    id_columns = [col for col in df.columns if _is_id_label(col)]

    # Replace -1 with 0 in these columns
    for column in id_columns:
        df[column] = df[column].replace(-1, 0)

    return df

# Rebind df to the frame returned by update_id_columns, which replaces -1 with 0 in the ID columns.
df = update_id_columns(df)
# Re-display the value counts so the effect of the replacement is visible.
df['PrescribingPhysicianID'].value_counts()

PrescribingPhysicianID
0.0    63
1.0    15
9.0    15
Name: count, dtype: int64

In [6]:
# Count the missing values in each column of df.
df.isnull().sum()

InteractionID                    0
PatientSSN                       0
PatientName                      0
PatientAddress                   0
PatientPhone                     0
PatientInsuranceID               0
PatientPCP                       0
AppointmentID                    0
AppointmentPatient               0
AppointmentStart                 0
AppointmentEnd                   0
AppointmentExaminationRoom       0
AppointmentPhysicianID           0
AppointmentPhysicianName         0
AppointmentPhysicianPosition     0
AppointmentNurseID               0
AppointmentNurseName             0
AppointmentNursePosition         0
PrescribingPhysician             0
PrescriptionPatient              0
PrescriptionMedication           0
PrescriptionDate                63
PrescriptionAppointment          0
PrescriptionDose                 0
PrescribingPhysicianID           0
PrescribingPhysicianName         0
PrescribingPhysicianPosition     0
MedicationCode                   0
MedicationName      

#### Update database

In [4]:
from sqlalchemy import create_engine, text
import pandas as pd

def _quote_identifier(name):
    """Bracket-quote one T-SQL identifier, doubling any closing bracket."""
    return "[" + str(name).replace("]", "]]") + "]"


def _build_merge_query(columns, table_name, staging_table, primary_key):
    """Build the T-SQL MERGE statement that syncs ``table_name`` from ``staging_table``."""
    key = _quote_identifier(primary_key)
    all_columns = [_quote_identifier(col) for col in columns]
    value_columns = [_quote_identifier(col) for col in columns if col != primary_key]

    matched_clause = ""
    if value_columns:
        # `<>` evaluates to UNKNOWN when either side is NULL, so a value that
        # changes to or from NULL must be detected with explicit IS NULL checks.
        changed = "\n    OR ".join(
            f"target.{col} <> source.{col}"
            f" OR (target.{col} IS NULL AND source.{col} IS NOT NULL)"
            f" OR (target.{col} IS NOT NULL AND source.{col} IS NULL)"
            for col in value_columns
        )
        assignments = ", ".join(f"{col} = source.{col}" for col in value_columns)
        matched_clause = (
            f"WHEN MATCHED AND (\n    {changed}\n)\n"
            f"THEN UPDATE SET\n    {assignments}\n"
        )
    # With no non-key columns there is nothing to update, so the WHEN MATCHED
    # clause is omitted rather than emitted with an empty condition.

    insert_columns = ", ".join(all_columns)
    insert_values = ", ".join(f"source.{col}" for col in all_columns)
    return (
        f"MERGE INTO {table_name} AS target\n"
        f"USING {staging_table} AS source\n"
        f"ON target.{key} = source.{key}\n"
        f"{matched_clause}"
        f"WHEN NOT MATCHED BY TARGET THEN\n"
        f"    INSERT ({insert_columns})\n"
        f"    VALUES ({insert_values});"
    )


def update_changed_records(df, table_name, primary_key, connection_string):
    """
    Updates records in the database table that differ from the DataFrame.

    The DataFrame is written to a staging table named ``<table_name>_staging``
    (replacing any existing table of that name), merged into the target table
    with a T-SQL MERGE, and the staging table is dropped afterwards, even when
    the MERGE fails. Rows whose key is missing from the target are inserted;
    rows whose key matches are updated only when at least one non-key column
    differs, with NULL treated as a comparable value.

    The MERGE and the DROP each run inside ``engine.begin()`` so they are
    committed; with a plain ``engine.connect()`` the MERGE is rolled back when
    the connection is closed.

    Table and column names are interpolated into the SQL text (identifiers
    cannot be bound as parameters), so they must come from trusted code.
    Column names are bracket-quoted; ``table_name`` is used verbatim.

    Args:
        df (pd.DataFrame): The updated DataFrame.
        table_name (str): The name of the target table in the database.
        primary_key (str): The primary key column used to identify records.
        connection_string (str): The SQLAlchemy connection string for the database.

    Returns:
        None

    Raises:
        ValueError: If ``primary_key`` is not a column of ``df``.
    """
    if primary_key not in df.columns:
        raise ValueError(f"Primary key column '{primary_key}' not found in DataFrame columns")

    # Create SQLAlchemy engine
    engine = create_engine(connection_string)

    # Define the staging table name
    staging_table = f"{table_name}_staging"

    try:
        # Step 1: Push DataFrame to the staging table
        df.to_sql(staging_table, con=engine, if_exists="replace", index=False)
        print(f"Staging table '{staging_table}' created.")

        try:
            # Step 2: Use MERGE to update only the changed records
            merge_query = text(
                _build_merge_query(df.columns, table_name, staging_table, primary_key)
            )
            with engine.begin() as connection:
                connection.execute(merge_query)
            print("Database table updated.")
        finally:
            # Step 3: Drop the staging table, whether or not the MERGE succeeded
            with engine.begin() as connection:
                connection.execute(text(f"DROP TABLE IF EXISTS {staging_table}"))
            print(f"Staging table '{staging_table}' dropped.")
    finally:
        # The engine is created per call, so release its connection pool here.
        engine.dispose()



# Connection settings for the SQLAlchemy engine.
# NOTE(review): the login and password are hardcoded here (the same values are the
# defaults of connect_database); consider reading them from the environment instead.
username = "sa"
password = "rainscales@2024"  # contains '@', which would otherwise be read as the user/host separator in the URL
host = "localhost"
port = "1433"
database = "PatientInteraction"
# URL-encode the password so its special characters are safe inside the connection URL
encoded_password = urllib.parse.quote_plus(password)
# Build the mssql+pyodbc URL; the ODBC driver name and TrustServerCertificate are passed as query parameters
connection_string = f"mssql+pyodbc://{username}:{encoded_password}@{host}:{port}/{database}?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes"
# Push the cleaned DataFrame into the Interaction table, matching rows on InteractionID
update_changed_records(df=df, table_name='Interaction', primary_key='InteractionID', connection_string=connection_string)


Staging table 'Interaction_staging' created.
Database table updated.
Staging table 'Interaction_staging' dropped.


#### **Check for updates in database**

In [5]:
# Re-read the Interaction table through the pyodbc connection so the check
# reflects what is stored in the database, not the in-memory df.
query = """
    SELECT * FROM Interaction
"""
df_patient = pd.read_sql(query, con=conn_patient)
# Inspect the frame that was just read back; inspecting df here would only
# repeat the local result and verify nothing about the database.
df_patient['PrescribingPhysicianID'].value_counts()

PrescribingPhysicianID
0.0    63
1.0    15
9.0    15
Name: count, dtype: int64