### **Python Data Engineering Project**
##### *-- Anh Vi Pham --*

### **Setup Database**

#### Connect Server

In [78]:
import pyodbc
import json
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

# Connection settings for the local SQL Server instance.
server = 'localhost,1433'
database = 'master'
driver = '{ODBC Driver 18 for SQL Server}'

# Read the login from a JSON file so no credentials are hard-coded in the
# notebook. The encoding is given explicitly because open() otherwise falls
# back to the platform locale, which can mis-decode non-ASCII characters.
with open("database_keys.json", "r", encoding="utf-8") as file:
    config = json.load(file)
username = config["username"]
password = config["password"]

# Establish the connection to 'master'; the HospitalOperation database is
# created from this connection further down.
# NOTE(review): TrustServerCertificate=yes presumably skips validation of the
# server certificate -- fine for localhost, confirm before using elsewhere.
conn = pyodbc.connect(f'DRIVER={driver};SERVER={server};DATABASE={database};UID={username};PWD={password};TrustServerCertificate=yes')
cursor = conn.cursor()

#### Create new database

In [79]:
def get_current_database(cursor=None):
    """Print the name of the database the given cursor is connected to.

    Args:
        cursor: Cursor to run ``SELECT DB_NAME()`` on. When omitted, the
            notebook-level ``cursor`` is looked up at call time. The previous
            ``cursor=cursor`` default was captured once at definition time and
            kept pointing at the old cursor after the connection was closed
            and reopened.

    Returns:
        None. The database name is only printed.
    """
    if cursor is None:
        # The parameter shadows the notebook-level name, so fetch it explicitly.
        cursor = globals()["cursor"]
    cursor.execute("SELECT DB_NAME() AS CurrentDatabase")
    current_db = cursor.fetchone()[0]
    print(f"Currently connected to database: {current_db}")
    return None
# Print which database the cursor opened above is connected to.
get_current_database(cursor)

Currently connected to database: master


In [80]:
# Enable autocommit before issuing the DROP below.
# NOTE(review): presumably needed because DROP DATABASE cannot run inside an
# open transaction -- confirm. autocommit is left on here; the create cell
# that follows sets it back to False.
conn.autocommit = True

# Start from a clean slate: remove HospitalOperation when it already exists.
cursor.execute(
    "IF EXISTS (SELECT * FROM sys.databases WHERE name = 'HospitalOperation') "
    "DROP DATABASE HospitalOperation"
)
print("Database 'HospitalOperation' dropped successfully if it existed.")

Database 'HospitalOperation' dropped successfully if it existed.


In [81]:
# Create the HospitalOperation database with autocommit switched on, then
# switch autocommit back off for the rest of the notebook.
conn.autocommit = True
try:
    cursor.execute("CREATE DATABASE HospitalOperation")
except Exception as exc:
    # Any failure is reported but not re-raised, so the notebook keeps running.
    print(f"Error: {exc}")
else:
    print("Database 'HospitalOperation' created successfully.")

conn.autocommit = False

Database 'HospitalOperation' created successfully.


In [82]:
# Close the connection to the "master" database; the next cell opens a new
# connection that targets HospitalOperation.
conn.close()

#### Connect to new database

In [83]:
# Reconnect with the same server, driver and login, this time targeting the
# database created above.
new_database_name = 'HospitalOperation'
conn = pyodbc.connect(
    f'DRIVER={driver};SERVER={server};DATABASE={new_database_name};'
    f'UID={username};PWD={password};TrustServerCertificate=yes'
)
cursor = conn.cursor()

# Print the database the new cursor is connected to.
get_current_database(cursor)

Currently connected to database: HospitalOperation


#### Load and run the table-creation SQL from table_creation_code.txt

In [84]:
def load_sql_from_txt(file_name: str, mode='r'):
    """Read a SQL script and split it into individual statements.

    The file content is split on ';'. Inside each piece every newline is
    replaced by a space and surrounding whitespace is stripped. Empty pieces
    (for example the one after the final ';') are kept in the result.

    Args:
        file_name: Path of the text file holding the SQL statements.
        mode: Mode passed to ``open``.

    Returns:
        list of statement strings on success. On any exception the
        notebook-level ``conn`` is closed, a message is printed and None is
        returned.
    """
    try:
        with open(file_name, mode) as handle:
            pieces = handle.read().split(';')
            return [piece.replace('\n', ' ').strip() for piece in pieces]
    except Exception as e:
        conn.close()
        print(f'Failed to read commands from {file_name}  - \n Error: {e} \n Connection closed')
        return None

def execute_list_of_sql_commands(command_list, cursor, conn):
    """Execute a batch of SQL statements and commit them together.

    Empty entries in ``command_list`` are skipped. All remaining statements
    are executed on ``cursor`` and committed with one ``conn.commit()``.

    Args:
        command_list: Iterable of SQL statement strings.
        cursor: Cursor used to execute each statement.
        conn: Connection that is committed on success and closed on failure.

    Returns:
        None. A success or failure message is printed. On any exception the
        connection is closed and the error is not re-raised.
    """
    try:
        # filter(None, ...) drops the falsy (empty) statements.
        for statement in filter(None, command_list):
            cursor.execute(statement)
        conn.commit()
        print('Commands excuted successfully')
    except Exception as err:
        conn.close()
        print(f'Failed to execute commands - \n Error: {err} \n Connection closed')
    return None


In [85]:
# Read the statements stored in table_creation_code.txt into a list.
db_file = 'table_creation_code.txt'
table_creation_codes = load_sql_from_txt(db_file)

# Execute the loaded statements on the open connection and commit them.
execute_list_of_sql_commands(command_list=table_creation_codes, cursor=cursor, conn=conn)

Commands excuted successfully


#### Load data from sample_dataset.txt

In [86]:
# Read the statements stored in sample_dataset.txt into a list.
data_file = 'sample_dataset.txt'
data_codes = load_sql_from_txt(data_file)

# Execute the loaded statements on the open connection and commit them.
execute_list_of_sql_commands(command_list=data_codes, cursor=cursor, conn=conn)

Commands excuted successfully


### **Query**

In [90]:
# Print the connected database before running the queries below.
get_current_database(cursor)

Currently connected to database: HospitalOperation


In [87]:
def query(sql_code, engine=None):
    """Run a SQL statement and return the result as a pandas DataFrame.

    Args:
        sql_code: SQL text passed to ``pd.read_sql``.
        engine: Connection to run the statement on. When omitted, the
            notebook-level ``conn`` is looked up at call time. The previous
            ``engine = conn`` default was captured once at definition time
            and would keep pointing at an old connection after a reconnect.

    Returns:
        pandas.DataFrame holding the rows returned by the statement.
    """
    if engine is None:
        engine = conn
    return pd.read_sql(sql_code, engine)

In [107]:
# Collect the names of all base tables listed in information_schema.tables
# into a Python list, then report how many there are.
li_tables = query(
    """
    SELECT table_name 
    FROM information_schema.tables
    WHERE table_type = 'BASE TABLE';
    """ )['table_name'].to_list()
print('Number of tables:', len(li_tables))

Number of tables: 15


In [115]:
# Pull every row of the Appointment table into a DataFrame and display it.
df = query(
    """
    SELECT * FROM Appointment
    """
)
df

Unnamed: 0,AppointmentID,Patient,PrepNurse,Physician,Start,End,ExaminationRoom
0,13216584,100000001,101.0,1,2008-04-24 10:00:00,2008-04-24 11:00:00,A
1,26548913,100000002,101.0,2,2008-04-24 10:00:00,2008-04-24 11:00:00,B
2,36549879,100000001,102.0,1,2008-04-25 10:00:00,2008-04-25 11:00:00,A
3,46846589,100000004,103.0,4,2008-04-25 10:00:00,2008-04-25 11:00:00,B
4,59871321,100000004,,4,2008-04-26 10:00:00,2008-04-26 11:00:00,C
5,69879231,100000003,103.0,2,2008-04-26 11:00:00,2008-04-26 12:00:00,C
6,76983231,100000001,,3,2008-04-26 12:00:00,2008-04-26 13:00:00,C
7,86213939,100000004,102.0,9,2008-04-27 10:00:00,2008-04-21 11:00:00,A
8,93216548,100000002,101.0,2,2008-04-27 10:00:00,2008-04-27 11:00:00,B


In [None]:
def get_primary_key_columns(table_name, conn):
    """Return the names of the columns in a table's primary key.

    The lookup joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS to
    INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE on the constraint name and
    keeps the rows whose constraint type is 'PRIMARY KEY'.

    Args:
        table_name: Table to inspect. It is interpolated directly into the
            SQL text, so only pass trusted names.
        conn: Connection handed to ``query``.

    Returns:
        list with the values of the COLUMN_NAME column of the result.
    """
    pk_lookup_sql = f"""
    SELECT COLUMN_NAME 
    FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC
    JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE AS CCU 
    ON TC.CONSTRAINT_NAME = CCU.CONSTRAINT_NAME
    WHERE TC.TABLE_NAME = '{table_name}' AND TC.CONSTRAINT_TYPE = 'PRIMARY KEY'
    """
    return query(pk_lookup_sql, conn)['COLUMN_NAME'].tolist()

def check_primary_key(table_name, conn):
    """Report NULLs and duplicates in the primary key columns of a table.

    The original body stopped at an undefined name and raised NameError. This
    version produces the two reports visible in the notebook's recorded
    output: the rows whose primary key contains NULL, and the key values that
    occur more than once (with their count in a ``cnt`` column).

    Args:
        table_name: Unqualified name of the table to check.
        conn: Connection handed to ``get_primary_key_columns`` and ``query``.

    Returns:
        None. Both result frames are printed.
    """
    primary_key_columns = get_primary_key_columns(table_name, conn)
    if not primary_key_columns:
        # Without key columns the WHERE / GROUP BY clauses below would be empty.
        print(f"No primary key columns found for {table_name}")
        return None

    # Bracket-quote identifiers so names that are reserved words are accepted.
    quoted_table = f"[{table_name}]"
    quoted_keys = [f"[{column}]" for column in primary_key_columns]

    # Rows where any part of the key is NULL.
    null_filter = " OR ".join(f"{key} IS NULL" for key in quoted_keys)
    df_nulls = query(f"SELECT * FROM {quoted_table} WHERE {null_filter}", conn)
    print(f"Null values in primary key columns of {table_name}: {df_nulls}")

    # Key values that appear in more than one row.
    key_list = ", ".join(quoted_keys)
    df_duplicates = query(
        f"SELECT {key_list}, COUNT(*) AS cnt FROM {quoted_table} "
        f"GROUP BY {key_list} HAVING COUNT(*) > 1",
        conn,
    )
    print(f"Duplicate values in primary key columns of {table_name}: {df_duplicates}")
    return None


In [119]:
# Run the primary key check on the Appointment table.
table_name = 'Appointment'
check_primary_key(table_name, conn)

Null values in primary key columns of Appointment: Empty DataFrame
Columns: [AppointmentID, Patient, PrepNurse, Physician, Start, End, ExaminationRoom]
Index: []
Duplicate values in primary key columns of Appointment: Empty DataFrame
Columns: [AppointmentID, cnt]
Index: []


### **Close Connection**

In [89]:
# Release the database connection now that all cells above have run.
conn.close()