In [168]:
pip install tabulate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [32]:
import sqlite3
import pandas as pd
import os
from tabulate import tabulate
from datetime import datetime

In [33]:
# Resolve the project root: the 'Healthcare_ETL_Project' folder located one
# level above the directory the notebook is being run from.
current_dir = os.getcwd()
project_root = os.path.abspath(
    os.path.join(current_dir, '..', 'Healthcare_ETL_Project')
)

# The SQLite file is expected at <project_root>/db/healthcare_data.db.
db_dir = os.path.join(project_root, "db")
db_path = os.path.join(db_dir, "healthcare_data.db")

# Open the connection only when the file is already on disk; otherwise stop
# with an error that names the path that was checked.
if os.path.exists(db_path):
    conn = sqlite3.connect(db_path)
    print(f"Database connection successful: {db_path}")
else:
    raise FileNotFoundError(f"Database not found at {db_path}")


Database connection successful: /Users/avinashmacbookair/Documents/TREND Health Partners 2/Healthcare_ETL_Project/db/healthcare_data.db


In [34]:
# Example query: load every row of the PROVIDER table into a DataFrame.
query = "SELECT * FROM PROVIDER;"
df = pd.read_sql_query(sql=query, con=conn)

# Print a psql-style text table, or a notice when the query returned no rows.
if df.empty:
    print("No data returned from the query.")
else:
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))

+--------------+---------------+--------------+-------------+-----------------+-------------------+---------------------------------------+--------------+------------+--------------+
|   Version_ID |   Provider_ID | First_Name   | Last_Name   |   Speciality_Id | Speciality_Name   | Affiliated_Hospital                   | Valid_From   | Valid_To   |   Is_Current |
|--------------+---------------+--------------+-------------+-----------------+-------------------+---------------------------------------+--------------+------------+--------------|
|            1 |             1 | Nandini      | Srivastava  |               8 | Radiology         | Mayo Clinic                           | 2025-04-27   |            |            1 |
|            2 |             2 | Aashi        | Devi        |               1 | Cardiology        | Cleveland Clinic                      | 2025-04-27   |            |            1 |
|            3 |             3 | Madhavi      | Ahluwalia   |              12 | Nephr

In [13]:
# Point the current record of one provider at a new hospital, then list every
# stored row for that provider.
# NOTE(review): the recorded output shows a second Version_ID row appearing
# after this UPDATE, so the table presumably has a versioning trigger —
# confirm. If so, re-running this cell may add further version rows.
provider_id = 1
new_hospital = 'Stark Hospitals'

cursor = conn.cursor()

# `with conn` commits when the block succeeds and rolls back if the statement
# raises, so a failed UPDATE cannot leave an open transaction on the shared
# connection. Values are bound as parameters instead of being embedded in SQL.
with conn:
    cursor.execute(
        """
        UPDATE PROVIDER
        SET Affiliated_Hospital = ?
        WHERE Provider_ID = ? AND Is_Current = 1
        """,
        (new_hospital, provider_id),
    )

df = pd.read_sql_query(
    "SELECT * FROM PROVIDER WHERE Provider_ID = ?",
    conn,
    params=(provider_id,),
)
print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))


+--------------+---------------+--------------+-------------+-----------------+-------------------+-----------------------+---------------------+---------------------+--------------+
|   Version_ID |   Provider_ID | First_Name   | Last_Name   |   Speciality_Id | Speciality_Name   | Affiliated_Hospital   | Valid_From          | Valid_To            |   Is_Current |
|--------------+---------------+--------------+-------------+-----------------+-------------------+-----------------------+---------------------+---------------------+--------------|
|            1 |             1 | Nandini      | Srivastava  |               8 | Radiology         | Mayo Clinic           | 2025-04-26          | 2025-04-27 12:20:26 |            0 |
|           10 |             1 | Nandini      | Srivastava  |               8 | Radiology         | Stark Hospitals       | 2025-04-27 12:20:26 |                     |            1 |
+--------------+---------------+--------------+-------------+-----------------+------

In [15]:

# Total number of rows in TREATMENT.
# An aggregate without GROUP BY always yields exactly one row, so no LIMIT
# clause and no empty-result branch are needed.
query = "SELECT COUNT(*) AS Total_Records FROM TREATMENT;"
df = pd.read_sql_query(query, conn)

# Display the result as a nicely formatted table
print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))

+-----------------+
|   Total_Records |
|-----------------|
|          350000 |
+-----------------+


In [17]:
# Mean Treatment_Duration for each treatment Type.
query = """
WITH TreatmentSummary AS (
    SELECT Type, 
           AVG(Treatment_Duration) AS Avg_Treatment_Duration
    FROM TREATMENT
    GROUP BY Type
)
SELECT * FROM TreatmentSummary;
"""
df = pd.read_sql_query(sql=query, con=conn)

# Print either the formatted table or a notice that nothing was returned.
print(
    "No data returned from the query."
    if df.empty
    else tabulate(df, headers='keys', tablefmt='psql', showindex=False)
)


+-----------------+--------------------------+
| Type            |   Avg_Treatment_Duration |
|-----------------+--------------------------|
| pharmacological |                  4.48511 |
| preventive      |                  4.49862 |
| surgical        |                  4.49565 |
| therapeutic     |                  4.49916 |
+-----------------+--------------------------+


In [18]:
# Sum of Cost grouped by the year of Outcome_Date and by Outcome_Quarter.
query = """
WITH CostSummary AS (
    SELECT 
        strftime('%Y', Outcome_Date) AS Outcome_Year,
        Outcome_Quarter, 
        SUM(Cost) AS Total_Cost
    FROM TREATMENT
    GROUP BY Outcome_Year, Outcome_Quarter
)
SELECT * FROM CostSummary;
"""
df = pd.read_sql_query(sql=query, con=conn)

# Process-wide pandas display option: floats rendered by pandas use two
# decimals rather than scientific notation. This stays in effect for every
# cell executed afterwards.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Replace the numeric totals with comma-grouped, two-decimal strings.
df['Total_Cost'] = df['Total_Cost'].apply('{:,.2f}'.format)

# Show the table, or a notice when the query returned no rows.
if df.empty:
    print("No data returned from the query.")
else:
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))


+----------------+-------------------+------------------+
|   Outcome_Year |   Outcome_Quarter | Total_Cost       |
|----------------+-------------------+------------------|
|           2024 |                 1 | 4,667,369,635.31 |
|           2024 |                 2 | 5,185,229,884.95 |
|           2024 |                 3 | 537,295,281.94   |
+----------------+-------------------+------------------+


In [19]:
# The Outcome_Day value that occurs most often in TREATMENT (top one only).
query = """
WITH TreatmentDays AS (
    SELECT Outcome_Day, 
           COUNT(*) AS Frequency
    FROM TREATMENT
    GROUP BY Outcome_Day
)
SELECT Outcome_Day
FROM TreatmentDays
ORDER BY Frequency DESC
LIMIT 1;
"""
df = pd.read_sql_query(sql=query, con=conn)

# Print either the formatted table or a notice that nothing was returned.
print(
    "No data returned from the query."
    if df.empty
    else tabulate(df, headers='keys', tablefmt='psql', showindex=False)
)


+---------------+
| Outcome_Day   |
|---------------|
| Monday        |
+---------------+


In [20]:
# Mean Effectiveness_Score per Disease_ID, skipping rows with no score.
query = """
WITH EffectivenessByDisease AS (
    SELECT Disease_ID, 
           AVG(Effectiveness_Score) AS Avg_Effectiveness_Score
    FROM TREATMENT
    WHERE Effectiveness_Score IS NOT NULL
    GROUP BY Disease_ID
)
SELECT * FROM EffectivenessByDisease;
"""
df = pd.read_sql_query(sql=query, con=conn)

# Show the table, or a notice when the query returned no rows.
if df.empty:
    print("No data returned from the query.")
else:
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))


+--------------+---------------------------+
|   Disease_ID |   Avg_Effectiveness_Score |
|--------------+---------------------------|
|            1 |                   2.52023 |
|            2 |                   2.46752 |
|            3 |                   2.49717 |
|            4 |                   2.52146 |
|            5 |                   2.48843 |
|           16 |                   2.48356 |
|           17 |                   2.5247  |
|           18 |                   2.50669 |
|           19 |                   2.48955 |
|           20 |                   2.46616 |
|           21 |                   2.52607 |
|           22 |                   2.52652 |
|           23 |                   2.47625 |
|           24 |                   2.5005  |
|           25 |                   2.46881 |
|           31 |                   2.46791 |
|           32 |                   2.49118 |
|           33 |                   2.50146 |
|           34 |                   2.5156  |
|         

In [21]:
# Per provider: how many treatments were recorded and their mean Cost.
query = """
WITH ProviderTreatmentStats AS (
    SELECT Provider_ID, 
           COUNT(*) AS Treatment_Count, 
           AVG(Cost) AS Avg_Cost
    FROM TREATMENT
    GROUP BY Provider_ID
)
SELECT * FROM ProviderTreatmentStats;
"""
df = pd.read_sql_query(sql=query, con=conn)

# Print either the formatted table or a notice that nothing was returned.
print(
    "No data returned from the query."
    if df.empty
    else tabulate(df, headers='keys', tablefmt='psql', showindex=False)
)


+---------------+-------------------+------------+
|   Provider_ID |   Treatment_Count |   Avg_Cost |
|---------------+-------------------+------------|
|             1 |             46720 |    29701.8 |
|             2 |             46793 |    29642.4 |
|             3 |             46412 |    29700   |
|             4 |             46719 |    29750   |
|             5 |             46635 |    29707.4 |
|             6 |             46521 |    29658.9 |
|             7 |             46887 |    29715.4 |
|             8 |             23313 |    29529.2 |
+---------------+-------------------+------------+


In [22]:
# Total Cost per calendar month of Outcome_Date, restricted to 2024 and 2025.
#
# The filter and grouping repeat the strftime expressions instead of using the
# output aliases, so they cannot be captured by a table column that happens to
# be named Year or Month. The ORDER BY sits on the outer SELECT because row
# order produced inside a CTE is not guaranteed to survive into the final
# result.
query = """
WITH MonthlyCost AS (
    SELECT
        strftime('%Y', Outcome_Date) AS Year,
        strftime('%m', Outcome_Date) AS Month,
        SUM(Cost) AS Total_Cost
    FROM TREATMENT
    WHERE strftime('%Y', Outcome_Date) IN ('2024', '2025')
    GROUP BY strftime('%Y', Outcome_Date), strftime('%m', Outcome_Date)
)
SELECT * FROM MonthlyCost
ORDER BY Year, Month;
"""

df = pd.read_sql_query(query, conn)

# Format the Total_Cost column as comma-grouped, two-decimal strings
df['Total_Cost'] = df['Total_Cost'].apply(lambda x: '{:,.2f}'.format(x))

# Display the result
if not df.empty:
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))
else:
    print("No data returned from the query.")


+--------+---------+------------------+
|   Year |   Month | Total_Cost       |
|--------+---------+------------------|
|   2024 |      01 | 1,255,075,684.23 |
|   2024 |      02 | 1,650,916,170.90 |
|   2024 |      03 | 1,761,377,780.18 |
|   2024 |      04 | 1,710,797,772.09 |
|   2024 |      05 | 1,765,416,041.55 |
|   2024 |      06 | 1,709,016,071.31 |
|   2024 |      07 | 537,295,281.94   |
+--------+---------+------------------+


In [29]:

# How many doctors have changed their affiliated hospital in the last 6 months
#
# Each PROVIDER row is compared with the same provider's previous row, and a
# provider is counted only when a row that starts within the last 6 months
# carries a different hospital than the row before it. Filtering on "has ever
# had more than one hospital" plus "has any recent row" would also count
# providers whose hospital change is older than 6 months.
#
# LAG is a window function and needs SQLite 3.25 or newer.
# NOTE(review): rows are ordered by Valid_From, then Version_ID; this assumes
# both increase with each new version of a provider — confirm against the
# table definition.
query = """
WITH HospitalHistory AS (
    SELECT
        Provider_ID,
        Affiliated_Hospital,
        Valid_From,
        LAG(Affiliated_Hospital) OVER (
            PARTITION BY Provider_ID
            ORDER BY Valid_From, Version_ID
        ) AS Previous_Hospital
    FROM PROVIDER
)
SELECT COUNT(DISTINCT Provider_ID) AS doctors_changed_hospital
FROM HospitalHistory
WHERE Previous_Hospital IS NOT NULL
  AND Affiliated_Hospital <> Previous_Hospital
  AND Valid_From >= date('now', '-6 months');
"""

df = pd.read_sql_query(query, conn)

# A COUNT without GROUP BY always returns exactly one row, so the table can be
# printed unconditionally.
print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))


+----------------------------+
|   doctors_changed_hospital |
|----------------------------|
|                          2 |
+----------------------------+


In [30]:

# Details of doctors who have changed their affiliated hospital in the last
# 6 months
#
# Each PROVIDER row is compared with the same provider's previous row, and a
# provider is listed only when a row that starts within the last 6 months
# carries a different hospital than the row before it. Filtering on "has ever
# had more than one hospital" plus "has any recent row" would also list
# providers whose hospital change is older than 6 months.
#
# LAG is a window function and needs SQLite 3.25 or newer.
# NOTE(review): rows are ordered by Valid_From, then Version_ID; this assumes
# both increase with each new version of a provider — confirm against the
# table definition.
query = """
WITH HospitalHistory AS (
    SELECT
        Provider_ID,
        First_Name,
        Last_Name,
        Affiliated_Hospital,
        Valid_From,
        LAG(Affiliated_Hospital) OVER (
            PARTITION BY Provider_ID
            ORDER BY Valid_From, Version_ID
        ) AS Previous_Hospital
    FROM PROVIDER
)
SELECT DISTINCT Provider_ID, First_Name, Last_Name
FROM HospitalHistory
WHERE Previous_Hospital IS NOT NULL
  AND Affiliated_Hospital <> Previous_Hospital
  AND Valid_From >= date('now', '-6 months')
ORDER BY Provider_ID;
"""

df = pd.read_sql_query(query, conn)

# Display the result as a nicely formatted table
if not df.empty:
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))
else:
    print("No data returned from the query.")


+---------------+--------------+-------------+
|   Provider_ID | First_Name   | Last_Name   |
|---------------+--------------+-------------|
|             1 | Nandini      | Srivastava  |
|             8 | Sarika       | Zutshi      |
+---------------+--------------+-------------+


In [35]:
# Final step: release `conn`, the SQLite connection created with
# sqlite3.connect earlier in this notebook.
conn.close()  # Close the connection after all queries are done