In [1]:
import os

import numpy as np
import sqlite3
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib.font_manager import FontProperties


In [2]:
# Location of the datathon SQLite database, relative to the user's home directory.
dthon_path = os.path.join(os.path.expanduser('~'), 'Projects','datasci')
dthon_db = os.path.join(dthon_path, "melbourne-datathon","data","medi.db")
# sqlite3.connect() silently creates a new, empty database when the file does not
# exist, which only surfaces later as confusing "no such table" errors -- fail fast.
if not os.path.isfile(dthon_db):
    raise FileNotFoundError("SQLite database not found: {}".format(dthon_db))
# Run this first to create database connection
conn = sqlite3.connect(dthon_db)

In [None]:
# Which painkillers are listed under ChronicIllness_LookUp?
# Selects the Drug_LookUp rows whose ATCLevel3Code is 'N02A' (opioids) and whose
# MasterProductID also appears in ChronicIllness_Lookup.
sql = """
    SELECT * FROM Drug_LookUp
    WHERE ATCLevel3Code = 'N02A'
    AND MasterProductID IN (
        SELECT MasterProductID
        FROM ChronicIllness_Lookup
    )
"""

# Load the result into a DataFrame and preview the first rows
df = pd.read_sql_query(sql, conn)
df.head()

In [None]:
# What type of painkillers do chronic sufferers get?
# Counts transactions per (drug, chronic illness) pair for drugs with
# ATCLevel3Code 'N02A'. Rows are ordered by ascending count, so tail() below
# shows the pairs with the most transactions.
# The join to implied_condition is on Patient_ID only, so a patient with several
# rows in that table contributes each transaction once per row.
# NOTE(review): the inner join on implied_condition already limits the result to
# patients present in that table, so the Patient_ID IN (...) filter looks
# redundant -- confirm before simplifying.
sql = """
    SELECT t.Drug_ID, MasterProductFullName, ChronicIllness, count(*) AS 'transactions'
    FROM transactions t
    JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    JOIN implied_condition ic ON ic.Patient_ID = t.Patient_ID
    WHERE t.Patient_ID IN ( -- Only want patients with a chronic illness
        SELECT Patient_ID
        FROM implied_condition
        GROUP BY Patient_ID
    )
    AND Drug_ID IN (
        SELECT MasterProductID
        FROM Drug_LookUp
        WHERE ATCLevel3Code = 'N02A'
    )
    GROUP BY t.Drug_ID, ChronicIllness
    ORDER BY transactions
--    GROUP BY t.Patient_ID, t.Drug_ID, ChronicIllness
"""
df = pd.read_sql_query(sql, conn)
df.tail()

In [None]:
# Volume of painkiller transactions for each chronic illness.
# Result columns:
#   ChronicIllness - the illness label from implied_condition
#   transactions   - number of joined transaction rows for 'N02A' drugs
#   patients       - number of implied_condition rows with that illness
#                    (correlated subquery; presumably one row per patient per
#                    illness -- TODO confirm against the table)
# Rows are ordered by ascending transaction count. The resulting `df` is what
# the scatter plot in the next cell reads (columns `patients`, `transactions`).
sql = """
    SELECT ic.ChronicIllness, count(*) AS 'transactions',
    (
        SELECT count(*) FROM implied_condition WHERE ChronicIllness = ic.ChronicIllness
    ) AS patients
    FROM transactions t
    JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    JOIN implied_condition ic ON ic.Patient_ID = t.Patient_ID
    WHERE t.Patient_ID IN ( -- Only want patients with a chronic illness
        SELECT Patient_ID
        FROM implied_condition
        GROUP BY Patient_ID
    )
    AND Drug_ID IN (
        SELECT MasterProductID
        FROM Drug_LookUp
        WHERE ATCLevel3Code = 'N02A'
    )
    GROUP BY ChronicIllness
    ORDER BY transactions
--    GROUP BY t.Patient_ID, t.Drug_ID, ChronicIllness
"""
df = pd.read_sql_query(sql, conn)
df.tail()

In [None]:
# Scatter of painkiller transaction volume against patient count,
# one marker (and legend entry) per chronic illness.
groups = df.groupby('ChronicIllness')

fig, ax = pl.subplots(figsize=(8, 8))
ax.margins(0.05)  # 5% padding around the autoscaled data limits
for side in ("top", "right"):
    # Drop the frame lines that carry no axis
    ax.spines[side].set_visible(False)

ax.set_xlabel("Number of Patients", fontsize=16)
ax.set_ylabel("Number of Transactions", fontsize=16)

for name, group in groups:
    ax.plot(
        group.patients,
        group.transactions,
        marker='o',
        linestyle='',
        ms=12,
        label=name,
    )

ax.legend()

pl.show()

In [None]:
# Now the same in pandas: pull the un-aggregated painkiller transactions
# (one row per joined transaction) so the grouping can be done in pandas.
# Columns: Patient_ID, ChronicIllness, Drug_ID, MasterProductFullName,
# Prescription_Week.
# NOTE(review): NATURAL JOIN matches on every column name the two sides share;
# presumably that is only Patient_ID -- TODO confirm against the schema.
sql ="""
    SELECT Patient_ID, ChronicIllness, Drug_ID, MasterProductFullName, Prescription_Week
    FROM transactions t
    INNER JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    NATURAL JOIN implied_condition ic -- Only want patients with chronic illness
    WHERE ATCLevel3Code = 'N02A' -- Only want pain killers
"""
# %time reports how long the query takes to load into the dfp2 DataFrame
%time dfp2 = pd.read_sql_query(sql, conn)

In [None]:
# Mean number of painkiller transactions per (illness, patient) pair.
# Note that we cannot distinguish which painkillers apply to which illness.
txn_counts = dfp2.groupby(['ChronicIllness', 'Patient_ID'])['Drug_ID'].count()
txn_counts.mean()

In [None]:
# Boolean mask selecting the transactions prescribed during calendar year 2015.
# Built from dfp2, the transaction-level frame that carries Prescription_Week;
# the aggregated `dfp` frame is only created further down and has no such column.
# Half-open interval [2015-01-01, 2016-01-01) so a week starting on 1 Jan is
# kept, matching the >= / < bounds used by the 2016 query later in the notebook.
# Assumes Prescription_Week holds ISO 'YYYY-MM-DD' strings, which compare
# chronologically as text -- TODO confirm.
mask = (dfp2['Prescription_Week'] >= '2015-01-01') & (dfp2['Prescription_Week'] < '2016-01-01')

In [None]:
# Painkiller transactions vs. number of distinct patients, one labelled point
# per chronic illness, restricted to the rows selected by `mask`.
# Uses dfp2 (the transaction-level frame with Drug_ID and Prescription_Week);
# the aggregated `dfp` frame has neither column. `mask` must be built from the
# same frame so that the boolean index aligns.
groups = dfp2.loc[mask].groupby('ChronicIllness')


# Plot
fig, ax = pl.subplots(figsize=(8,8))
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

pl.title("Painkillers", fontsize=20)
pl.xlabel("Number of Patients", fontsize=16)
pl.ylabel("Number of Painkiller Transactions", fontsize=16)
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    x_pos = group.Patient_ID.unique().size  # distinct patients with this illness
    y_pos = group.Drug_ID.count()           # painkiller transactions for this illness

    # Default label offset (in data units) relative to the marker
    x_os = 100*5
    y_os = 100*-2
    # Exceptions overlapping labels
    if name == "Anti-Coagulant":
        y_os += 500
    elif name == "Osteoporosis":
        y_os -= 400

    ax.plot(x_pos, y_pos, marker='o', linestyle='', ms=12, label=name)
    # Text label next to the marker is used instead of a legend
    pl.text(x_pos+x_os, y_pos+y_os, name, fontsize=14)


pl.show()

In [None]:
# Bar graph: painkiller transaction counts per chronic illness, stacked by patient,
# restricted to the rows selected by `mask`.
# Uses dfp2 (the transaction-level frame that carries Drug_ID); `mask` must be
# built from the same frame so that the boolean index aligns.
# The unused N / ind / width locals and the repeated numpy import were dropped:
# nothing in this cell reads them.
groups = dfp2.loc[mask].groupby(['ChronicIllness','Patient_ID'])['Drug_ID'].count()

# unstack() moves Patient_ID into the columns, so every patient becomes one
# stacked segment (and one legend entry) of the illness bar.
groups.unstack().plot(kind='bar',stacked=True)

pl.show()


In [None]:
# Preview the Drug_ID column of the transaction-level frame. dfp2 is the frame
# that selects Drug_ID; the aggregated `dfp` frame only has COUNT(Drug_ID).
dfp2['Drug_ID'].head()

In [None]:
# Number of transactions per patient per illness, for the rows selected by `mask`.
# Uses dfp2 (the transaction-level frame that carries Drug_ID); `mask` must be
# built from the same frame so that the boolean index aligns.
arr = dfp2.loc[mask].groupby(['ChronicIllness','Patient_ID'])['Drug_ID'].count()
# Wide view: one row per illness, one column per patient
arr.unstack().head()

In [4]:
# Painkiller transaction count per (patient, chronic illness), aggregated in SQL.
# Restricted to:
#   - up to 1000 randomly chosen patients that have exactly one row in
#     implied_condition (HAVING count(Patient_ID) = 1)
#   - drugs with ATCLevel3Code 'N02A'
#   - Prescription_Week in [2016-01-01, 2017-01-01)
# The count column is not aliased, so in the DataFrame it is literally named
# 'COUNT(Drug_ID)'.
# NOTE(review): ORDER BY RANDOM() draws a different patient sample on every
# run, so results are not reproducible across runs.
# NOTE(review): NATURAL JOIN matches on every column name the two sides share;
# presumably that is only Patient_ID -- TODO confirm against the schema.
# The recorded %time output below shows this query took about 23 minutes.
sql ="""
    SELECT Patient_ID, ChronicIllness, COUNT(Drug_ID)
    FROM transactions t
    INNER JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    NATURAL JOIN implied_condition ic -- Only want patients with chronic illness
    WHERE Patient_ID IN (
        SELECT Patient_ID
        FROM implied_condition
        GROUP BY Patient_ID
        HAVING count(Patient_ID) = 1
        ORDER BY RANDOM()
        LIMIT 1000
    )
    AND ATCLevel3Code = 'N02A' -- Only want pain killers
    AND Prescription_Week >= '2016-01-01'
    AND Prescription_Week < '2017-01-01'
    GROUP BY Patient_ID, ChronicIllness
"""
%time dfp = pd.read_sql_query(sql, conn)

CPU times: user 15min 32s, sys: 7min 32s, total: 23min 5s
Wall time: 23min 18s


In [None]:
# Preview the first five rows of the per-patient painkiller counts
dfp.head(n=5)

In [None]:
# Average number of painkiller transactions per patient, for each chronic illness.
# dfp holds one row per (Patient_ID, ChronicIllness) with the transaction count
# in the 'COUNT(Drug_ID)' column, so group the whole frame by illness and
# average that column.
# Selecting dfp['Patient_ID'] first yields a Series with no 'ChronicIllness' key
# to group on, and a single-level result cannot be unstacked.
# The result is bound to `avg` because the plotting cells below read that name.
avg = dfp.groupby('ChronicIllness')['COUNT(Drug_ID)'].mean()
avg

In [None]:
# Bar chart of the per-illness values held in `avg`.
ax = avg.plot(kind='bar')

# Drop the frame lines that carry no axis
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

ax.set_title("Average Painkillers taken per chronic disease", fontsize=20)
ax.set_xlabel("", fontsize=16)
ax.set_ylabel("Avg painkillers per patient", fontsize=16)

pl.show()

In [None]:
# Bar chart of `avg` with the top/right frame lines hidden.
# NOTE(review): this cell draws the same figure as the plotting cell directly
# before it -- consider deleting one of the two.
ax = avg.plot.bar()

top_spine, right_spine = ax.spines["top"], ax.spines["right"]
top_spine.set_visible(False)
right_spine.set_visible(False)

label_size = 16
ax.set_title("Average Painkillers taken per chronic disease", fontsize=20)
ax.set_xlabel("", fontsize=label_size)
ax.set_ylabel("Avg painkillers per patient", fontsize=label_size)

pl.show()

In [None]:

# Bar chart of `avg` drawn directly with matplotlib, one bar per entry of `avg`.
# The bar count is derived from the data: with a fixed N = 5, ax.bar raises a
# shape-mismatch error whenever `avg` does not hold exactly five values.
N = len(avg)

ind = np.arange(N)  # the x locations for the groups
width = 0.35       # the width of the bars

# Plot
fig, ax = pl.subplots(figsize=(8,8))
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

pl.title("Painkillers", fontsize=20)
pl.xlabel("Chronic Illness", fontsize=16)
# NOTE(review): `avg` is plotted elsewhere in this notebook as "Avg painkillers
# per patient"; this y label may be stale -- confirm.
pl.ylabel("Number of Painkiller Transactions", fontsize=16)
ax.bar(ind, avg, width)
# Label every bar with its index value from `avg` so the x axis is readable
ax.set_xticks(ind)
ax.set_xticklabels(avg.index, rotation=90)

pl.show()