In [1]:
import os

import numpy as np
import calendar
import datetime
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.font_manager import FontProperties


ModuleNotFoundError: No module named 'matplotlib'

In [2]:
# Root of the datasci checkout, resolved under the current user's home directory.
dthon_path = os.path.join(os.path.expanduser('~'), 'Projects', 'datasci')

# SQLite file with the datathon data, located inside that checkout.
db_parts = ("melbourne-datathon", "data", "medi.db")
dthon_db = os.path.join(dthon_path, *db_parts)

# Run this cell first: every query cell below reads through `conn`.
conn = sqlite3.connect(dthon_db)

In [3]:
# Patient whose dispensing history is loaded for the timeline plot.
PATIENT_ID = 71043

# One row per transaction of the patient, restricted to tramadol, moclobemide
# and drugs listed for 'Depression' in ChronicIllness_LookUp.
#   timestamp : Dispense_Week converted to Unix epoch seconds
#   Drug_ID   : recoded drug class -> 1 tramadol, 2 moclobemide,
#               3 other depression drug (4 is the CASE fallback, which the
#               WHERE clause should make unreachable)
#   category  : constant '1234', used downstream as the colour key
# The patient id is bound as a query parameter instead of being spliced into
# the SQL text.
sql = """
SELECT CAST(strftime('%s', Dispense_Week) AS INT) as timestamp,
CASE WHEN (Drug_ID IN(
            SELECT MasterProductID
            FROM Drug_LookUp
            WHERE GenericIngredientName LIKE '%TRAMADOL%')) THEN 1
    WHEN (Drug_ID IN(
            SELECT MasterProductID
            FROM Drug_LookUp
            WHERE GenericIngredientName LIKE '%MOCLOBEMIDE%')) THEN 2
    WHEN (Drug_ID IN(
            SELECT c.MasterProductID AS Drug_ID
            FROM ChronicIllness_LookUp c
            WHERE c.ChronicIllness = 'Depression')) THEN 3
    ELSE 4 END AS Drug_ID,
'1234' as category
    FROM transactions
    WHERE Patient_ID = ?
    AND (
        Drug_ID IN(
            SELECT d.MasterProductID
            FROM Drug_LookUp d
            WHERE d.GenericIngredientName LIKE '%TRAMADOL%'
            OR d.GenericIngredientName LIKE '%MOCLOBEMIDE%'
        )
    OR Drug_ID IN (
        SELECT c.MasterProductID AS Drug_ID
        FROM ChronicIllness_LookUp c
        WHERE c.ChronicIllness = 'Depression'
        )
    )
;"""
df = pd.read_sql_query(sql, conn, params=(PATIENT_ID,))
df.head()

Unnamed: 0,timestamp,Drug_ID,category
0,1300579200,2,1234
1,1315699200,1,1234
2,1316304000,2,1234
3,1321142400,2,1234
4,1322956800,1,1234


In [None]:
from operator import itemgetter
from collections import defaultdict


def plot_timeline(dataset, **kwargs):
    """
    Plot a timeline of events from different sources to visualize the
    relative sequence or density of events.

    ``dataset`` is an iterable of ``(timestamp, source, category)`` tuples,
    where ``timestamp`` is a Unix epoch in seconds. The input does not need
    to be sorted; it is sorted by timestamp here.

    Keyword arguments (all optional):
        savefig -- path; when given the figure is also written there as SVG
        colors  -- mapping of category -> matplotlib colour; categories
                   missing from the mapping are drawn in black ('k').
                   The mapping is only read, never modified.
        title   -- figure title (default "Timeline Plot")

    Returns the ``matplotlib.pyplot`` module (also when ``savefig`` is used)
    so the caller can call ``show()`` on the result.
    """
    outpath = kwargs.pop('savefig', None)  # Save the figure as an SVG
    palette = kwargs.pop('colors', {})     # category -> colour
    title = kwargs.get('title', "Timeline Plot")

    # Bring the data into memory and sort by timestamp
    events = sorted(dataset, key=itemgetter(0))

    # One horizontal row per distinct source, in sorted order
    series = sorted({source for _, source, _ in events})
    row_of = {source: row for row, source in enumerate(series)}

    # The epochs are UTC-based, so convert them in UTC rather than in the
    # machine's local zone; naive UTC datetimes are what matplotlib plots.
    utc = datetime.timezone.utc

    def as_utc(seconds):
        return datetime.datetime.fromtimestamp(seconds, tz=utc).replace(tzinfo=None)

    x = [as_utc(timestamp) for timestamp, _, _ in events]
    y = [row_of[source] for _, source, _ in events]
    c = [palette.get(category, 'k') for _, _, category in events]

    # A single figure/axes pair: everything below is drawn on `axes`.
    fig, axes = plt.subplots(1, 1, figsize=(14, 4))
    axes.set_title(title)
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    axes.set_yticks(range(len(series)))
    axes.set_yticklabels(series)
    axes.set_ylim(-1, len(series))

    if events:
        axes.scatter(x, y, color=c, alpha=0.85, s=10)
        # Same window as before: from epoch 1.3e9 (2011-03-13 UTC) to
        # 1000 seconds past the last event.
        axes.set_xlim(as_utc(1.3 * pow(10, 9)), as_utc(events[-1][0] + 1000))

    if outpath:
        fig.savefig(outpath, format='svg', dpi=1200)

    return plt

if __name__ == '__main__':
    colors = {'red': 'r', 'blue': 'b', 'green': 'g'}
    # Plain tuples of the first three columns (timestamp, source, category);
    # itertuples avoids positional lookups on label-indexed row Series.
    events = [
        (float(row[0]), row[1], row[2])
        for row in df.itertuples(index=False, name=None)
    ]
    # Day label (YYYYMMDD, UTC) of every event, kept as the module-level name
    # `dates` for code that looks it up globally.
    dates = [
        datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc).strftime('%Y%m%d')
        for ts, _, _ in events
    ]
    # The return value is deliberately not assigned to `plt`, so the pyplot
    # alias is never rebound.
    plot_timeline(events, colors=colors)
plt.show()

In [4]:
from operator import itemgetter
from collections import defaultdict


def plot_timeline(dataset, **kwargs):
    """
    Plot a timeline of events from different sources to visualize the
    relative sequence or density of events.

    ``dataset`` is an iterable of ``(timestamp, source, category)`` tuples,
    where ``timestamp`` is a Unix epoch in seconds. The input does not need
    to be sorted; it is sorted by timestamp here.

    Keyword arguments (all optional):
        savefig -- path; when given the figure is also written there as SVG
        colors  -- mapping of category -> matplotlib colour; categories
                   missing from the mapping are drawn in black ('k').
                   The mapping is only read, never modified.
        title   -- axes title; no title is drawn when omitted

    Returns the ``matplotlib.pyplot`` module (also when ``savefig`` is used)
    so the caller can call ``show()`` on the result.
    """
    outpath = kwargs.pop('savefig', None)  # Save the figure as an SVG
    palette = kwargs.pop('colors', {})     # category -> colour

    # Bring the data into memory and sort by timestamp
    events = sorted(dataset, key=itemgetter(0))

    # One horizontal row per distinct source, in sorted order
    series = sorted({source for _, source, _ in events})
    row_of = {source: row for row, source in enumerate(series)}

    # Fixed x ticks: 1 January and 1 June of every year from 2010 to 2017
    dates = [
        datetime.datetime(year, month, 1)
        for year in range(2010, 2018)
        for month in (1, 6)
    ]

    # The epochs are UTC-based, so convert them in UTC rather than in the
    # machine's local zone (which can move an event to the previous day).
    utc = datetime.timezone.utc
    x = [
        datetime.datetime.fromtimestamp(timestamp, tz=utc).replace(tzinfo=None)
        for timestamp, _, _ in events
    ]
    y = [row_of[source] for _, source, _ in events]
    c = [palette.get(category, 'k') for _, _, category in events]

    fig, axes = plt.subplots(1, 1, figsize=(14, 4))
    if events:
        axes.scatter(x, y, color=c, alpha=0.85, s=10)
    axes.grid()
    axes.set_xlabel('YEAR')
    axes.set_ylabel('VALUES')
    if 'title' in kwargs:
        axes.set_title(kwargs['title'])
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    axes.set_xticks(dates)
    # Positions and labels are set separately: passing the labels as the
    # second positional argument of set_yticks only works on newer matplotlib.
    axes.set_yticks(range(len(series)))
    axes.set_yticklabels(series)
    plt.xticks(rotation=25)

    if outpath:
        fig.savefig(outpath, format='svg', dpi=1200)

    return plt

if __name__ == '__main__':
    # '1234' is the constant category emitted by the timeline query.
    colors = {'red': 'r', '1234': 'b', 'green': 'g'}
    # Plain tuples of the first three columns (timestamp, source, category);
    # itertuples avoids positional lookups on label-indexed row Series.
    events = [
        (float(row[0]), row[1], row[2])
        for row in df.itertuples(index=False, name=None)
    ]
    # The return value is deliberately not assigned to `plt`, so the pyplot
    # alias is never rebound.
    plot_timeline(events, colors=colors)
plt.show()

NameError: name 'plt' is not defined

In [None]:
# What type of pain killers do chronic sufferers get?
# Counts transactions per (drug, chronic illness) pair for drugs whose
# ATCLevel3Code is 'N02A'. Rows are ordered by ascending count, so df.tail()
# shows the most frequent combinations.
# NOTE(review): a patient with several implied_condition rows presumably has
# each transaction counted once per illness because of the join - confirm.
# NOTE: the result is bound to `df`, replacing the frame loaded earlier under
# the same name.
sql = """
    SELECT t.Drug_ID, MasterProductFullName, ChronicIllness, count(*) AS 'transactions'
    FROM transactions t
    JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    JOIN implied_condition ic ON ic.Patient_ID = t.Patient_ID
    WHERE t.Patient_ID IN ( -- Only want patients with a chronic illness
        SELECT Patient_ID
        FROM implied_condition
        GROUP BY Patient_ID
    )
    AND Drug_ID IN (
        SELECT MasterProductID
        FROM Drug_LookUp
        WHERE ATCLevel3Code = 'N02A'
    )
    GROUP BY t.Drug_ID, ChronicIllness
    ORDER BY transactions
--    GROUP BY t.Patient_ID, t.Drug_ID, ChronicIllness
"""
df = pd.read_sql_query(sql, conn)
df.tail()

In [None]:
# Volume of transactions of pain killers for each illness.
# Result columns:
#   ChronicIllness - the illness from implied_condition
#   transactions   - number of joined N02A transaction rows for that illness
#   patients       - number of implied_condition rows with that illness
#                    (correlated sub-query)
# Rows are ordered by ascending transaction count, so df.tail() shows the
# illnesses with the most pain-killer transactions.
# NOTE: the result is bound to `df`, replacing any frame previously held
# under that name.
sql = """
    SELECT ic.ChronicIllness, count(*) AS 'transactions',
    (
        SELECT count(*) FROM implied_condition WHERE ChronicIllness = ic.ChronicIllness
    ) AS patients
    FROM transactions t
    JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    JOIN implied_condition ic ON ic.Patient_ID = t.Patient_ID
    WHERE t.Patient_ID IN ( -- Only want patients with a chronic illness
        SELECT Patient_ID
        FROM implied_condition
        GROUP BY Patient_ID
    )
    AND Drug_ID IN (
        SELECT MasterProductID
        FROM Drug_LookUp
        WHERE ATCLevel3Code = 'N02A'
    )
    GROUP BY ChronicIllness
    ORDER BY transactions
--    GROUP BY t.Patient_ID, t.Drug_ID, ChronicIllness
"""
df = pd.read_sql_query(sql, conn)
df.tail()

In [None]:
# Scatter of pain-killer transactions against patient count, one marker per
# chronic illness. Reads the `patients` and `transactions` columns of `df`.
groups = df.groupby('ChronicIllness')

# Plot (through `plt`, the pyplot alias this notebook imports; the previous
# `pl` name was never defined).
fig, ax = plt.subplots(figsize=(8, 8))
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

ax.set_xlabel("Number of Patients", fontsize=16)
ax.set_ylabel("Number of Transactions", fontsize=16)
for name, group in groups:
    ax.plot(group.patients, group.transactions, marker='o', linestyle='', ms=12, label=name)

ax.legend()

plt.show()

In [None]:
# Now the same in pandas
sql ="""
    SELECT Patient_ID, ChronicIllness, Drug_ID, MasterProductFullName, Prescription_Week
    FROM transactions t
    INNER JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    NATURAL JOIN implied_condition ic -- Only want patients with chronic illness
    WHERE ATCLevel3Code = 'N02A' -- Only want pain killers
"""
%time dfp2 = pd.read_sql_query(sql, conn)

In [None]:
# Mean number of non-null Drug_ID rows per (ChronicIllness, Patient_ID) pair.
# Note that we cannot distinguish which painkillers apply to which illness.
(
    dfp2
    .groupby(['ChronicIllness', 'Patient_ID'])['Drug_ID']
    .count()
    .mean()
)

In [None]:
# Rows prescribed during calendar year 2015, as the half-open interval
# [2015-01-01, 2016-01-01). The lower bound is inclusive so a week dated
# exactly 2015-01-01 is kept, matching the >= / < filter of the 2016 query.
# NOTE(review): this compares strings - assumes Prescription_Week is stored
# as ISO 'YYYY-MM-DD...' text; confirm.
mask = (dfp['Prescription_Week'] >= '2015-01-01') & (dfp['Prescription_Week'] < '2016-01-01')

In [None]:
# One labelled marker per chronic illness for the masked rows:
# x = number of distinct patients, y = number of pain-killer transactions.
groups = dfp.loc[mask].groupby('ChronicIllness')

# Plot (through `plt`, the pyplot alias this notebook imports; the previous
# `pl` name was never defined).
fig, ax = plt.subplots(figsize=(8, 8))
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

ax.set_title("Painkillers", fontsize=20)
ax.set_xlabel("Number of Patients", fontsize=16)
ax.set_ylabel("Number of Painkiller Transactions", fontsize=16)
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    x_pos = group.Patient_ID.unique().size
    y_pos = group.Drug_ID.count()

    # Default label offset from the marker, in data units
    x_os = 100*5
    y_os = 100*-2
    # Exceptions: nudge labels that would otherwise overlap
    if name == "Anti-Coagulant":
        y_os += 500
    elif name == "Osteoporosis":
        y_os -= 400

    ax.plot(x_pos, y_pos, marker='o', linestyle='', ms=12, label=name)
    ax.text(x_pos+x_os, y_pos+y_os, name, fontsize=14)


plt.show()

In [None]:
import numpy as np

# Bar graph: pain-killer transaction count per (illness, patient) for the
# masked rows. unstack() turns Patient_ID into columns, so each bar is one
# illness and each stacked segment one patient.
# The unused N / ind / width helpers were dropped from this cell.
groups = dfp.loc[mask].groupby(['ChronicIllness','Patient_ID'])['Drug_ID'].count()

groups.unstack().plot(kind='bar',stacked=True)

# `plt` is the pyplot alias this notebook imports; `pl` was never defined.
plt.show()


In [None]:
# Peek at the first Drug_ID values of the pain-killer frame.
dfp['Drug_ID'].head()

In [None]:
# Number of transactions per patient per illness, for the masked rows.
masked_rows = dfp.loc[mask]
arr = masked_rows.groupby(['ChronicIllness', 'Patient_ID'])['Drug_ID'].count()
# Wide view: one row per illness, one column per patient.
arr.unstack().head()

In [None]:
# Per-patient pain-killer (ATC N02A) transaction counts for 2016, aggregated
# in SQL this time.
# Only patients with exactly one implied_condition row are considered, and at
# most 1000 of them are drawn with ORDER BY RANDOM(), so the sample - and
# every figure derived from it - changes on each run.
# The count column has no alias; it is returned under the name
# 'COUNT(Drug_ID)'.
# NOTE: this rebinds `dfp`, replacing any frame previously held under that
# name.
sql ="""
    SELECT Patient_ID, ChronicIllness, COUNT(Drug_ID)
    FROM transactions t
    INNER JOIN Drug_LookUp d on d.MasterProductID = t.Drug_ID
    NATURAL JOIN implied_condition ic -- Only want patients with chronic illness
    WHERE Patient_ID IN (
        SELECT Patient_ID
        FROM implied_condition
        GROUP BY Patient_ID
        HAVING count(Patient_ID) = 1
        ORDER BY RANDOM()
        LIMIT 1000
    )
    AND ATCLevel3Code = 'N02A' -- Only want pain killers
    AND Prescription_Week >= '2016-01-01'
    AND Prescription_Week < '2017-01-01'
    GROUP BY Patient_ID, ChronicIllness
"""
%time dfp = pd.read_sql_query(sql, conn)

In [None]:
# Preview the first rows of the sampled per-patient counts.
dfp.head()

In [None]:
# Average number of pain-killer transactions per patient, for each chronic
# illness. `dfp` has one row per (Patient_ID, ChronicIllness) with the
# patient's count in 'COUNT(Drug_ID)', so the per-illness mean of that column
# is the average per patient.
# The frame itself is grouped (a single column cannot be grouped by another
# column's name), and the result is bound to `avg` for the plotting cells
# below.
avg = dfp.groupby('ChronicIllness')['COUNT(Drug_ID)'].mean()
avg

In [None]:
# Bar chart of the per-illness averages held in `avg`.
ax = avg.plot(kind='bar')

ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Labels go on the axes returned by pandas; `plt` is the pyplot alias this
# notebook imports (`pl` was never defined).
ax.set_title("Average Painkillers taken per chronic disease", fontsize=20)
ax.set_xlabel("", fontsize=16)
ax.set_ylabel("Avg painkillers per patient", fontsize=16)


plt.show()

In [None]:
# Bar chart of the per-illness averages held in `avg`.
# NOTE(review): this cell repeats the one directly above - one of the two can
# probably be deleted.
ax = avg.plot(kind='bar')

ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Labels go on the axes returned by pandas; `plt` is the pyplot alias this
# notebook imports (`pl` was never defined).
ax.set_title("Average Painkillers taken per chronic disease", fontsize=20)
ax.set_xlabel("", fontsize=16)
ax.set_ylabel("Avg painkillers per patient", fontsize=16)


plt.show()

In [None]:

# Hand-built bar chart of `avg` (expected to be a pandas Series indexed by
# chronic illness, as the avg.plot cells above also assume).
N = len(avg)        # one bar per illness instead of a hard-coded 5

ind = np.arange(N)  # the x locations for the groups
width = 0.35        # the width of the bars

# Plot (through `plt`, the pyplot alias this notebook imports; the previous
# `pl` name was never defined).
fig, ax = plt.subplots(figsize=(8, 8))
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

ax.set_title("Painkillers", fontsize=20)
ax.set_xlabel("Chronic Illness", fontsize=16)
ax.set_ylabel("Number of Painkiller Transactions", fontsize=16)
ax.bar(ind, avg, width)

# Name each bar after its illness so the chart can be read on its own.
ax.set_xticks(ind)
ax.set_xticklabels(avg.index, rotation=45, ha='right')

plt.show()