In [77]:
import os

import sqlite3
import pandas as pd
import numpy

In [79]:
# Path to the root directory of datasci
dthon_path = os.path.join(os.path.expanduser('~'), 'Projects/datasci')
dthon_db = os.path.join(dthon_path, "datasci.db")

In [80]:
conn = sqlite3.connect(dthon_db)

# Sample SQL query to find the average prices of the drug 3028 in different regions
# Will likely have to weed out pensioner prices?

sql = """select s.postcode,
            round(avg(t.PatientPrice_Amt),2) as avg_patientprice,
            round(avg(t.WholeSalePrice_Amt),2) as avg_wholesaleprice,
            count(s.postcode) as samplesize-- Sample set of transactions for each suburb
        from transactions t
        join stores s on t.Store_ID = s.Store_ID
        where t.Drug_ID = 3028
        --and s.postcode >= 3000
        --and s.postcode < 3500
        and t.PatientPrice_Amt > 0
        group by s.postcode
        order by avg_patientprice desc
        """

# using pandas because I like the head command
df = pd.read_sql_query(sql, conn)

In [81]:
# Might be interesting to see which stores in each suburb contributed to
# these extremes
df.head()

Unnamed: 0,postcode,avg_patientprice,avg_wholesaleprice,samplesize
0,3018,29.41,14.25,39
1,7253,26.78,7.37,4
2,3104,26.21,11.21,31
3,2536,24.14,9.08,13
4,2225,22.9,12.71,36


In [82]:
df.tail()

Unnamed: 0,postcode,avg_patientprice,avg_wholesaleprice,samplesize
526,812,4.75,1.12,1
527,2020,4.75,3.35,3
528,3143,3.0,2.7,2
529,4066,2.54,2.49,4
530,2144,1.95,4.66,1


In [86]:
# Now doing similar things with panda commands, simplifying SQL
# Same result == good

# In summary, acheived a little, but have had to resort to casting
# floats as floats and sql if I want to do anything like sum or mean below??
sql = """select s.postcode, t.PatientPrice_Amt, cast(t.WholeSalePrice_Amt as float) as WholeSalePrice_Amt
        from transactions t
        join stores s on t.Store_ID = s.Store_ID
        where t.Drug_ID = 3028
        --and s.postcode >= 3000
        --and s.postcode < 3500
        and t.PatientPrice_Amt > 0
        """

df = pd.read_sql_query(sql, conn)

In [87]:
aggregations = {
    'PatientPrice_Amt': {
        'Min Patient Price': 'min',
        'Max Patient Price': 'max'
    },
    'WholeSalePrice_Amt': {
        'Avg Patient Price': 'mean',
        'Max Patient Price': 'max'
    }
}

df.groupby('postcode').agg(aggregations)



Unnamed: 0_level_0,PatientPrice_Amt,PatientPrice_Amt,WholeSalePrice_Amt,WholeSalePrice_Amt
Unnamed: 0_level_1,Min Patient Price,Max Patient Price,Max Patient Price,Avg Patient Price
postcode,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0812,4.75,4.75,1.12,1.120000
0820,5.80,5.8,1.86,1.860000
2000,5.60,16,2.70,2.280000
2006,10.35,10.35,2.70,2.700000
2007,7.00,7,2.70,2.700000
2011,14.65,14.65,2.70,2.700000
2018,5.60,5.6,1.86,1.860000
2020,1.96,6.2,4.66,3.353333
2021,9.90,9.9,1.86,1.860000
2022,5.60,13.45,4.66,2.488852
