In [19]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import feather
import pickle as pkl
import datetime

In [20]:
from hyperparameters import Hyperparameters
hp = Hyperparameters()
hp.data_dir


'C:/Users/bruno/OneDrive - The University of Auckland/Documents/Code/python/deep_learning/elective_test/'

# Generating test cohorts to play around with

In [21]:
np.random.seed(1)

# Varianz
rows = 10000
column_names = ["VSIMPLE_INDEX_MASTER","EVENT","end_fu_time","nhi_age",'gender_code',"en_nzdep_q","hx_vdr_diabetes",'en_prtsd_eth','hx_af','ph_bp_lowering_prior_6mths','ph_lipid_lowering_prior_6mths','ph_antithrombotic_prior_6mths']
zero_matrix = np.zeros(shape = ((rows),len(column_names)))
df = pd.DataFrame(zero_matrix, columns = column_names)

#generate random indexes
df["VSIMPLE_INDEX_MASTER"] = ["".join(np.random.choice(["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]) for _ in range(7)) for _ in range(rows)]
# df["VSIMPLE_INDEX_MASTER"]

#gedner code
df["gender_code"] = np.random.binomial(1,0.5,rows)

#generate random follow-up times
df["end_fu_time"] = np.random.randint(100,1835,rows)

#simulate events 
df["EVENT"] = np.random.binomial(1,0.1,rows)

#generate ethncicities
df["en_prtsd_eth"] = [np.random.choice([1,2,3,4,42,5,9]) for _ in range(rows)]

#deprivation
df["en_nzdep_q"] = [np.random.choice([1,2,3,4,5]) for _ in range(rows)]

#age
df["nhi_age"] = np.random.randint(30,74,rows)

#pivot
temp = df[["VSIMPLE_INDEX_MASTER","en_prtsd_eth"]].pivot_table(index = df["VSIMPLE_INDEX_MASTER"], columns = ["en_prtsd_eth"], aggfunc = len).fillna(0)
temp.columns = ["EN_PRTSD_ETH_{}".format(l2) for l1,l2 in temp.columns]
df = df.join(temp, on = "VSIMPLE_INDEX_MASTER")
df.drop("en_prtsd_eth",axis= 1, inplace = True)
#to return it melt is not the best way
# pd.melt(temp,id_vars = ["VSIMPLE_INDEX_MASTER"])
#instead it is pretty simple
test = temp.apply(lambda row: int( row.EN_PRTSD_ETH_1 + row.EN_PRTSD_ETH_2*2 + row.EN_PRTSD_ETH_3*3 + row.EN_PRTSD_ETH_4*4),axis = 1)
df["en_prtsd_eth"]  = test.values #need to extract values to get the np array!
df.drop(["EN_PRTSD_ETH_1","EN_PRTSD_ETH_2","EN_PRTSD_ETH_3","EN_PRTSD_ETH_4"], axis = 1,inplace = True)

# #simulate events for other variables
df['hx_vdr_diabetes'] = np.random.binomial(1,0.2,rows)
df['hx_af'] = np.random.binomial(1,0.1,rows)
df['ph_bp_lowering_prior_6mths'] = np.random.binomial(1,0.2,rows)
df['ph_lipid_lowering_prior_6mths'] = np.random.binomial(1,0.2,rows)
df['ph_anticoagulants_prior_6mths'] = np.random.binomial(1,0.2,rows)
df['ph_antithrombotic_prior_6mths'] = np.random.binomial(1,0.2,rows)

#reorder
df = df[["VSIMPLE_INDEX_MASTER","EVENT","end_fu_time","nhi_age",'gender_code',"en_nzdep_q","hx_vdr_diabetes",'en_prtsd_eth','hx_af','ph_bp_lowering_prior_6mths','ph_lipid_lowering_prior_6mths','ph_antithrombotic_prior_6mths']]
df

#occasionally will throw errors when non-unique index is generated

df.to_feather(hp.data_dir + "raw_data/VARIANZ_2014.feather")

df

Unnamed: 0,VSIMPLE_INDEX_MASTER,EVENT,end_fu_time,nhi_age,gender_code,en_nzdep_q,hx_vdr_diabetes,en_prtsd_eth,hx_af,ph_bp_lowering_prior_6mths,ph_lipid_lowering_prior_6mths,ph_antithrombotic_prior_6mths
0,flmijlf,0,1552,50,1,5,0,1,0,0,1,0
1,paqbmhn,0,270,42,1,1,0,0,0,1,0,0
2,gzsufsu,1,1609,47,1,3,0,0,0,0,0,1
3,lkosexx,0,1628,53,0,4,0,0,0,0,0,0
4,jrxawnj,1,1823,64,0,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,dhxsyqj,0,967,33,0,5,0,0,0,0,0,0
9996,thxszwr,0,1828,67,1,1,0,0,1,0,0,0
9997,xpvrbjm,0,221,30,1,4,0,0,0,0,0,0
9998,bbacrpw,0,1708,45,1,3,0,3,0,1,0,0


In [383]:
df

Unnamed: 0,VSIMPLE_INDEX_MASTER,EVENT,end_fu_time,nhi_age,gender_code,en_nzdep_q,hx_vdr_diabetes,en_prtsd_eth,hx_af,ph_bp_lowering_prior_6mths,ph_lipid_lowering_prior_6mths,ph_antithrombotic_prior_6mths
0,ivwcxur,0,966,42,1,4,0,4,1,0,0,0
1,htivxvb,0,546,36,0,2,0,1,0,0,0,0
2,tudlafm,0,1711,68,1,1,0,2,0,0,0,0
3,ttmtvut,0,929,49,0,3,0,3,1,0,0,0
4,salflgd,0,785,71,1,5,0,4,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ijhabvj,0,1293,30,1,4,0,1,0,0,0,0
9996,pahfokw,0,596,47,1,2,0,3,0,0,0,0
9997,ljlbawz,0,1719,32,1,1,0,1,0,0,1,0
9998,hyrnwio,0,565,57,1,5,0,1,0,1,0,0


### <font color = "red">**GROUP BY OPERATION MASTERCLASS**</font> :)

In [32]:
np.random.seed(1)

#EVENTS
#can use a size variable 
size = 100000

#number of codes
ICD_codes = ["".join(np.random.choice(["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"], size = (5))) for _ in range(100)] 

ICD_list = np.random.choice(ICD_codes, size = (size)) 
ICD_list

DIAG_TYP = np.random.randint(1,4,size)
Index_list = np.random.choice(df["VSIMPLE_INDEX_MASTER"], size = (size)) 

date_index = np.random.randint(1,60,size)

repeats  = np.random.randint(1,5,size)

he_df = pd.DataFrame({"VSIMPLE_INDEX_MASTER":Index_list,
                      "CLIN_CD_10":ICD_list,
                      "DIAG_TYP":DIAG_TYP,
                      "eventmonth_index":date_index,
                      "num_ind":repeats})


#duplicate based on num_ind
he_df = he_df.loc[he_df.index.repeat(he_df.num_ind)]
#print(he_df.sort_values(by = ["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP","eventmonth_index"]))

#fancy function
#he_df.drop(["num_ind"])

#interesting testint to see how it works
#he_df.groupby(["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP","eventmonth_index"]).agg(lambda x: x+1)

#trialling .transform instead
#he_df.groupby(["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP","eventmonth_index"]).cumcount() + 1
he_df["ind"] = he_df.groupby(["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP","eventmonth_index"]).transform(lambda x: [i for i in range(len(x))]) #I HAD IT RIGHT! just a small error
he_df["eventmonth_index"] = he_df["eventmonth_index"] + he_df["ind"]
he_df.drop(["num_ind","ind"],axis = 1, inplace = True)
he_df.reset_index(drop = True,inplace = True)
#remove duplicates
#he_df.drop_duplicates(subset = ["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP"])

#he_df.groupby(["VSIMPLE_INDEX_MASTER","CLIN_CD_10"]).agg(lambda x: x.min())

#he_df.sort_values(by = ["VSIMPLE_INDEX_MASTER","CLIN_CD_10"])

he_df.to_feather(hp.data_dir + "raw_data/EVENTS0913.feather")



In [34]:
np.random.seed(10)

#PH
#can use a size variable 
size = 100000

#number of codes
#chem_codes = ["".join(np.random.choice(["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"], size = (5))) for _ in range(100)] 
chem_codes = [np.random.choice(np.random.randint(100,150,10)) for _ in range(100)] 


chem_list = np.random.choice(chem_codes, size = (size)) 
chem_list

Index_list = np.random.choice(df["VSIMPLE_INDEX_MASTER"], size = (size)) 

date_index = np.random.randint(1,60,size)

repeats  = np.random.randint(1,5,size)

he_df = pd.DataFrame({"VSIMPLE_INDEX_MASTER":Index_list,
                      "chem_id":chem_list,
                      "dispmonth_index":date_index,
                      "num_ind":repeats})


#duplicate based on num_ind
he_df = he_df.loc[he_df.index.repeat(he_df.num_ind)]
#print(he_df.sort_values(by = ["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP","eventmonth_index"]))

#fancy function
#he_df.drop(["num_ind"])

#interesting testint to see how it works
#he_df.groupby(["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP","eventmonth_index"]).agg(lambda x: x+1)

#trialling .transform instead
#he_df.groupby(["VSIMPLE_INDEX_MASTER","CLIN_CD_10","DIAG_TYP","eventmonth_index"]).cumcount() + 1
he_df["ind"] = he_df.groupby(["VSIMPLE_INDEX_MASTER","chem_id","dispmonth_index"]).transform(lambda x: [i for i in range(len(x))]) #I HAD IT RIGHT! just a small error
he_df["dispmonth_index"] = he_df["dispmonth_index"] + he_df["ind"]
he_df.drop(["num_ind","ind"],axis = 1, inplace = True)
he_df.reset_index(drop= True, inplace = True)

#he_df.sort_values(by = ["VSIMPLE_INDEX_MASTER","CLIN_CD_10"])
he_df.to_feather(hp.data_dir + "raw_data/PHARMS0913.feather")

In [24]:
import random
from datetime import datetime, timedelta
#time delta supports multiplication with a float

def random_date(start_date, end_date):
    # Convert dates to datetime format
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")

    # Generate a random datetime between the start and end dates
    random_date = start_date + (end_date - start_date) * random.random()

    return random_date.date()  # Return the date part

# Usage
temp = random_date("2009-01-01", "2013-12-31")
print(temp)

print(temp.strftime("%Y-%m-%d"))

def month_ind(x):
    date_y = datetime.strptime("2009-01-01","%Y-%m-%d")
    date_x = datetime.strptime(x,"%Y-%m-%d")
    total_months = (date_x.year - date_y.year) * 12 + date_x.month - date_y.month
    return(total_months)

print(month_ind(temp.strftime("%Y-%m-%d")))


2011-06-15
2011-06-15
29


In [36]:
np.random.seed(1)

#testsafe
num = 100000
egfr = np.random.normal(loc = 90, scale = 20.0,size = num)
hba1c = np.random.normal(loc = 40, scale = 10.0,size = num)
tchdl = np.random.normal(loc = 1, scale = 0.5,size = num)

# result = map([].extend,[egfr,hba1c,tchdl])
result = np.concatenate([egfr,hba1c,tchdl])
tests = np.repeat(["egfr","hba1c","tchdl"], (num))

RESULT_DATE = [random_date("2009-01-01", "2013-12-31") for _ in range(len(tests))]

VSIMPLE_INDEX_MASTER = [np.random.choice(df.VSIMPLE_INDEX_MASTER) for _ in range(len(tests))]

print(len(result),len(tests),len(RESULT_DATE),len(VSIMPLE_INDEX_MASTER))
ts_df = pd.DataFrame({
    "VSIMPLE_INDEX_MASTER":VSIMPLE_INDEX_MASTER,
    "test":tests,
    "RESULT_DATE":RESULT_DATE,
    "result":result
})

ts_df.to_feather(hp.data_dir + "raw_data/TESTSAFE0913.feather")


300000 300000 300000 300000


In [48]:
[month_ind(i) for i in ts_df["RESULT_DATE"].astype(str)]




[17,
 4,
 49,
 32,
 22,
 0,
 32,
 50,
 53,
 20,
 51,
 37,
 13,
 21,
 49,
 27,
 18,
 10,
 37,
 32,
 21,
 37,
 21,
 21,
 16,
 4,
 50,
 14,
 28,
 54,
 10,
 44,
 51,
 40,
 30,
 14,
 56,
 1,
 5,
 40,
 34,
 29,
 24,
 50,
 10,
 25,
 12,
 40,
 55,
 24,
 49,
 57,
 0,
 20,
 59,
 34,
 47,
 17,
 9,
 6,
 59,
 47,
 56,
 47,
 20,
 57,
 17,
 7,
 15,
 6,
 57,
 48,
 43,
 0,
 30,
 25,
 7,
 4,
 6,
 12,
 44,
 25,
 56,
 28,
 36,
 16,
 43,
 58,
 11,
 33,
 56,
 20,
 58,
 13,
 8,
 37,
 33,
 33,
 56,
 19,
 34,
 9,
 45,
 55,
 0,
 25,
 9,
 31,
 7,
 22,
 24,
 21,
 14,
 49,
 58,
 48,
 30,
 39,
 45,
 26,
 4,
 15,
 17,
 51,
 43,
 16,
 3,
 19,
 34,
 51,
 22,
 50,
 48,
 47,
 49,
 49,
 9,
 52,
 50,
 40,
 27,
 23,
 5,
 6,
 6,
 11,
 26,
 51,
 7,
 41,
 21,
 24,
 57,
 59,
 22,
 35,
 5,
 35,
 33,
 0,
 6,
 28,
 24,
 41,
 32,
 28,
 24,
 0,
 6,
 49,
 18,
 55,
 56,
 46,
 13,
 30,
 16,
 54,
 12,
 33,
 34,
 27,
 12,
 31,
 7,
 58,
 13,
 5,
 11,
 17,
 40,
 36,
 56,
 13,
 43,
 3,
 47,
 5,
 41,
 38,
 40,
 38,
 43,
 35,
 34,
 34,
 52,


# <span style="color:red"> **Exploring the data after pre-processing** </span>

In [33]:
#testing format
df = pd.read_feather(hp.data_pp_dir + "Py_VARIANZ_2012_v3-1_pp_females.feather")
he_test = pd.read_feather(hp.data_dir + "pp_data/HE_pp_females.feather")
ph_test = pd.read_feather(hp.data_dir + "pp_data/PH_pp_females.feather")


print(he_test)
print(ph_test)

     VSIMPLE_INDEX_MASTER CLIN_CD_10  DIAG_TYPE  dispmonth_index
0                 abaljmn      fgphf          1               55
1                 abdupxx      bgtoi          2               48
2                 abefgni      vqfjn          1                5
3                 abefgni      vqfjn          1                6
4                 abetzbo      tpkzi          2               18
...                   ...        ...        ...              ...
9217              zzrtfvp      fgphf          1               41
9218              zzrwtoz      ambjy          3               42
9219              zzrwtoz      ambjy          3               43
9220              zzrwtoz      ambjy          3               44
9221              zzrwtoz      ambjy          3               45

[9222 rows x 4 columns]
       VSIMPLE_INDEX_MASTER  chem_id  dispmonth_index  eventmonth_index
0                   aaektmy      147                3                 3
1                   aaektmy      147               

In [None]:
print('Loading medications...')
ph = feather.read_dataframe(hp.data_pp_dir + 'PH_pp_' + gender + '.feather')
ph['TYPE'] = 0
ph.rename(columns={'chem_id': 'CODE', 'dispmonth_index': 'MONTH'}, inplace=True)
print('Loading hospital events...')
he = feather.read_dataframe(hp.data_pp_dir + 'HE_pp_' + gender + '.feather')
he.rename(columns={'CLIN_CD_10': 'CODE', 'dispmonth_index': 'MONTH'}, inplace=True)
he['TYPE'] = 1

In [62]:
from os import listdir
tmp = listdir("../elective_test/pp_data")

[i for i in tmp if ".npz" in i]


5 in [4,5,6]

True