In [1]:
import csv
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd

In [2]:
# Open the NIS 2012 core CSV as a dask DataFrame. dtype=object is applied to every
# column, so no per-column numeric dtype is inferred (presumably to sidestep the
# mixed-type columns listed below -- confirm).
df = dd.read_csv('NIS_2012_CoreCSV.csv', dtype=object)

List of columns with mixed types
21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,62,63,64,65,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104

### List of HCUP column names:

In [3]:
# Distinct KEY_NIS values, computed from the dask column into an in-memory result.
test = df["KEY_NIS"].unique().compute()

# Empty placeholder set; nothing in this cell populates it.
unique = set()

In [10]:
# Peek at the first four visit identifiers. head(4) only pulls the leading rows,
# whereas calling .compute() on the whole KEY_NIS column loads every value just
# to print four of them.
for item in df.KEY_NIS.head(4):
    print(item)

10000011
10000148
10000174
10000218


In [5]:
# Count the distinct KEY_NIS values; `test` holds the computed KEY_NIS.unique() result.
print(len(set(test)))

7296968


In [None]:
# Number of distinct values found in the DX1 column.
len(df.DX1.unique())

In [None]:
# Count the rows whose DX1 value equals the string '90'.
len(df[df['DX1']=='90']) ##checks column for specified value

In [None]:
# Count the rows whose DX1 value equals the string '44389'.
# NOTE(review): the code's clinical meaning is not established in this notebook -- confirm.
len(df[df['DX1']=='44389']) ##checks column for peripheral vascular disease

### Number of unique values by column

In [None]:
# Report how many distinct values each procedure column PR1..PR15 holds.
for pr_number in range(1, 16):
    distinct_values = df['PR{}'.format(pr_number)].unique()
    print("PR{}: ".format(pr_number), len(distinct_values))

In [None]:
# Report how many distinct values each diagnosis column DX1..DX25 holds.
for dx_number in range(1, 26):
    distinct_values = df['DX{}'.format(dx_number)].unique()
    print("DX{}: ".format(dx_number), len(distinct_values))

### Procedure Matrix

The following parser reads the procedure codes and their descriptions from `Procedures.txt` and stores them as `(code, description)` tuples in the `PRs` list.

In [3]:
# Parse "Procedures.txt" into a list of (code, description) tuples.
# The fixed slices below assume each entry has the code wrapped by 4 leading and
# 2 trailing characters before "=", and the description wrapped by 2 leading and
# 2 trailing characters after it.
PRs = []
with open("Procedures.txt", "r") as f:
    for line in f:
        # Split once, at the first "=", so a description containing "=" is kept whole.
        left, sep, right = line.partition("=")
        if not sep:
            # Blank or malformed line with no "=": nothing to parse, skip it
            # instead of failing with IndexError.
            continue
        PRs.append((left[4:-2], right[2:-2]))
PRs[:10]

[('    ', '    : BLANK'),
 ('0001', '0001: THERAP ULTRASOUND OF HEAD AND NECK (Begin 2002)'),
 ('0002', '0002: THERAPEUTIC ULTRASOUND OF HEART (Begin 2002)'),
 ('0003', '0003: THERAP ULTRASOUND PERIPHRL VASC VESSELS (Begin 2002)'),
 ('0009', '0009: OTHER THERAPEUTIC ULTRASOUND (Begin 2002)'),
 ('0010', '0010: IMPLANTATION OF CHEMOTHERAPEUTIC AGENT (Begin 2002)'),
 ('0011', '0011: INFUSION DROTRECOGIN ALFA (ACTIVATED) (Begin 2002)'),
 ('0012', '0012: ADMINISTRATION OF INHALED NITRIC OXIDE (Begin 2002)'),
 ('0013', '0013: INJECTION OR INFUSION OF NESIRITIDE (Begin 2002)'),
 ('0014', '0014: INJECT/INFUS OF OXAZOLIDINONE ANTIBTCS (Begin 2002)')]

In [4]:
# Turn the (code, description) pairs into a dict keyed by code; if a code repeats,
# the last description wins.
PRdict = dict(PRs) # converts the key value pairs to a dictionary

In [5]:
# Remove the 'Blank' placeholder (a key of four spaces) since it is not an ICD code.
# The None default keeps this cell re-runnable: once the key is gone a second
# execution returns None instead of raising KeyError.
PRdict.pop('    ', None)

'    : BLANK'

In [6]:
PRdict

{'0001': '0001: THERAP ULTRASOUND OF HEAD AND NECK (Begin 2002)',
 '0002': '0002: THERAPEUTIC ULTRASOUND OF HEART (Begin 2002)',
 '0003': '0003: THERAP ULTRASOUND PERIPHRL VASC VESSELS (Begin 2002)',
 '0009': '0009: OTHER THERAPEUTIC ULTRASOUND (Begin 2002)',
 '0010': '0010: IMPLANTATION OF CHEMOTHERAPEUTIC AGENT (Begin 2002)',
 '0011': '0011: INFUSION DROTRECOGIN ALFA (ACTIVATED) (Begin 2002)',
 '0012': '0012: ADMINISTRATION OF INHALED NITRIC OXIDE (Begin 2002)',
 '0013': '0013: INJECTION OR INFUSION OF NESIRITIDE (Begin 2002)',
 '0014': '0014: INJECT/INFUS OF OXAZOLIDINONE ANTIBTCS (Begin 2002)',
 '0015': '0015: HIGH-DOSE INFUSION INTERLEUKIN-2 (IL-2) (Begin 2003)',
 '0016': '0016: PRESSURIZED TREAT GRAFT (Begin 2004)',
 '0017': '0017: INFUSION OF VASOPRESSOR (Begin 2004)',
 '0018': '0018: INFUS IMMUNOSUP ANTIBODY (Begin 2005)',
 '0019': '0019: BBBD VIA INFUSION (Begin 2007)',
 '0021': '0021: IVUS EXTRACRAN CEREB VES (Begin 2004)',
 '0022': '0022: IVUS INTRATHORACIC VES (Begin 2004)'

In [7]:
# Copy PRdict, swapping every space character in each key for "0".
new_PRdict = {
    code.replace(" ", "0"): description
    for code, description in PRdict.items()
}

In [8]:
new_PRdict

{'0001': '0001: THERAP ULTRASOUND OF HEAD AND NECK (Begin 2002)',
 '0002': '0002: THERAPEUTIC ULTRASOUND OF HEART (Begin 2002)',
 '0003': '0003: THERAP ULTRASOUND PERIPHRL VASC VESSELS (Begin 2002)',
 '0009': '0009: OTHER THERAPEUTIC ULTRASOUND (Begin 2002)',
 '0010': '0010: IMPLANTATION OF CHEMOTHERAPEUTIC AGENT (Begin 2002)',
 '0011': '0011: INFUSION DROTRECOGIN ALFA (ACTIVATED) (Begin 2002)',
 '0012': '0012: ADMINISTRATION OF INHALED NITRIC OXIDE (Begin 2002)',
 '0013': '0013: INJECTION OR INFUSION OF NESIRITIDE (Begin 2002)',
 '0014': '0014: INJECT/INFUS OF OXAZOLIDINONE ANTIBTCS (Begin 2002)',
 '0015': '0015: HIGH-DOSE INFUSION INTERLEUKIN-2 (IL-2) (Begin 2003)',
 '0016': '0016: PRESSURIZED TREAT GRAFT (Begin 2004)',
 '0017': '0017: INFUSION OF VASOPRESSOR (Begin 2004)',
 '0018': '0018: INFUS IMMUNOSUP ANTIBODY (Begin 2005)',
 '0019': '0019: BBBD VIA INFUSION (Begin 2007)',
 '0021': '0021: IVUS EXTRACRAN CEREB VES (Begin 2004)',
 '0022': '0022: IVUS INTRATHORACIC VES (Begin 2004)'

In [9]:
# Empty frame with one column per procedure code (no rows yet).
Procedure_mtx = pd.DataFrame(columns=list(new_PRdict))

In [10]:
Procedure_mtx.head()

Unnamed: 0,0001,0002,0003,0009,0010,0011,0012,0013,0014,0015,...,9988,9991,9992,9993,9994,9995,9996,9997,9998,9999


#### Procedure Matrix Populator


In [18]:
# Sorted list of the procedure codes (the keys of new_PRdict).
icd9_codes = sorted(new_PRdict)

In [None]:
len(icd9_codes)

In [14]:
# Column selection: the visit key followed by the procedure columns PR1..PR15.
prcols = ["KEY_NIS"] + ["PR{}".format(slot) for slot in range(1, 16)]

In [15]:
# Restrict the frame to the columns named in prcols.
dfpx = df[prcols] # create dataframe of only procedure columns and 'KEY_NIS', which is a unique VISIT identifier

In [16]:
dfpx.head()

Unnamed: 0,KEY_NIS,PR1,PR2,PR3,PR4,PR5,PR6,PR7,PR8,PR9,PR10,PR11,PR12,PR13,PR14,PR15
0,10000011,741.0,7534.0,,,,,,,,,,,,,
1,10000148,9547.0,,,,,,,,,,,,,,
2,10000174,,,,,,,,,,,,,,,
3,10000218,3722.0,8856.0,8853.0,,,,,,,,,,,,
4,10000229,,,,,,,,,,,,,,,


In [19]:
# Map each code to its position in icd9_codes (i.e. its matrix column).
code_to_index = {code: position for position, code in enumerate(icd9_codes)}

**Dynamic Matrix Creation with Numpy**

In [None]:
def _procedure_indicator_matrix(rows, codes, index_of, progress_every=100000):
    """Build a visits-by-codes 0/1 matrix from rows of procedure-code values.

    Parameters
    ----------
    rows : iterable of iterables
        One inner iterable per visit. Only ``str`` entries are treated as
        codes (left-padded with zeros to 4 characters); any other value is
        ignored.
    codes : list of str
        Known codes in column order. Codes not seen before are appended in
        place, in order of first appearance.
    index_of : dict
        Maps code -> column index. Updated in place alongside ``codes``.
    progress_every : int
        A progress line is printed each time this many rows are processed.

    Returns
    -------
    numpy.ndarray
        Float array of shape (number of rows, len(codes)); entry [i, j] is 1
        when row i contains codes[j] and 0 otherwise.
    """
    # Collect only the column indices hit by each row. The dense matrix is
    # allocated once at the end, instead of being re-copied by vstack/hstack
    # for every row and every newly discovered code.
    hits_per_row = []
    for rows_done, row in enumerate(rows, start=1):
        row_hits = []
        for value in row:
            if isinstance(value, str):
                code = value.zfill(4)
                if code not in index_of:
                    # Unknown code: give it the next free column.
                    index_of[code] = len(codes)
                    codes.append(code)
                row_hits.append(index_of[code])
        hits_per_row.append(row_hits)
        if rows_done % progress_every == 0:
            print("Step {} Complete!".format(rows_done))

    matrix = np.zeros((len(hits_per_row), len(codes)))
    for row_number, row_hits in enumerate(hits_per_row):
        matrix[row_number, row_hits] = 1
    return matrix


# Only feed the procedure columns to the builder. The CSV is read with
# dtype=object, so KEY_NIS values are strings as well; iterating over them would
# register every visit id as a new "procedure code" and add one column per visit.
pr_only_cols = [col for col in dfpx.columns if col != "KEY_NIS"]
# itertuples() puts the index at position 0; drop it so only column values remain.
pr_rows = (row[1:] for row in dfpx[pr_only_cols].itertuples())
total_mat = _procedure_indicator_matrix(pr_rows, icd9_codes, code_to_index)

np.save("total_mat", total_mat)

**Preconstructed Numpy Matrix Creation**

In [None]:
# Preallocate a boolean visits-by-code matrix and fill it from the CSV shards in
# "dfpr_rows". Without this allocation `testmtx` does not exist, and every write
# below fails with a NameError that used to be swallowed silently.
N_VISITS = 7296968     # rows to preallocate (the distinct KEY_NIS count printed earlier)
N_CODE_SLOTS = 10000   # columns are addressed directly by int(code), so 0..9999
# NOTE: at one byte per cell this is roughly 73 GB of address space.
testmtx = np.zeros((N_VISITS, N_CODE_SLOTS), dtype=bool)

total_codes = set()    # every distinct code string seen
codes_to_index = {}    # code string -> order in which it was first seen
row_num = 0            # number of data rows processed so far
skipped = 0            # codes that could not be written into testmtx

for f in sorted(os.listdir("dfpr_rows")):
    # `with` closes each shard as soon as it has been read.
    with open("dfpr_rows/{}".format(f), newline="") as handle:
        for row in csv.reader(handle):
            # Blank lines and header rows (empty first field) carry no visit.
            if not row or row[0] == '':
                continue
            for code in row[1:]:  # row[0] is the index written alongside the data
                if code == '':  # first empty field marks the end of this row's codes
                    break
                if code not in codes_to_index:
                    codes_to_index[code] = len(total_codes)
                    total_codes.add(code)
                try:
                    testmtx[row_num, int(code)] = True
                except (ValueError, IndexError):
                    # Non-integer code text, or a row/column outside the
                    # preallocated shape: count it instead of hiding it.
                    skipped += 1
            row_num += 1
            if row_num % 100000 == 0:
                print("HERE: ", row_num)

if skipped:
    print("Skipped {} code entries that could not be placed in testmtx".format(skipped))

In [None]:
total_mat.shape

In [None]:
total_mat[92]

In [None]:
# Entry for procedure code "9547" in the second row. The code -> column mapping in
# this notebook is `code_to_index`; `num_to_index` is never defined, so the
# original lookup raised NameError.
total_mat[1, code_to_index["9547"]]

In [None]:
np.sum(total_mat[2])

In [None]:
np_row = np.zeros((1, 4))

In [None]:
r = np.random.random((4, 4))

In [None]:
r

In [None]:
r[:, 1]

In [None]:
r[1, :]

In [None]:
np_row[0, 1] = 1

In [None]:
np_row

In [None]:

one = np.ones((1, 2))
two = np.zeros((1, 2))

In [None]:
one

In [None]:
stack = np.vstack((one, two))

In [None]:
stack

In [None]:
# np.hstack needs a sequence of arrays; calling it with no argument raises
# TypeError. Join the two demo arrays defined above side by side.
np.hstack((one, two))

In [None]:
# Names of the fifteen procedure columns, PR1..PR15.
prcols = ["PR{}".format(slot) for slot in range(1, 16)]

In [None]:
prcols

In [None]:
# Create df of only 15 procedure code columns for simplicity
dfpx = df[prcols]

In [None]:
dfpx.head()

In [None]:
# Show the first row only. DataFrames expose no .rows() method (the original
# call raised AttributeError); iterrows() yields (index, row) pairs.
for _, row in dfpx.iterrows():
    print(row)
    break

In [None]:
# Print element 1 of the first tuple produced by itertuples(), if there is one.
first_tuple = next(iter(dfpx.itertuples()), None)
if first_tuple is not None:
    print(first_tuple[1])

In [None]:
# Check each field of the first row against the PRdict keys and report matches.
for field in next(iter(dfpx.itertuples()), ()):
    if field in PRdict:
        print(field, "was found in PRdict.")

### Diagnosis Matrix

The following parser reads the diagnosis codes and their descriptions from `Diagnosis.txt` and stores them as `(code, description)` tuples in the `DXcol_heads` list.

In [None]:
# Parse "Diagnosis.txt" into a list of (code, description) tuples.
# The fixed slices below assume each entry has the code wrapped by 4 leading and
# 2 trailing characters before "=", and the description wrapped by 2 leading and
# 2 trailing characters after it.
DXcol_heads = []
with open("Diagnosis.txt", "r") as f:
    for line in f:
        # Split once, at the first "=", so a description containing "=" is kept whole.
        left, sep, right = line.partition("=")
        if not sep:
            # Blank or malformed line with no "=": skip it instead of failing
            # with IndexError.
            continue
        DXcol_heads.append((left[4:-2], right[2:-2]))
DXcol_heads[:10]

In [None]:
DXdict = dict(DXcol_heads)

In [None]:
DXdict

In [None]:
# Empty frame with one column per diagnosis code (no rows yet).
Diagnosis_mtx = pd.DataFrame(columns=list(DXdict))

In [None]:
Diagnosis_mtx.head()

### Attempt at dd.get_dummies

In [None]:
# Categorize only the diagnosis (DX1..DX25) and procedure (PR1..PR15) columns.
# Categorizing every column would also scan identifier columns such as KEY_NIS,
# which the dummy encoding below never uses.
code_cols = (
    ['DX{}'.format(i) for i in range(1, 26)]
    + ['PR{}'.format(i) for i in range(1, 16)]
)
ddf_known = df.categorize(columns=code_cols)

In [None]:
ddf_known.DX10

In [None]:
# One-hot encode the DX1 column of the categorized frame.
# NOTE(review): presumably ddf_known is used because dd.get_dummies needs known
# categories -- confirm against the dask documentation.
Diag_mtx = dd.get_dummies(ddf_known["DX1"]) # get_dummies only works for one column
# Multi-column attempt that raised an error, kept for reference:
#error: Diag_mtx = dd.concat([dd.get_dummies(ddf_known[col]) for col in ddf_known], axis=1, keys=ddf_known.columns)

In [None]:
Diag_mtx.head()

In [None]:
exdf = pd.DataFrame({'ID': ['Sam', 'Alex', 'Sam'], 'A': ['a','b','c'], 'B': ['b','a','b'], 'C': ['c','c','a']})


In [None]:
exdf

In [None]:
pd.get_dummies(exdf, columns = ['A','B','C'], sparse=True) #to illustrate that unique identifiers stay paired with observations

In [None]:
# List each column of exdf together with its position.
for position, column_name in enumerate(exdf.columns):
    print(position, column_name)