In [30]:
import sys
import os
import numpy as np
import pandas as pd
import csv

In [207]:
# File locations, given relative to the notebook's working directory.
_DATA_DIR = "../../data/"
data_table = _DATA_DIR + "IST_corrected.csv"        # input table handed to read_data
variable_labels = _DATA_DIR + "IST_variables.csv"   # presumably the variable descriptions; not used in the cells shown here
cleaned_table = _DATA_DIR + "IST_cleaned.csv"       # CSV produced by read_data


In [210]:
def contains_non_numeric_data(col):
    """
    Report whether a column holds any entry that is not a plain decimal number.

    Empty strings and a lone "." are treated as missing-value markers and are
    skipped. Every other entry must be an optionally signed run of digits with
    at most one decimal point (e.g. "12", "-3", "4.5", ".5", "5.").
    Malformed tokens such as "1.2.3" or ".." count as non-numeric.

    col: iterable of strings (one column of the table, header excluded).
    Returns True as soon as one entry fails the check, False otherwise
    (including for an empty column).
    """
    for entry in col:
        if entry == "" or entry == ".":
            # Missing value: says nothing about the column's type.
            continue
        # Drop one leading sign so negative numbers are recognised as numeric.
        unsigned = entry[1:] if entry[:1] in ("+", "-") else entry
        # Split on the first "." only; a second "." stays in `fraction` and
        # makes the isdigit() check below fail.
        whole, _, fraction = unsigned.partition(".")
        if not (whole + fraction).isdigit():
            return True
    return False

def fill_numeric_entries(col):
    """
    Convert a column of numeric strings to floats, imputing missing entries.

    Entries equal to "" or "." are treated as missing and are replaced by the
    arithmetic mean of the remaining entries.

    col: iterable of strings; every non-missing entry must be accepted by float().
    Returns a list of floats with one value per input entry. When every entry
    is missing there is nothing to average, so the result is all NaN.
    Raises ValueError if a non-missing entry cannot be parsed by float().
    """
    missing_markers = ("", ".")
    # Parse each entry once; None marks the slots that will receive the mean.
    parsed = [None if entry in missing_markers else float(entry) for entry in col]
    observed = [value for value in parsed if value is not None]
    # np.average of an empty list gives NaN together with a RuntimeWarning;
    # produce the NaN directly so an all-missing column stays quiet.
    average = np.average(observed) if observed else float("nan")
    return [average if value is None else value for value in parsed]
        

def read_data(path, output_path=None):
    """
    Read and process stroke data table

    Loads the CSV at `path` (row 0 is the header), drops the columns named in
    `column_mask_labels`, converts every remaining column to numbers and
    writes the result as CSV.

    Column conversion:
      * a column for which contains_non_numeric_data() is true has each
        distinct value (as ordered by np.unique) replaced by its index; the
        mapping is recorded as {header: (labels, codes)};
      * any other column is passed through fill_numeric_entries().

    path: location of the input CSV.
    output_path: where to write the cleaned CSV; defaults to the
        notebook-level `cleaned_table`.

    Side effects: prints the masked column indices, the label map and the
    first two rows of the converted array, then writes the CSV. The array is
    wrapped in a DataFrame without column names, so the original header row
    is written as the first data row beneath pandas' default integer header.
    Returns None.
    Raises KeyError if one of the masked column names is absent from the header.
    """
    if output_path is None:
        # Keep the historical behaviour: write to the notebook-level path.
        output_path = cleaned_table

    ### Open entire stroke data table
    # 'rU' (universal newlines) is needed on Python 2 but was removed in
    # Python 3.11; plain text mode already translates newlines on Python 3.
    if sys.version_info[0] < 3:
        handle = open(path, 'rU')
    else:
        handle = open(path, 'r')
    with handle as f:
        reader = csv.reader(f, delimiter=',')
        d = list(reader)
    darr = np.array(d)

    ### Get the stroke column labels
    column_labels = list(darr[0,:])
    column_label_dict = {label:i for i, label in enumerate(column_labels)}

    ### Delete the data columns that are excluded from the cleaned table
    column_mask_labels = ["RDATE", "HOURLOCAL", "MINLOCAL", "DAYLOCAL", "DMAJNCHX", "DSIDEX", "DNOSTRKX", "DDEADX", "FDEADX"]
    column_mask_idx = [column_label_dict[l] for l in column_mask_labels]
    # Formatted so the line reads the same under Python 2 and Python 3.
    print("col_mask %s" % (column_mask_idx,))
    darr = np.delete(darr, column_mask_idx, axis=1)

    ### Turn every entry in the data file into a numeric.
    ### If you have ["Y", "N"] as the options then convert to [0,1]
    ### Store a map between {SEX: (['F', 'M'] [0,1]), ...}
    # NOTE(review): darr keeps the fixed-width string dtype numpy picked when
    # loading, so a replacement value longer than the widest original cell
    # would be truncated on assignment -- confirm this cannot happen here.
    alpha_to_numeric_label_dict = {}
    num_col = darr.shape[1]
    for col_idx in range(num_col):
        col_data = darr[:,col_idx]
        col_header, col_entries = col_data[0], col_data[1:]
        if contains_non_numeric_data(col_entries):
            unique_labels = list(np.unique(col_entries))
            alpha_to_numeric_label_dict[col_header] = (unique_labels, list(range(len(unique_labels))))
            # Dict lookup gives the same codes as unique_labels.index(entry)
            # without rescanning the label list for every row.
            label_code = {label: code for code, label in enumerate(unique_labels)}
            col_entries = [label_code[entry] for entry in col_entries]
            darr[:, col_idx] = [col_header] + col_entries
        else:
            col_entries = fill_numeric_entries(col_entries)
            darr[:, col_idx] = [col_header] + list(map(float, col_entries))

    print(alpha_to_numeric_label_dict)
    print(darr[0,:])
    print(darr[1,:])

    dfout = pd.DataFrame(darr)
    dfout.to_csv(output_path)

In [211]:
# Clean the corrected table: read_data prints a short summary and writes the cleaned CSV.
read_data(data_table)

col_mask [21, 22, 23, 24, 45, 48, 53, 68, 73]
{'RDEF8': (['C', 'N', 'Y'], [0, 1, 2]), 'RXASP': (['N', 'Y'], [0, 1]), 'RVISINF': (['N', 'Y'], [0, 1]), 'DHH14': (['', 'N', 'Y'], [0, 1, 2]), 'DCAA': (['', 'N', 'U', 'Y'], [0, 1, 2, 3]), 'RCT': (['N', 'Y'], [0, 1]), 'RDEF1': (['C', 'N', 'Y'], [0, 1, 2]), 'RDEF2': (['C', 'N', 'Y'], [0, 1, 2]), 'RDEF3': (['C', 'N', 'Y'], [0, 1, 2]), 'RDEF4': (['C', 'N', 'Y'], [0, 1, 2]), 'DASP14': (['', 'N', 'U', 'Y', 'n', 'y'], [0, 1, 2, 3, 4, 5]), 'RDEF6': (['C', 'N', 'Y'], [0, 1, 2]), 'RDEF7': (['C', 'N', 'Y'], [0, 1, 2]), 'DCAREND': (['', 'N', 'U', 'Y'], [0, 1, 2, 3]), 'DDEAD': (['', 'N', 'U', 'Y'], [0, 1, 2, 3]), 'RATRIAL': (['', 'N', 'Y'], [0, 1, 2]), 'DRSH': (['', 'N', 'U', 'Y'], [0, 1, 2, 3]), 'FRECOVER': (['', 'N', 'U', 'Y'], [0, 1, 2, 3]), 'RXHEP': (['H', 'L', 'M', 'N'], [0, 1, 2, 3]), 'STYPE': (['LACS', 'OTH', 'PACS', 'POCS', 'TACS'], [0, 1, 2, 3, 4]), 'FAP': (['', 'N', 'U', 'Y', 'n'], [0, 1, 2, 3, 4]), 'DPE': (['', 'N', 'U', 'Y'], [0, 1, 2, 3]), '