In [1]:
import csv
import json
import os
import random

If you're using a JSON file, you can point to it in the code block below. Otherwise, use the second code block to fill out a dict that describes the dataset you're generating. Your data should look something like this:
```
data_descriptor_json = {
  "defaultLabelColumn": "recommendation", #the default label to be predicted
  "fields": [
    {
      "id": "age", #whatever the field contains
      "type": "categorical", #or continuous, or binary
      "values": {
        #if categorical or binary
        "categories": [],
        #if continuous:
        "min": 0,
        "max": 100
      }
    }
   ]
}
```

In [None]:
data_json_file = open('your_file.json')
data_descriptor_json = json.read(data_json_file)

In [81]:
data_descriptor_json = {
  "defaultLabelColumn": "decision",
  "fields": [
    {
      "id": "age",
      "type": "categorical",
      "values": {
        "categories": ["Under 18", "18-34", "35-60", "Over 60"]
      }
    },
    {
      "id": "gender",
      "type": "categorical",
      "values": {
        "categories": ["male", "female"]
      }
    },
    {
      "id": "race",
      "type": "categorical",
      "values": {
        "categories": ["White", "Black or African American", "Asian", "American Indian or Alaska Native", "Native Hawaiian or Pacific Islander"]
      }
    },
    {
      "id": "bleeding",
      "type": "categorical",
      "values": {
        "categories": ["yes", "no"]
      }
    },
    {
      "id": "shortness_of_breath",
      "type": "categorical",
      "values": {
        "categories": ["yes", "no"]
      }
    },
    {
      "id": "weight",
      "type": "categorical",
      "values": {
        "categories": ["under 100 lbs", "100-150 lbs", "150-200 lbs", "200-300 lbs", "Over 300 lbs"]
      }
    },
    {
      "id": "area_of_pain",
      "type": "categorical",
      "values": {
        "categories": ["head", "chest", "limb", "stomach"]
      }
    },
    {
      "id": "decision",
      "type": "categorical",
      "values": {
        "categories": ["priority", "normal", "return later"]
      }
    }
  ]
}

fields = data_descriptor_json['fields']
label = data_descriptor_json['defaultLabelColumn']




Here we set up our constants. Basically, we want to set up a relationship between potential categories and a label outcome.
``` 
{ 
 "defaultLabelColumn": "is_sick",
 "fields": [
     {
         "id": "nose_status"
         "type: "categorical"
         "values": {
         "categories": [
         "runny", "bleeding", "dry", "exploded"
         ]
     },
     {
         "id": "is_sick"
         "type: "categorical"
         "values": {
         "categories": [
         "no", "maybe", "definitely", "extremely"
         ]
     },
 }
```
 And, as a rule, if a patient's nose is dry they're always fine but otherwise the worse symptoms they have the more likely they are to be sick, we can structure our correlation as follows, using values below 0.5 for negative correlation and above 0.5 for positive:
 
```
correlation_values = {
    #here we use the categories of outcome as keys:
        "no": {
            "freq": #how often this answer should occur
            "correlated_fields": [{"name": "nose_status", "correlated_category_values": {"dry": 1, "exploded": 0}] # so if nose_status = dry, is_sick = "no, while if their nose has exploded they're definitely sick!
        }
     }
```

This block prints out a quick template for converting our values in order to set up our correlation relationships

In [82]:
label_values = []
for i in fields:
    if i['id'] == label:
        label_values = i['values']['categories']
        print("template_values = {")
        for k in i['values']['categories']:
            print('"%s": {"freq": 0,"correlated_fields": [{"name": "field_name", "correlated_category_values": {"feature": 0}}]},' % k)
        print("}}")
    else:
        print('field: ', i['id'])
        print('categories: ', i['values']['categories'])


field:  age
categories:  ['Under 18', '18-34', '35-60', 'Over 60']
field:  gender
categories:  ['male', 'female']
field:  race
categories:  ['White', 'Black or African American', 'Asian', 'American Indian or Alaska Native', 'Native Hawaiian or Pacific Islander']
field:  bleeding
categories:  ['yes', 'no']
field:  shortness_of_breath
categories:  ['yes', 'no']
field:  weight
categories:  ['under 100 lbs', '100-150 lbs', '150-200 lbs', '200-300 lbs', 'Over 300 lbs']
field:  area_of_pain
categories:  ['head', 'chest', 'limb', 'stomach']
template_values = {
"priority": {"freq": 0,"correlated_fields": [{"name": "field_name", "correlated_category_values": {"feature": 0}}]},
"normal": {"freq": 0,"correlated_fields": [{"name": "field_name", "correlated_category_values": {"feature": 0}}]},
"return later": {"freq": 0,"correlated_fields": [{"name": "field_name", "correlated_category_values": {"feature": 0}}]},
}}


In [92]:

correlation_values = {"priority": {"freq": 0.1, "correlated_fields": [{"name": "gender", "correlated_category_values": {"male": .75}}]},
"normal": {"correlated_fields": [{"name": "race", "correlated_category_values": {"White": .75, 'Black or African American': 0.3}}]},
"return later": {"correlated_fields": [{"name": "shortness_of_breath", "correlated_category_values": {"yes": .6, "no": .2}}]}
}

num_rows = 200 # the number of rows in our output dataset
outfile_name = 'medical_priority.csv' # the name of the file we're writing to, should end in '.csv'
print(json.dumps(correlation_values, indent=2))

{
  "priority": {
    "freq": 0.1,
    "correlated_fields": [
      {
        "name": "gender",
        "correlated_category_values": {
          "male": 0.75
        }
      }
    ]
  },
  "normal": {
    "correlated_fields": [
      {
        "name": "race",
        "correlated_category_values": {
          "White": 0.75,
          "Black or African American": 0.3
        }
      }
    ]
  },
  "return later": {
    "correlated_fields": [
      {
        "name": "shortness_of_breath",
        "correlated_category_values": {
          "yes": 0.6,
          "no": 0.2
        }
      }
    ]
  }
}


In [93]:
#write the file
with open(outfile_name, 'w') as outfile:
    writer = csv.writer(outfile, delimiter=',')
    headers = [field['id'] for field in data_descriptor_json['fields']]
    writer.writerow(headers)
    label_val_idx = headers.index(label)
    

        
    for i in range(num_rows):
        row = []
        #object to see how highly they're correlated with diff outcomes
        correlation_counter = {}
        negative_correlation_counter = {}
        for field in data_descriptor_json['fields']:
            field_name = field['id']
            if field_name == label:
                row.append('')
                continue
            values = field["values"]
            categories = values["categories"]
            random_num = random.random()
            random_choice = random.choice(categories)
            
            def check_frequency(choice, val, num, categories):
                if 'freq' in val and num > val['freq']:
                    new_choice = random.choice(categories)
                    new_val = correlation_values[choice]
                    print('new choice: ', new_choice)
                    return check_frequency(new_choice, new_val, num, categories)
                else:
                    return choice
            
            #check against correlation values and increment our relationship accordingly
            for key, value in correlation_values.items():
                for i in value["correlated_fields"]:
                    corr_vals = i['correlated_category_values']
                    if random_choice in corr_vals.keys():
                        if key not in correlation_counter or correlation_counter[key] < corr_vals[random_choice]:
                            if corr_vals[random_choice] > 0.5:
                                correlation_counter[key] = corr_vals[random_choice]
                        if key not in negative_correlation_counter or negative_correlation_counter[key] > corr_vals[random_choice]:
                            if corr_vals[random_choice] < 0.5:
                                negative_correlation_counter[key] = corr_vals[random_choice]
            row.append(random_choice)
        random_label = random.choice(label_values)
        random_number = random.random()
        decision = random_label
        for key, value in correlation_counter.items():
            if decision not in correlation_counter.keys():
                decision = key
            else:
                if correlation_counter[key] > correlation_counter[decision]:
                    decision = key
        trimmed_labels = label_values.copy()
        trimmed_labels.remove(decision)
        
        print('trimmed labels')
        print('initial decision', decision)
        print('random num', random_number)
        decision = check_frequency(decision, correlation_values[decision], random_num, label_values)
        print('final decision', decision)

        if decision in negative_correlation_counter and decision in correlation_counter:
            if 1 - negative_correlation_counter[decision] > correlation_counter[decision] and random_num < negative_correlation_counter[decision]:
                row[label_val_idx] = random.choice(trimmed_labels)
            else:
                if random_num < correlation_counter[decision]:
                    row[label_val_idx] = decision
                else:
                    row[label_val_idx] = random.choice(trimmed_labels)
        else:
            if decision in correlation_counter and random_num < correlation_counter[decision]:
                row[label_val_idx] = decision
            else:
                row[label_val_idx] = random.choice(trimmed_labels)


        print('row, ', row)
        
            
                    
                

                            
                        
      #  writer.writerow(row)

trimmed labels
initial decision return later
random num 0.3611845506462317
final decision return later
row,  ['18-34', 'female', 'Native Hawaiian or Pacific Islander', 'no', 'no', '100-150 lbs', 'chest', 'priority']
trimmed labels
initial decision return later
random num 0.037873160068915945
final decision return later
row,  ['Over 60', 'female', 'Native Hawaiian or Pacific Islander', 'no', 'yes', '100-150 lbs', 'stomach', 'priority']
trimmed labels
initial decision priority
random num 0.6356176722772499
new choice:  priority
new choice:  priority
new choice:  return later
new choice:  priority
final decision priority
row,  ['Under 18', 'male', 'Asian', 'yes', 'yes', '150-200 lbs', 'chest', 'return later']
trimmed labels
initial decision priority
random num 0.7425257472506803
new choice:  priority
new choice:  priority
new choice:  return later
new choice:  normal
final decision normal
row,  ['35-60', 'male', 'American Indian or Alaska Native', 'yes', 'no', '150-200 lbs', 'chest', 'ret

random num 0.7374227753168224
final decision return later
row,  ['35-60', 'female', 'Black or African American', 'yes', 'no', '100-150 lbs', 'limb', 'priority']
trimmed labels
initial decision return later
random num 0.6111592048872286
final decision return later
row,  ['Under 18', 'female', 'American Indian or Alaska Native', 'no', 'yes', '100-150 lbs', 'chest', 'priority']
trimmed labels
initial decision normal
random num 0.07806959676579295
final decision normal
row,  ['Under 18', 'female', 'White', 'yes', 'yes', '150-200 lbs', 'head', 'normal']
trimmed labels
initial decision priority
random num 0.15023449374350706
new choice:  priority
new choice:  priority
new choice:  normal
new choice:  return later
final decision return later
row,  ['35-60', 'male', 'Native Hawaiian or Pacific Islander', 'no', 'yes', '200-300 lbs', 'chest', 'return later']
trimmed labels
initial decision priority
random num 0.581689398095457
new choice:  priority
new choice:  normal
new choice:  normal
final d