In [2]:
import csv
import json
import os
import random

If you're using a JSON file, you can point to it in the code block below. Otherwise, use the second code block to fill out a dict that describes the dataset you're generating. Your data should look something like this:
```
data_descriptor_json = {
  "defaultLabelColumn": "recommendation", #the default label to be predicted
  "fields": [
    {
      "id": "age", #whatever the field contains
      "type": "categorical", #or continuous, or binary
      "values": {
        #if categorical or binary
        "categories": [],
        #if continuous:
        "min": 0,
        "max": 100
      }
    }
   ]
}
```

In [None]:
data_json_file = open('your_file.json')
data_descriptor_json = json.read(data_json_file)

In [3]:
data_descriptor_json = {
  "defaultLabelColumn": "decision",
  "fields": [
    {
      "id": "age",
      "type": "categorical",
      "values": {
        "categories": ["Under 18", "18-34", "35-60", "Over 60"]
      }
    },
    {
      "id": "gender",
      "type": "categorical",
      "values": {
        "categories": ["male", "female"]
      }
    },
    {
      "id": "race",
      "type": "categorical",
      "values": {
        "categories": ["White", "Black or African American", "Asian", "American Indian or Alaska Native", "Native Hawaiian or Pacific Islander"]
      }
    },
    {
      "id": "bleeding",
      "type": "categorical",
      "values": {
        "categories": ["yes", "no"]
      }
    },
    {
      "id": "shortness_of_breath",
      "type": "categorical",
      "values": {
        "categories": ["yes", "no"]
      }
    },
    {
      "id": "weight",
      "type": "categorical",
      "values": {
        "categories": ["under 100 lbs", "100-150 lbs", "150-200 lbs", "200-300 lbs", "Over 300 lbs"]
      }
    },
    {
      "id": "area_of_pain",
      "type": "categorical",
      "values": {
        "categories": ["head", "chest", "limb", "stomach"]
      }
    },
    {
      "id": "decision",
      "type": "categorical",
      "values": {
        "categories": ["priority", "normal", "return later"]
      }
    }
  ]
}

Here we set up our constants. Add more correlation values as needed - the idea is that we assign a dict to a category of the outcome we're predicting. If a certain other feature has a category that correlates to that outcome, add that feature as a key to this dict, and then give it a "values" attribute containing a list with the index of the category or categories that correspond. So, if we have a data description like this, where we're trying to determine if a patient is sick:
``` 
{ 
 "defaultLabelColumn": "is_sick",
 "fields": [
     {
         "id": "nose_status"
         "type: "categorical"
         "values": {
         "categories": [
         "runny", "bleeding", "dry", "exploded"
         ]
     },
     {
         "id": "is_sick"
         "type: "categorical"
         "values": {
         "categories": [
         "no", "maybe", "definitely", "extremely"
         ]
     },
 }
```
 And, as a rule, if a patient's nose is dry they're always fine but otherwise the worse symptoms they have the more likely they are to be sick, we can structure our correlation as follows:
 
```
correlation_values = {
    "label": data_descriptor_json['defaultLabelColumn'],
    "values": {
    #here we use the categories of outcome as keys:
        "no": {
            "correlation": { 1: 1 }, #here we have a one-to-one correlation
            "nose_status": {
                "values": [2] # here we use the index of "dry" to determine that, if the nose is dry, they're fine
            }
        },
        "maybe": {
            "correlation": {1: .6 },
                "nose_status": {
                "values": [0]
            },
         "definitely": {
            "correlation": {1: .7 },
                "nose_status": {
                "values": [1]
            },  
          "extremely": 
             {
           "correlation": {1: .8 },
                "nose_status": {
                "values": [3]
            }, 
        }

In [4]:

label = data_descriptor_json['defaultLabelColumn']
correlation_values = {
    "label": label, 
    "values": {
        "priority": {
            "correlation": {1: .5, 2: .6, 3: .65, 4: .9},
            "bleeding": {
                "values": [0]
            },
            "shortness_of_breath": {
                "values": [0]
            },
            "area_of_pain": {
                "values": [0, 1]
            }, 
            "weight": {
                "values": [0, 4]
            }
        }, 
        "return_later": 
        {
            "correlation": {1: 1},
            "race": {
                "values": [3, 4], 
                "limit": 0.05
            }
        }
    }
}
num_rows = 300 # the number of rows in our output dataset
outfile_name = 'medical_priority.csv' # the name of the file we're writing to, should end in '.csv'
print(json.dumps(correlation_values, indent=2))

{
  "label": "decision",
  "values": {
    "priority": {
      "correlation": {
        "1": 0.5,
        "2": 0.6,
        "3": 0.65,
        "4": 0.9
      },
      "bleeding": {
        "values": [
          0
        ]
      },
      "shortness_of_breath": {
        "values": [
          0
        ]
      },
      "area_of_pain": {
        "values": [
          0,
          1
        ]
      },
      "weight": {
        "values": [
          0,
          4
        ]
      }
    },
    "return_later": {
      "correlation": {
        "1": 1
      },
      "race": {
        "values": [
          3,
          4
        ],
        "limit": 0.05
      }
    }
  }
}


In [28]:
#write the file
with open(outfile_name, 'w') as outfile:
    writer = csv.writer(outfile, delimiter=',')
    headers = [field['id'] for field in data_descriptor_json['fields']]
    writer.writerow(headers)
    label_val_idx = headers.index(label)
    
    #TODO control for limits
    #TODO negative correlation? wait i guess this does work that way since we're just doing positive correlation to a different category
    
    #FIXME uhhh this is real confusing and brute forced
    #abandon hope all ye who enter here
    #definitely need to improve variable names there are about five different 'values' in there
    #reduce nesting?
    #break into functions so we can just skip if field isn't in correlation dict?
    for i in range(num_rows):
        row = []
        #object to see how highly they're correlated with diff outcomes
        correlation_counter = {}
        for field in data_descriptor_json['fields']:
            field_name = field['id']
            if field_name == label:
                row.append('')
                continue
            values = field["values"]
            categories = values["categories"]
            correlation_value_list = correlation_values["values"]

            random_num = random.random()
            random_choice = random.choice(categories)

                #check against correlation values and increment our relationship accordingly
            for key, value in correlation_value_list.items():
                for k, v in value.items():
                    if k == 'correlation':
                        continue
                    if 'limit' in v and k == field_name:

                        #if there's a limit on the number of total datapoints with this value, meet it
                        
                        if random_num > v['limit']:
                            def reset_random_choice(choice):
                                if categories.index(choice) in v['values']:
                                    choice = random.choice(categories)
                                    return reset_random_choice(choice)
                                else:
                                    return choice
                            random_choice = reset_random_choice(random_choice)
                        else:
                            random_choice = categories[random.choice(v['values'])]
                            
                    for i in v['values']:
                        if i < len(categories) and field['id'] == k and random_choice == categories[i]:
                            if key in correlation_counter:
                                correlation_counter[key] += 1
                                break
                            else:
                                correlation_counter[key] = 1
                                break
                                
                        #add randomly selected thing to row if there's not already a correlation forming
            if random_choice not in correlation_counter or random_num > correlation_counter[random_choice]:
                row.append(random_choice)
            else:
                #if there is a correlation forming, select from correlated values
                random_choice = random.choice(correlation_value_list[field['id']]['values'])
                row.append(field['categories'][random_choice])
        # print('correlation counter: ', correlation_counter)
        get_decision = random.random()
        decision_value = ''
        decision_corr = 0
        #check if correlation happened, if so make more likely that we meet that condition
        for key, value in correlation_value_list.items():
            if key not in correlation_counter:
                continue
            if value["correlation"][correlation_counter[key]] > decision_corr:
                decision_corr = value["correlation"][correlation_counter[key]]
                decision_value = key
        if get_decision < decision_corr:
            row[label_val_idx] = decision_value
        else:
            row[label_val_idx] = random.choice(data_descriptor_json['fields'][label_val_idx]['values']['categories'])
        print('row:', row)      
                            
                        
        writer.writerow(row)

row: ['35-60', 'female', 'Black or African American', 'no', 'yes', 'under 100 lbs', 'head', 'priority']
row: ['Over 60', 'male', 'American Indian or Alaska Native', 'no', 'no', '100-150 lbs', 'head', 'return_later']
row: ['Over 60', 'female', 'Black or African American', 'yes', 'yes', 'under 100 lbs', 'chest', 'priority']
row: ['Over 60', 'male', 'White', 'no', 'yes', '200-300 lbs', 'limb', 'return later']
row: ['35-60', 'female', 'Black or African American', 'yes', 'no', 'Over 300 lbs', 'limb', 'priority']
row: ['Under 18', 'male', 'Asian', 'yes', 'no', '200-300 lbs', 'stomach', 'normal']
row: ['35-60', 'female', 'White', 'yes', 'yes', 'Over 300 lbs', 'chest', 'priority']
row: ['Over 60', 'male', 'Black or African American', 'yes', 'yes', '150-200 lbs', 'head', 'priority']
row: ['35-60', 'male', 'White', 'yes', 'yes', 'Over 300 lbs', 'limb', 'priority']
row: ['18-34', 'male', 'Black or African American', 'yes', 'no', '100-150 lbs', 'head', 'priority']
row: ['Over 60', 'female', 'Black