In [1]:
import pandas as pd
from mockcreator import MockCreator

In [2]:
# Example source table: one row per person. MockCreator infers a generator
# for every column from these sample values.
people = pd.DataFrame(dict(
    # Unique identifier; overridden with a `faker.unique` generator in a later cell.
    id=[1, 2, 3, 4, 5],
    # First names; the Faker `first_name` provider is assigned in a later cell.
    name=['Alice', 'Bob', 'Charlie', 'Dave', 'Eve'],
    age=[25, 30, 35, 40, 45],
    # Categorical columns: the choice weights are inferred from value frequencies.
    is_student=[True, True, True, False, False],
    favorite_color=['red', 'blue', 'green', 'red', 'blue'],
    # Replaced by a gaussian (`random: gauss`) generator in a later cell.
    height=[162, 178, 175, 180, 165],
    # Key column used for the fuzzy relation with `conditions`.
    ssn=['123-45-6789', '987-65-4321', '456-78-9012', '789-01-2345', '321-54-6789'],
))

# Example source table: one row per medical condition.
conditions = pd.DataFrame(dict(
    id=[10, 9, 8, 7, 6],
    condition=['Heart Disease', 'Diabetes', 'High Blood Pressure', 'Asthma', 'Obesity'],
    severity=['Moderate', 'Severe', 'Mild', 'Severe', 'Moderate'],
    # Fuzzy key: only the last two values match people['ssn'] exactly; the
    # others are undashed variants or an unrelated string.
    ssn=['123456789', 'foobar', '456789012', '789-01-2345', '321-54-6789'],
))

# The list order (people first, conditions second) is the order handed to MockCreator.
dataframes = [people, conditions]

# Per-table hints: the listed columns are generated with methods from
# Python's `random` package instead of Faker providers.
# One entry per table, in the same order as `dataframes`.
hints = [
    {"name": table_name, "creators": dict.fromkeys(random_columns, "random")}
    for table_name, random_columns in (
        ("people", ["is_student", "favorite_color"]),
        ("conditions", ["condition", "severity"]),
    )
]
mc = MockCreator.from_dataframes(dataframes, hints=hints)
mc.to_yaml('../metadata/multi_inferred.yaml')

!cat ../metadata/multi_inferred.yaml

tables:
  people:
    fields:
      id:
        faker: pyint
        min_value: 1
        max_value: 5
      name:
        faker: pystr
        min_chars: 3
        max_chars: 7
      age:
        faker: pyint
        min_value: 25
        max_value: 45
      is_student:
        random: choices
        args:
        - - true
          - false
        weights:
        - 0.6
        - 0.4
      favorite_color:
        random: choices
        args:
        - - red
          - blue
          - green
        weights:
        - 0.4
        - 0.4
        - 0.2
      height:
        faker: pyint
        min_value: 162
        max_value: 180
      ssn:
        faker: pystr
        min_chars: 11
        max_chars: 11
  conditions:
    fields:
      id:
        faker: pyint
        min_value: 6
        max_value: 10
      condition:
        random: choices
        args:
        - - Heart Disease
          - Diabetes
          - High Blood Pressure
          - Asthma
          - Obesity
        we

  for column_name, series in df.iteritems():
  for column_name, series in df.iteritems():


In [3]:
# Reload the inferred specification from the YAML file written earlier.
mock_creator = MockCreator.from_yaml("../metadata/multi_inferred.yaml")

# Replace inferred generators with explicit ones where inference is too generic.
# people.id: unique integers drawn from a wider range than the five samples.
mock_creator.set_field(
    "people",
    "id",
    {"faker.unique": "pyint", "min_value": 0, "max_value": 1000},
)
# people.name: Faker first names instead of the inferred `pystr` strings.
mock_creator.set_field(
    "people",
    "name",
    {"faker": "first_name"},
)
# people.height: gaussian distribution (mean 170, standard deviation 8)
# instead of the inferred `pyint` range.
mock_creator.set_field(
    "people",
    "height",
    {"random": "gauss", "mu": 170, "sigma": 8},
)

## Specify relation

In [4]:
# Multi-table configuration.
# Fuzzy relation from conditions.ssn to people.ssn. In the sample output for
# the conditions table most ssn values equal a people.ssn value, one is a
# slightly altered copy, and some are unrelated strings.
# NOTE(review): the weights presumably give the share of rows per outcome
# (exact match / fuzzy match / no match) -- confirm against MockCreator docs.
relation = dict(
    relation="fuzzy_match",
    ref="people.ssn",
    weights=dict(
        match=0.7,
        fuzzy=0.1,
        no_match=0.2,
    ),
)

# people.ssn is the referenced key: generate it with Faker's `ssn` provider.
mock_creator.set_field("people", "ssn", {"faker": "ssn"})
# conditions.ssn is derived from people.ssn through the relation above.
mock_creator.set_field("conditions", "ssn", relation)

# Generate both tables and display the first one (people).
# NOTE(review): no seed is set here, so re-running presumably yields different
# values than the output shown -- confirm whether MockCreator supports seeding.
generated = mock_creator.generate_data(10)
generated[0]

Unnamed: 0,id,name,age,is_student,favorite_color,height,ssn
0,179,Kimberly,27,True,red,167.264516,838-52-5498
1,574,Michele,27,False,red,181.441067,485-24-2768
2,718,Brenda,42,True,green,155.003935,792-77-4654
3,109,Jeremiah,29,True,green,173.052772,491-26-0332
4,529,Marc,43,True,blue,154.797571,429-24-0767
5,577,Stephanie,30,False,blue,169.377745,568-87-5514
6,194,Debra,34,True,red,179.519765,829-04-6691
7,658,Samantha,26,True,blue,160.659994,192-16-8582
8,826,Vincent,35,True,blue,187.146867,520-42-7403
9,425,Kevin,31,True,green,159.945326,867-97-6562


In [5]:
# Display the second generated table; its columns (id, condition, severity,
# ssn) are those of the `conditions` table.
generated[1]

Unnamed: 0,id,condition,severity,ssn
0,10,Heart Disease,Mild,429-24-0767
1,8,Asthma,Severe,429-24-0767
2,9,High Blood Pressure,Moderate,838-52-5498
3,6,Diabetes,Moderate,792-77-4654
4,7,Diabetes,Mild,520-42-7403
5,7,High Blood Pressure,Mild,429-24-0767
6,10,Diabetes,Moderate,192-16-8582
7,6,Diabetes,Moderate,829-04-6691_
8,10,High Blood Pressure,Mild,TKgQdIKzGLaFfdQTHztf
9,9,Diabetes,Moderate,PMyeSloRbFvMxMXuEGJm


In [6]:
# Save the final specification, including the field overrides made in the
# earlier cells, then print the file to inspect the result.
mock_creator.to_yaml("../metadata/multi_final.yaml")
!cat ../metadata/multi_final.yaml

tables:
  people:
    fields:
      id:
        faker.unique: pyint
        min_value: 0
        max_value: 1000
      name:
        faker: first_name
      age:
        faker: pyint
        min_value: 25
        max_value: 45
      is_student:
        random: choices
        args:
        - - true
          - false
        weights:
        - 0.6
        - 0.4
      favorite_color:
        random: choices
        args:
        - - red
          - blue
          - green
        weights:
        - 0.4
        - 0.4
        - 0.2
      height:
        random: gauss
        mu: 170
        sigma: 8
      ssn:
        faker: ssn
  conditions:
    fields:
      id:
        faker: pyint
        min_value: 6
        max_value: 10
      condition:
        random: choices
        args:
        - - Heart Disease
          - Diabetes
          - High Blood Pressure
          - Asthma
          - Obesity
        weights:
        - 0.2
        - 0.2
        - 0.2
        - 0.2
        - 0.2
      se