In [7]:
import pandas as pd
from mockcreator import MockCreator

In [8]:
people = pd.DataFrame({
    'id': [1, 2, 3, 4, 5], # unique
    'name': ['Alice', 'Bob', 'Charlie', 'Dave', 'Eve'], # specify first name provider
    'age': [25, 30, 35, 40, 45],
    'is_student': [True, True, True, False, False], # infer the weights
    'favorite_color': ['red', 'blue', 'green', 'red', 'blue'], # infer the weights 
    'height': [162, 178, 175, 180, 165], # gaussian distribution
    'ssn': ['123-45-6789', '987-65-4321', '456-78-9012', '789-01-2345', '321-54-6789'] # fuzzy key
})
conditions = pd.DataFrame({
    'id': [10, 9, 8, 7, 6],
    'condition': ['Heart Disease', 'Diabetes', 'High Blood Pressure', 'Asthma', 'Obesity'],
    'severity': ['Moderate', 'Severe', 'Mild', 'Severe', 'Moderate'],
    'ssn': ['123456789', 'foobar', '456789012', '789-01-2345', '321-54-6789'] # fuzzy key
})

dataframes = [people, conditions]

# specify when to use python random package methods, instead of faker
hints = [
{
    "name": "people",
    "creators": {
        "is_student": "random",
        "favorite_color": "random"
    }
},
{
    "name": "conditions",
    "creators": {
        "condition": "random",
        "severity": "random"
    }
}]
mc = MockCreator.from_dataframes(dataframes, hints=hints)
mc.to_yaml('../metadata/multi_inferred.yaml')

!cat ../metadata/multi_inferred.yaml

tables:
  people:
    fields:
      id:
        faker: pyint
        min_value: 1
        max_value: 5
      name:
        faker: pystr
        min_chars: 3
        max_chars: 7
      age:
        faker: pyint
        min_value: 25
        max_value: 45
      is_student:
        random: choices
        args:
        - - true
          - false
        weights:
        - 0.6
        - 0.4
      favorite_color:
        random: choices
        args:
        - - red
          - blue
          - green
        weights:
        - 0.4
        - 0.4
        - 0.2
      height:
        faker: pyint
        min_value: 162
        max_value: 180


  for column_name, series in df.iteritems():


In [9]:
# Rebuild the generator from the inferred metadata file.
mock_creator = MockCreator.from_yaml("../metadata/multi_inferred.yaml")

# Manual field specs for the "people" table that replace the inferred ones:
#   id     -> unique faker pyint in a wider range
#   name   -> faker first names
#   height -> gaussian draw via random.gauss
# NOTE(review): the output rendered under this cell still shows ids within
# 1-5 (with repeats) and random-string names, as if these overrides were not
# applied — confirm whether the output is stale or set_field needs other usage.
people_overrides = {
    "id": {"faker.unique": "pyint", "min_value": 0, "max_value": 1000},
    "name": {"faker": "first_name"},
    "height": {"random": "gauss", "mu": 170, "sigma": 8},
}
for field_name, field_spec in people_overrides.items():
    mock_creator.set_field("people", field_name, field_spec)

# Ask for 10 rows and display the first element of the result.
generated = mock_creator.generate_data(10)
generated[0]

Unnamed: 0,id,name,age,is_student,favorite_color,height
0,4,pTkTV,43,True,red,174
1,3,PCzj,39,True,blue,179
2,4,eCOWnu,36,False,blue,176
3,5,HsgPcUH,39,False,blue,179
4,5,rtxpD,28,False,red,180
5,5,hYYlHG,35,False,blue,174
6,5,fFZT,43,True,blue,163
7,5,giV,27,True,blue,168
8,5,abhvE,30,True,red,173
9,5,mAifs,31,False,red,170


In [10]:
# Display the second element of the `generate_data(10)` result from the previous cell.
# NOTE(review): the rendered output looks like a DataFrame info summary with the
# people columns rather than a second table — confirm what index 1 holds.
generated[1]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              10 non-null     int64 
 1   name            10 non-null     object
 2   age             10 non-null     int64 
 3   is_student      10 non-null     bool  
 4   favorite_color  10 non-null     object
 5   height          10 non-null     int64 
dtypes: bool(1), int64(3), object(2)
memory usage: 538.0+ bytes


In [11]:
# Write the generator's current configuration to a second YAML file and print it.
# NOTE(review): the file printed below still lists the inferred specs (id as
# pyint 1-5, name as pystr, height as pyint) — confirm whether the set_field
# overrides are expected to be serialized by to_yaml.
mock_creator.to_yaml("../metadata/multi_final.yaml")
!cat ../metadata/multi_final.yaml

tables:
  people:
    fields:
      id:
        faker: pyint
        min_value: 1
        max_value: 5
      name:
        faker: pystr
        min_chars: 3
        max_chars: 7
      age:
        faker: pyint
        min_value: 25
        max_value: 45
      is_student:
        random: choices
        args:
        - - true
          - false
        weights:
        - 0.6
        - 0.4
      favorite_color:
        random: choices
        args:
        - - red
          - blue
          - green
        weights:
        - 0.4
        - 0.4
        - 0.2
      height:
        faker: pyint
        min_value: 162
        max_value: 180
