In [0]:

# coding: utf-8

# In[3]:


import dataiku, random
import pandas as pd
client = dataiku.api_client()
project = client.get_default_project()


# In[1]:


NB_INPUT_DATASETS = 13
NB_COLS = 40
NB_RECIPES = 60
USE_ZONES = True
NB_ZONES = 3


# In[4]:


zones = []
if USE_ZONES:
    for i in range(NB_ZONES):
        new_zone = project.get_flow().create_zone("zone %s" % i)
        zones.append(new_zone)


# In[5]:


input_data = {}
for col in range(NB_COLS):
    input_data["col_%s" % col] = [random.randint(0, 70) for row in range(100)]
pd.DataFrame(input_data).to_csv("input_data.csv", index=False)


# In[6]:


for i in range(NB_INPUT_DATASETS +1):
    dataset = project.create_upload_dataset("input_%s" % i)
    with open("input_data.csv", "rb") as f:
        dataset.uploaded_add_file(f, "input_data.csv")
    dataset.autodetect_settings(True).save()
#    project.create_filesystem_dataset("input_%s" % i, "filesystem_managed", "/input_%s" % i)


# In[7]:


managed_datasets = 0

def random_input():
    return "input_%s" % random.randint(0, NB_INPUT_DATASETS)

def random_managed():
    global managed_datasets
    return "managed_%s" % int(random.betavariate(0.6, 0.3) * (managed_datasets-1))

def add_output(creator):
    global managed_datasets
    name = "managed_%s" % managed_datasets
    creator.with_new_output(name, "filesystem_managed", format_option_id="CSV_EXCEL_GZIP")
    managed_datasets += 1
    return name

def add_output_py(creator):
    global managed_datasets
    global project
    name = "managed_%s" % managed_datasets
    creator.with_new_output_dataset(name, "filesystem_managed", format="CSV_EXCEL_GZIP")
    
    dataset = project.get_dataset("managed_%s" % managed_datasets)
    dataset.set_schema({"columns": [{"name":"col_%s" % col, "type": "bigint"} for col in range(NB_COLS)]})
    
    managed_datasets += 1
    return name

    
def add_input(creator):
    global managed_datasets
    r2 = random.random()
    if r2 < 0.2 or managed_datasets <= 3:
        creator.with_input(random_input())
    else:
        creator.with_input(random_managed())
    
for i in range(NB_RECIPES):
    r = random.random()
    r2 = random.random()
    
    created_datasets = []
    
    if r < 0.4:
        # Sync
        creator = project.new_recipe("sync")
        add_input(creator)
        created_datasets.append(add_output(creator))
        creator.create()
        
#    elif r < 0.4:
#        # Prepare
#        creator = project.new_recipe("prepare")
#        add_input(creator)
#        add_output(creator)
#        creator.create()
    elif r < 0.6:
         # Python with 2 inputs
        creator = project.new_recipe("python")
        for inp in range(0,2):
            add_input(creator)
        created_datasets.append(add_output_py(creator))
        creator.with_script("""
import dataiku
from dataiku import recipe
in0 = recipe.get_inputs()[0]
df = in0.get_dataframe()
out0 = recipe.get_outputs()[0]
out0.write_with_schema(df)
""")
        creator.create()
    elif r < 0.8:
        # Python with 2 outputs
        creator = project.new_recipe("python")
        add_input(creator)
        for inp in range(0,2):
            created_datasets.append(add_output_py(creator))
        creator.with_script("""
import dataiku
from dataiku import recipe
in0 = recipe.get_inputs()[0]
df = in0.get_dataframe()
out0 = recipe.get_outputs()[0]
out0.write_with_schema(df)
out1 = recipe.get_outputs()[1]
out1.write_with_schema(df)
""")
        creator.create()
    else:
        creator = project.new_recipe("join")
        for inp in range(0,2):
            add_input(creator)
        created_datasets.append(add_output(creator))
        recipe = creator.create()
        
        settings = recipe.get_settings()
        settings.obj_payload["joins"] = []
        join = settings.add_join()
        settings.add_condition_to_join(join,
                                    column1="col_%s" % random.randint(0, NB_COLS-1),
                                   column2="col_%s" % random.randint(0, NB_COLS-1)
                                  )
        settings.save()

    if USE_ZONES:
        for created_dataset in created_datasets:
            max_zone = int(i * NB_ZONES/NB_RECIPES)
            zone_idx = int(random.betavariate(0.9, 0.1) * (max_zone))
            print("i=%s max_zone=%s zone_idx=%s" % (i, max_zone, zone_idx))
            project.get_dataset(created_dataset).move_to_zone(zones[zone_idx])


# In[8]:


# Move all outputs to an "outputs" zone so that it's not *too* messy
if USE_ZONES:
    target_zone = project.get_flow().create_zone("Outputs", color="#C13344")
    graph = project.get_flow().get_graph()
    
    for node in graph.nodes.values():
        if len(node["successors"]) == 0 and node["type"] == "COMPUTABLE_DATASET":
            print("Moving node %s to output" % node)
            graph._get_object_from_graph_node(node).move_to_zone(target_zone)
    





i=0 max_zone=0 zone_idx=0
i=0 max_zone=0 zone_idx=0
i=1 max_zone=0 zone_idx=0
i=2 max_zone=0 zone_idx=0
i=3 max_zone=0 zone_idx=0
i=4 max_zone=0 zone_idx=0
i=5 max_zone=0 zone_idx=0
i=5 max_zone=0 zone_idx=0
i=6 max_zone=0 zone_idx=0
i=7 max_zone=0 zone_idx=0
i=7 max_zone=0 zone_idx=0
i=8 max_zone=0 zone_idx=0
i=9 max_zone=0 zone_idx=0
i=10 max_zone=0 zone_idx=0
i=11 max_zone=0 zone_idx=0
i=12 max_zone=0 zone_idx=0
i=13 max_zone=0 zone_idx=0
i=14 max_zone=0 zone_idx=0
i=15 max_zone=0 zone_idx=0
i=15 max_zone=0 zone_idx=0
i=16 max_zone=0 zone_idx=0
i=17 max_zone=0 zone_idx=0
i=18 max_zone=0 zone_idx=0
i=18 max_zone=0 zone_idx=0
i=19 max_zone=0 zone_idx=0
i=20 max_zone=1 zone_idx=1
i=21 max_zone=1 zone_idx=0
i=22 max_zone=1 zone_idx=0
i=23 max_zone=1 zone_idx=0
i=23 max_zone=1 zone_idx=0
i=24 max_zone=1 zone_idx=0
i=24 max_zone=1 zone_idx=0
i=25 max_zone=1 zone_idx=0
i=26 max_zone=1 zone_idx=0
i=27 max_zone=1 zone_idx=0
i=28 max_zone=1 zone_idx=0
i=29 max_zone=1 zone_idx=0
i=30 max_zone=