# Using Draco for Visualization Design Space Exploration
https://github.com/cmudig/draco2/blob/main/docs/applications/design_space_exploration.ipynb

In [18]:
# Display utilities
import json

import numpy as np
from IPython.display import Markdown, display


# Handles serialization of common numpy datatypes
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)


def md(markdown: str):
    display(Markdown(markdown))


def pprint(obj):
    md(f"```json\n{json.dumps(obj, indent=2, cls=NpEncoder)}\n```")

## Loading the Data

In [19]:
import altair as alt
import pandas as pd
from vega_datasets import data as vega_data

import draco as drc

us_employment_data : pd.DataFrame = vega_data("us-employment")
print(us_employment_data.head())
print(us_employment_data.dtypes)

        month  nonfarm  private  goods_producing  service_providing  \
0  2006-01-01   135450   113603            22467             112983   
1  2006-02-01   135762   113884            22535             113227   
2  2006-03-01   136059   114156            22572             113487   
3  2006-04-01   136227   114308            22631             113596   
4  2006-05-01   136258   114332            22597             113661   

   private_service_providing  mining_and_logging  construction  manufacturing  \
0                      91136                 656          7601          14210   
1                      91349                 662          7664          14209   
2                      91584                 669          7689          14214   
3                      91677                 679          7726          14226   
4                      91735                 681          7713          14203   

   durable_goods  ...  transportation_and_warehousing  utilities  information  \
0    

We can use the `schema_from_dataframe` function to generate the schema of the dataset, including the data types of each column and their statistical properties.

In [20]:
data_schema = drc.schema_from_dataframe(us_employment_data)
pprint(data_schema)

```json
{
  "number_rows": 120,
  "field": [
    {
      "name": "month",
      "type": "string",
      "unique": 120,
      "entropy": 4787,
      "freq": 1
    },
    {
      "name": "nonfarm",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 129726,
      "max": 143093,
      "std": 3577
    },
    {
      "name": "private",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 107250,
      "max": 120993,
      "std": 3706
    },
    {
      "name": "goods_producing",
      "type": "number",
      "unique": 118,
      "entropy": 4764,
      "min": 17627,
      "max": 22631,
      "std": 1694
    },
    {
      "name": "service_providing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 111989,
      "max": 123356,
      "std": 2999
    },
    {
      "name": "private_service_providing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 89507,
      "max": 101256,
      "std": 3149
    },
    {
      "name": "mining_and_logging",
      "type": "number",
      "unique": 103,
      "entropy": 4582,
      "min": 656,
      "max": 904,
      "std": 74
    },
    {
      "name": "construction",
      "type": "number",
      "unique": 116,
      "entropy": 4741,
      "min": 5427,
      "max": 7726,
      "std": 811
    },
    {
      "name": "manufacturing",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 11453,
      "max": 14226,
      "std": 916
    },
    {
      "name": "durable_goods",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 6985,
      "max": 9028,
      "std": 654
    },
    {
      "name": "nondurable_goods",
      "type": "number",
      "unique": 96,
      "entropy": 4469,
      "min": 4434,
      "max": 5228,
      "std": 270
    },
    {
      "name": "trade_transportation_utilties",
      "type": "number",
      "unique": 115,
      "entropy": 4730,
      "min": 24475,
      "max": 27037,
      "std": 760
    },
    {
      "name": "wholesale_trade",
      "type": "number",
      "unique": 118,
      "entropy": 4760,
      "min": 5439,
      "max": 6041,
      "std": 182
    },
    {
      "name": "retail_trade",
      "type": "number",
      "unique": 119,
      "entropy": 4776,
      "min": 14326,
      "max": 15704,
      "std": 413
    },
    {
      "name": "transportation_and_warehousing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 4117,
      "max": 4950,
      "std": 196
    },
    {
      "name": "utilities",
      "type": "number",
      "unique": 86,
      "entropy": 4350,
      "min": 546,
      "max": 563,
      "std": 3
    },
    {
      "name": "information",
      "type": "number",
      "unique": 90,
      "entropy": 4392,
      "min": 2634,
      "max": 3055,
      "std": 143
    },
    {
      "name": "financial_activities",
      "type": "number",
      "unique": 113,
      "entropy": 4702,
      "min": 7676,
      "max": 8394,
      "std": 246
    },
    {
      "name": "professional_and_business_services",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 16392,
      "max": 19892,
      "std": 931
    },
    {
      "name": "education_and_health_services",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 17946,
      "max": 22318,
      "std": 1185
    },
    {
      "name": "leisure_and_hospitality",
      "type": "number",
      "unique": 116,
      "entropy": 4741,
      "min": 12927,
      "max": 15408,
      "std": 706
    },
    {
      "name": "other_services",
      "type": "number",
      "unique": 100,
      "entropy": 4556,
      "min": 5315,
      "max": 5652,
      "std": 91
    },
    {
      "name": "government",
      "type": "number",
      "unique": 110,
      "entropy": 4672,
      "min": 21810,
      "max": 22996,
      "std": 276
    },
    {
      "name": "nonfarm_change",
      "type": "number",
      "unique": 101,
      "entropy": 4564,
      "min": -802,
      "max": 522,
      "std": 261
    }
  ]
}
```

We transform the data schema into a set of facts that Draco can use to reason about the data when generating recommendations. We use the `dict_to_facts` function to do so which takes a dictionary and returns a list of facts.
The output list of facts encodes the same information as the input dictionary, it is just a different representation that we can feed into [Clingo](https://potassco.org/clingo/) under the hood.

In [21]:
data_schema_facts = drc.dict_to_facts(data_schema)
pprint(data_schema_facts)

```json
[
  "attribute(number_rows,root,120).",
  "entity(field,root,0).",
  "attribute((field,name),0,month).",
  "attribute((field,type),0,string).",
  "attribute((field,unique),0,120).",
  "attribute((field,entropy),0,4787).",
  "attribute((field,freq),0,1).",
  "entity(field,root,1).",
  "attribute((field,name),1,nonfarm).",
  "attribute((field,type),1,number).",
  "attribute((field,unique),1,120).",
  "attribute((field,entropy),1,4787).",
  "attribute((field,min),1,129726).",
  "attribute((field,max),1,143093).",
  "attribute((field,std),1,3577).",
  "entity(field,root,2).",
  "attribute((field,name),2,private).",
  "attribute((field,type),2,number).",
  "attribute((field,unique),2,120).",
  "attribute((field,entropy),2,4787).",
  "attribute((field,min),2,107250).",
  "attribute((field,max),2,120993).",
  "attribute((field,std),2,3706).",
  "entity(field,root,3).",
  "attribute((field,name),3,goods_producing).",
  "attribute((field,type),3,number).",
  "attribute((field,unique),3,118).",
  "attribute((field,entropy),3,4764).",
  "attribute((field,min),3,17627).",
  "attribute((field,max),3,22631).",
  "attribute((field,std),3,1694).",
  "entity(field,root,4).",
  "attribute((field,name),4,service_providing).",
  "attribute((field,type),4,number).",
  "attribute((field,unique),4,120).",
  "attribute((field,entropy),4,4787).",
  "attribute((field,min),4,111989).",
  "attribute((field,max),4,123356).",
  "attribute((field,std),4,2999).",
  "entity(field,root,5).",
  "attribute((field,name),5,private_service_providing).",
  "attribute((field,type),5,number).",
  "attribute((field,unique),5,120).",
  "attribute((field,entropy),5,4787).",
  "attribute((field,min),5,89507).",
  "attribute((field,max),5,101256).",
  "attribute((field,std),5,3149).",
  "entity(field,root,6).",
  "attribute((field,name),6,mining_and_logging).",
  "attribute((field,type),6,number).",
  "attribute((field,unique),6,103).",
  "attribute((field,entropy),6,4582).",
  "attribute((field,min),6,656).",
  "attribute((field,max),6,904).",
  "attribute((field,std),6,74).",
  "entity(field,root,7).",
  "attribute((field,name),7,construction).",
  "attribute((field,type),7,number).",
  "attribute((field,unique),7,116).",
  "attribute((field,entropy),7,4741).",
  "attribute((field,min),7,5427).",
  "attribute((field,max),7,7726).",
  "attribute((field,std),7,811).",
  "entity(field,root,8).",
  "attribute((field,name),8,manufacturing).",
  "attribute((field,type),8,number).",
  "attribute((field,unique),8,117).",
  "attribute((field,entropy),8,4753).",
  "attribute((field,min),8,11453).",
  "attribute((field,max),8,14226).",
  "attribute((field,std),8,916).",
  "entity(field,root,9).",
  "attribute((field,name),9,durable_goods).",
  "attribute((field,type),9,number).",
  "attribute((field,unique),9,117).",
  "attribute((field,entropy),9,4753).",
  "attribute((field,min),9,6985).",
  "attribute((field,max),9,9028).",
  "attribute((field,std),9,654).",
  "entity(field,root,10).",
  "attribute((field,name),10,nondurable_goods).",
  "attribute((field,type),10,number).",
  "attribute((field,unique),10,96).",
  "attribute((field,entropy),10,4469).",
  "attribute((field,min),10,4434).",
  "attribute((field,max),10,5228).",
  "attribute((field,std),10,270).",
  "entity(field,root,11).",
  "attribute((field,name),11,trade_transportation_utilties).",
  "attribute((field,type),11,number).",
  "attribute((field,unique),11,115).",
  "attribute((field,entropy),11,4730).",
  "attribute((field,min),11,24475).",
  "attribute((field,max),11,27037).",
  "attribute((field,std),11,760).",
  "entity(field,root,12).",
  "attribute((field,name),12,wholesale_trade).",
  "attribute((field,type),12,number).",
  "attribute((field,unique),12,118).",
  "attribute((field,entropy),12,4760).",
  "attribute((field,min),12,5439).",
  "attribute((field,max),12,6041).",
  "attribute((field,std),12,182).",
  "entity(field,root,13).",
  "attribute((field,name),13,retail_trade).",
  "attribute((field,type),13,number).",
  "attribute((field,unique),13,119).",
  "attribute((field,entropy),13,4776).",
  "attribute((field,min),13,14326).",
  "attribute((field,max),13,15704).",
  "attribute((field,std),13,413).",
  "entity(field,root,14).",
  "attribute((field,name),14,transportation_and_warehousing).",
  "attribute((field,type),14,number).",
  "attribute((field,unique),14,120).",
  "attribute((field,entropy),14,4787).",
  "attribute((field,min),14,4117).",
  "attribute((field,max),14,4950).",
  "attribute((field,std),14,196).",
  "entity(field,root,15).",
  "attribute((field,name),15,utilities).",
  "attribute((field,type),15,number).",
  "attribute((field,unique),15,86).",
  "attribute((field,entropy),15,4350).",
  "attribute((field,min),15,546).",
  "attribute((field,max),15,563).",
  "attribute((field,std),15,3).",
  "entity(field,root,16).",
  "attribute((field,name),16,information).",
  "attribute((field,type),16,number).",
  "attribute((field,unique),16,90).",
  "attribute((field,entropy),16,4392).",
  "attribute((field,min),16,2634).",
  "attribute((field,max),16,3055).",
  "attribute((field,std),16,143).",
  "entity(field,root,17).",
  "attribute((field,name),17,financial_activities).",
  "attribute((field,type),17,number).",
  "attribute((field,unique),17,113).",
  "attribute((field,entropy),17,4702).",
  "attribute((field,min),17,7676).",
  "attribute((field,max),17,8394).",
  "attribute((field,std),17,246).",
  "entity(field,root,18).",
  "attribute((field,name),18,professional_and_business_services).",
  "attribute((field,type),18,number).",
  "attribute((field,unique),18,117).",
  "attribute((field,entropy),18,4753).",
  "attribute((field,min),18,16392).",
  "attribute((field,max),18,19892).",
  "attribute((field,std),18,931).",
  "entity(field,root,19).",
  "attribute((field,name),19,education_and_health_services).",
  "attribute((field,type),19,number).",
  "attribute((field,unique),19,120).",
  "attribute((field,entropy),19,4787).",
  "attribute((field,min),19,17946).",
  "attribute((field,max),19,22318).",
  "attribute((field,std),19,1185).",
  "entity(field,root,20).",
  "attribute((field,name),20,leisure_and_hospitality).",
  "attribute((field,type),20,number).",
  "attribute((field,unique),20,116).",
  "attribute((field,entropy),20,4741).",
  "attribute((field,min),20,12927).",
  "attribute((field,max),20,15408).",
  "attribute((field,std),20,706).",
  "entity(field,root,21).",
  "attribute((field,name),21,other_services).",
  "attribute((field,type),21,number).",
  "attribute((field,unique),21,100).",
  "attribute((field,entropy),21,4556).",
  "attribute((field,min),21,5315).",
  "attribute((field,max),21,5652).",
  "attribute((field,std),21,91).",
  "entity(field,root,22).",
  "attribute((field,name),22,government).",
  "attribute((field,type),22,number).",
  "attribute((field,unique),22,110).",
  "attribute((field,entropy),22,4672).",
  "attribute((field,min),22,21810).",
  "attribute((field,max),22,22996).",
  "attribute((field,std),22,276).",
  "entity(field,root,23).",
  "attribute((field,name),23,nonfarm_change).",
  "attribute((field,type),23,number).",
  "attribute((field,unique),23,101).",
  "attribute((field,entropy),23,4564).",
  "attribute((field,min),23,-802).",
  "attribute((field,max),23,522).",
  "attribute((field,std),23,261)."
]
```

## Iterating the partial specification query

> Generating recommendations from a minimal input

We start by defining `input_spec_base` which is a list of facts including the data schema, a single view and a single mark.
This is the minimal set of facts that Draco needs to generate recommendations which can be rendered into charts.

We instantiate a `Draco` object, using the default knowledge base, and an `AltairRenderer` object which will be used to render the recommendations into Vega-Lite charts.

In [22]:
from draco.renderer import AltairRenderer

input_spec_base = data_schema_facts + [
    "entity(view,root,v0).",
    "entity(mark,v0,m0).",
]
d = drc.Draco()
renderer = AltairRenderer()

We can now use the `complete_spec` method of the `Draco` object to generate recommendations from incomplete specifications.
The function below is a reusable utility for this example, responsible for generating, rendering and displaying the recommendations.

In [23]:
def recommend_charts(
    spec: list[str], draco: drc.Draco, num: int = 5, labeler=lambda i: f"CHART {i+1}"
) -> dict[str, tuple[list[str], dict]]:
    # Dictionary to store the generated recommendations, keyed by chart name
    chart_specs = {}
    chart_specs['cost'] = []
    for i, model in enumerate(draco.complete_spec(spec, num)):
        chart_name = labeler(i)
        spec = drc.answer_set_to_dict(model.answer_set)
        chart_specs[chart_name] = drc.dict_to_facts(spec), spec
        chart_specs['cost'].append(model.cost)
        print(chart_name)
        print(f"COST: {model.cost}")
        chart = renderer.render(spec=spec, data=us_employment_data)
        # Adjust column-faceted chart size
        if (
            isinstance(chart, alt.FacetChart)
            and chart.facet.column is not alt.Undefined
        ):
            chart = chart.configure_view(continuousWidth=130, continuousHeight=130)
        display(chart)

    return chart_specs

We are using `input_spec_base` as the starting point for our exploration, that is, we are only specifying the data schema, and that we want the recommendations to have at least one view and one mark.

In [24]:
input_spec = input_spec_base
initial_recommendations = recommend_charts(spec=input_spec, draco=d)

CHART 1
COST: [3]


CHART 2
COST: [4]


CHART 3
COST: [4]


CHART 4
COST: [5]


CHART 5
COST: [5]


While the above recommendations are valid, they are not very diverse. We can also observe that the first two recommendations are represented by seemingly identical Vega-Lite specifications, however, they have different costs. We explore this behavior below, by inspecting the Draco specification of the first two charts.

In [25]:
chart_1_key, chart_2_key = "CHART 1", "CHART 2"
(_, chart_1), (_, chart_2) = (
    initial_recommendations[chart_1_key],
    initial_recommendations[chart_2_key],
)

md(f"**Draco Specification of {chart_1_key}**")
pprint(chart_1)

md(f"**Draco Specification of {chart_2_key}**")
pprint(chart_2)

**Draco Specification of CHART 1**

```json
{
  "number_rows": 120,
  "task": "summary",
  "field": [
    {
      "name": "month",
      "type": "string",
      "unique": 120,
      "entropy": 4787,
      "freq": 1
    },
    {
      "name": "nonfarm",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 129726,
      "max": 143093,
      "std": 3577
    },
    {
      "name": "private",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 107250,
      "max": 120993,
      "std": 3706
    },
    {
      "name": "goods_producing",
      "type": "number",
      "unique": 118,
      "entropy": 4764,
      "min": 17627,
      "max": 22631,
      "std": 1694
    },
    {
      "name": "service_providing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 111989,
      "max": 123356,
      "std": 2999
    },
    {
      "name": "private_service_providing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 89507,
      "max": 101256,
      "std": 3149
    },
    {
      "name": "mining_and_logging",
      "type": "number",
      "unique": 103,
      "entropy": 4582,
      "min": 656,
      "max": 904,
      "std": 74
    },
    {
      "name": "construction",
      "type": "number",
      "unique": 116,
      "entropy": 4741,
      "min": 5427,
      "max": 7726,
      "std": 811
    },
    {
      "name": "manufacturing",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 11453,
      "max": 14226,
      "std": 916
    },
    {
      "name": "durable_goods",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 6985,
      "max": 9028,
      "std": 654
    },
    {
      "name": "nondurable_goods",
      "type": "number",
      "unique": 96,
      "entropy": 4469,
      "min": 4434,
      "max": 5228,
      "std": 270
    },
    {
      "name": "trade_transportation_utilties",
      "type": "number",
      "unique": 115,
      "entropy": 4730,
      "min": 24475,
      "max": 27037,
      "std": 760
    },
    {
      "name": "wholesale_trade",
      "type": "number",
      "unique": 118,
      "entropy": 4760,
      "min": 5439,
      "max": 6041,
      "std": 182
    },
    {
      "name": "retail_trade",
      "type": "number",
      "unique": 119,
      "entropy": 4776,
      "min": 14326,
      "max": 15704,
      "std": 413
    },
    {
      "name": "transportation_and_warehousing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 4117,
      "max": 4950,
      "std": 196
    },
    {
      "name": "utilities",
      "type": "number",
      "unique": 86,
      "entropy": 4350,
      "min": 546,
      "max": 563,
      "std": 3
    },
    {
      "name": "information",
      "type": "number",
      "unique": 90,
      "entropy": 4392,
      "min": 2634,
      "max": 3055,
      "std": 143
    },
    {
      "name": "financial_activities",
      "type": "number",
      "unique": 113,
      "entropy": 4702,
      "min": 7676,
      "max": 8394,
      "std": 246
    },
    {
      "name": "professional_and_business_services",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 16392,
      "max": 19892,
      "std": 931
    },
    {
      "name": "education_and_health_services",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 17946,
      "max": 22318,
      "std": 1185
    },
    {
      "name": "leisure_and_hospitality",
      "type": "number",
      "unique": 116,
      "entropy": 4741,
      "min": 12927,
      "max": 15408,
      "std": 706
    },
    {
      "name": "other_services",
      "type": "number",
      "unique": 100,
      "entropy": 4556,
      "min": 5315,
      "max": 5652,
      "std": 91
    },
    {
      "name": "government",
      "type": "number",
      "unique": 110,
      "entropy": 4672,
      "min": 21810,
      "max": 22996,
      "std": 276
    },
    {
      "name": "nonfarm_change",
      "type": "number",
      "unique": 101,
      "entropy": 4564,
      "min": -802,
      "max": 522,
      "std": 261
    }
  ],
  "view": [
    {
      "coordinates": "cartesian",
      "mark": [
        {
          "type": "bar",
          "encoding": [
            {
              "channel": "x",
              "aggregate": "count"
            }
          ]
        }
      ],
      "scale": [
        {
          "type": "linear",
          "channel": "x",
          "zero": "true"
        }
      ]
    }
  ]
}
```

**Draco Specification of CHART 2**

```json
{
  "number_rows": 120,
  "task": "value",
  "field": [
    {
      "name": "month",
      "type": "string",
      "unique": 120,
      "entropy": 4787,
      "freq": 1
    },
    {
      "name": "nonfarm",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 129726,
      "max": 143093,
      "std": 3577
    },
    {
      "name": "private",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 107250,
      "max": 120993,
      "std": 3706
    },
    {
      "name": "goods_producing",
      "type": "number",
      "unique": 118,
      "entropy": 4764,
      "min": 17627,
      "max": 22631,
      "std": 1694
    },
    {
      "name": "service_providing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 111989,
      "max": 123356,
      "std": 2999
    },
    {
      "name": "private_service_providing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 89507,
      "max": 101256,
      "std": 3149
    },
    {
      "name": "mining_and_logging",
      "type": "number",
      "unique": 103,
      "entropy": 4582,
      "min": 656,
      "max": 904,
      "std": 74
    },
    {
      "name": "construction",
      "type": "number",
      "unique": 116,
      "entropy": 4741,
      "min": 5427,
      "max": 7726,
      "std": 811
    },
    {
      "name": "manufacturing",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 11453,
      "max": 14226,
      "std": 916
    },
    {
      "name": "durable_goods",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 6985,
      "max": 9028,
      "std": 654
    },
    {
      "name": "nondurable_goods",
      "type": "number",
      "unique": 96,
      "entropy": 4469,
      "min": 4434,
      "max": 5228,
      "std": 270
    },
    {
      "name": "trade_transportation_utilties",
      "type": "number",
      "unique": 115,
      "entropy": 4730,
      "min": 24475,
      "max": 27037,
      "std": 760
    },
    {
      "name": "wholesale_trade",
      "type": "number",
      "unique": 118,
      "entropy": 4760,
      "min": 5439,
      "max": 6041,
      "std": 182
    },
    {
      "name": "retail_trade",
      "type": "number",
      "unique": 119,
      "entropy": 4776,
      "min": 14326,
      "max": 15704,
      "std": 413
    },
    {
      "name": "transportation_and_warehousing",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 4117,
      "max": 4950,
      "std": 196
    },
    {
      "name": "utilities",
      "type": "number",
      "unique": 86,
      "entropy": 4350,
      "min": 546,
      "max": 563,
      "std": 3
    },
    {
      "name": "information",
      "type": "number",
      "unique": 90,
      "entropy": 4392,
      "min": 2634,
      "max": 3055,
      "std": 143
    },
    {
      "name": "financial_activities",
      "type": "number",
      "unique": 113,
      "entropy": 4702,
      "min": 7676,
      "max": 8394,
      "std": 246
    },
    {
      "name": "professional_and_business_services",
      "type": "number",
      "unique": 117,
      "entropy": 4753,
      "min": 16392,
      "max": 19892,
      "std": 931
    },
    {
      "name": "education_and_health_services",
      "type": "number",
      "unique": 120,
      "entropy": 4787,
      "min": 17946,
      "max": 22318,
      "std": 1185
    },
    {
      "name": "leisure_and_hospitality",
      "type": "number",
      "unique": 116,
      "entropy": 4741,
      "min": 12927,
      "max": 15408,
      "std": 706
    },
    {
      "name": "other_services",
      "type": "number",
      "unique": 100,
      "entropy": 4556,
      "min": 5315,
      "max": 5652,
      "std": 91
    },
    {
      "name": "government",
      "type": "number",
      "unique": 110,
      "entropy": 4672,
      "min": 21810,
      "max": 22996,
      "std": 276
    },
    {
      "name": "nonfarm_change",
      "type": "number",
      "unique": 101,
      "entropy": 4564,
      "min": -802,
      "max": 522,
      "std": 261
    }
  ],
  "view": [
    {
      "coordinates": "cartesian",
      "mark": [
        {
          "type": "bar",
          "encoding": [
            {
              "channel": "x",
              "aggregate": "count"
            }
          ]
        }
      ],
      "scale": [
        {
          "type": "linear",
          "channel": "x",
          "zero": "true"
        }
      ]
    }
  ]
}
```

In [26]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [27]:
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert column extraction algorithm. "
            "You will chose 2 numerical columns that align with graphs and data types. "
            "You will chose most useful data to pick from dataset for visualization. "
            "Start your output with those two names"
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

In [28]:
from langchain_openai import OpenAI

llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

runnable = prompt | llm

In [29]:
columns = runnable.invoke( [
    us_employment_data.head()
])
print(columns)
columns = columns.strip().split(", ")
print(columns)



month, nonfarm
['month', 'nonfarm']


Taking a good look at the specifications above, we can see that they only differ by their `"task"` attribute value. `CHART 1` has `"task": "summary"`, while `CHART 2` has `"task": "value"`. Thanks to the constraints in the default Draco knowledge base, the logical solver assigns slightly different costs to the two specifications. However, since the two charts use the same fields, scales, marks and encodings, the actual Vega-Lite specifications of the different Draco specifications are identical.

We can extend the input specification to better specify the design space we want to see recommendations for, to get more diverse results.
Let's say, we want the fields `date` and `temp_max` of the weather dataset to be encoded in the charts.
Also, we specify that we want the chart to be a faceted chart.
Note that we are not specifying the mark type, the encoding channels for the fields nor for the facet. We leave this to Draco to decide, based on its underlying knowledge base.

In [30]:
input_spec = input_spec_base + [
    # We want to encode the `date` field
    "entity(encoding,m0,e0).",
    f"attribute((encoding,field),e0,{columns[0]}).",
    # We want to encode the `temp_max` field
    "entity(encoding,m0,e1).",
    f"attribute((encoding,field),e1,{columns[1]})."
]
recommendations = recommend_charts(spec=input_spec, draco=d, num=5)

CHART 1
COST: [28]


CHART 2
COST: [29]


CHART 3
COST: [29]


CHART 4
COST: [29]


CHART 5
COST: [29]


In [31]:
def recommend_chart_cost(
    spec: list[str], draco: drc.Draco) -> int:
    # Dictionary to store the generated recommendations, keyed by chart name
    for i, model in enumerate(draco.complete_spec(spec)):
        return model.cost[0]

In [32]:
def find_best_chart(data, input_spec_base):
    """
    Compare all possible pairs of columns and find the best chart.
    :param data: The dataset with columns to evaluate.
    :param input_spec_base: The base specification to build chart specifications.
    :return: The best column pair and the associated score.
    """
    best_score = -1
    best_pair = None
    
    # Iterate through all pairs of columns in the dataset
    for i in range(len(data.columns)):
        for j in range(i + 1, len(data.columns)):
            # Generate chart spec for this pair of columns
            score = recommend_chart_cost(input_spec,drc.Draco())
            
            # Compare the score with the current best score
            if score > best_score:
                best_score = score
                best_pair = (data.columns[i], data.columns[j])
    
    # Return the best column pair and its score
    return best_pair, best_score

# Example usage (replace `us_employment_data` and `input_spec_base` with actual data):
best_pair, best_score = find_best_chart(us_employment_data, input_spec_base)

print(f"Best column pair: {best_pair}")
print(f"Best chart score: {best_score}")

Best column pair: ('month', 'nonfarm')
Best chart score: 28
