<a href="https://colab.research.google.com/github/ozzafar/NL2LDX/blob/main/NL2LDX_Notebook_(Cross_Validation).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
import pandas as pd
import os
import multiprocessing
from exploration_plan_generator.models.nl2pandas2ldx import NL2Pd2LDX
from exploration_plan_generator.clients.test_client import TestClient
import json
from typing import List
from nl2ldx_benchmark.evaulation.benchmark_model_api import BenchmarkModelAPI

## Configurations

In [None]:
# Keys used inside every dataset configuration dictionary.
FILENAME = 'dataset_filename'
DOMAIN = 'domain'
NUMERIC_ATTRIBUTES = 'numeric_attributes'
ATTRIBUTES = 'attributes'
EXCLUDE_ATTRIBUTES = "exclude_attributes"
SAMPLE = "sample"

# One configuration per benchmark dataset, built from (file name, domain,
# numeric attributes). ATTRIBUTES and SAMPLE start empty and are fetched
# dynamically once the dataset file is read; every entry gets its own lists.
dataset_configs = [
    {
        FILENAME: filename,
        DOMAIN: domain_name,
        NUMERIC_ATTRIBUTES: numeric_columns,
        ATTRIBUTES: [],  # fetched dynamically
        EXCLUDE_ATTRIBUTES: [],
        SAMPLE: [],  # fetched dynamically
    }
    for filename, domain_name, numeric_columns in (
        ("flights.tsv", "flights", ['scheduled_trip_time']),
        ("netflix.tsv", "shows", ['duration']),
        (
            "play_store.tsv",
            "apps",
            ['rating', 'reviews', 'app_size_kb', 'install', 'price', 'min_android_ver'],
        ),
    )
]

# Aggregation functions and the wording used for each of them.
AGG_FUNCTIONS = ['count', 'mean', 'max', 'min']
AGGS_TO_STR = {
    "count": "number",
    "mean": "mean",
    "max": "maximum",
    "min": "minimum",
}

# Filter operations, split by whether the attribute is numeric, and the
# wording used for each of them.
NON_NUMERICAL_OPERATIONS = ["eq", "ne", "contains"]
NUMERICAL_OPERATIONS = ["eq", "ne", "gt", "lt"]
OPERATIONS_TO_STR = {
    "eq": "equal",
    "ne": "not equal",
    "contains": "contains",
    "gt": "greater than",
    "lt": "lower than",
}

# Placeholders
DOMAIN_PLACEHOLDER = "<domain>"
ATTRIBUTE_PLACEHOLDER = "<attribute>"
VALUE_PLACEHOLDER = "<value>"
VALUE1_PLACEHOLDER = "<value1>"
VALUE2_PLACEHOLDER = "<value2>"
FILTER_OPERATION_PLACEHOLDER = "<filter_operation>"
AGG_KEY_PLACEHOLDER = "<agg_key>"
AGG_COLUMN_PLACEHOLDER = "<agg_column>"
AGG_FUNC_PLACEHOLDER = "<agg_func>"

# Model identifiers; the export cell matches them against result names.
GPT35 = "gpt-3.5-turbo"
GPT4 = "gpt-4"

In [None]:
# Number of times manipulated queries are created from the templates.
# NOTE(review): N is not referenced anywhere else in the visible notebook.
N = 10
test_dataset = []

# Fill in the dynamically fetched fields of every dataset configuration.
for dataset_config in dataset_configs:
    dataset_path = "../../datasets/" + dataset_config[FILENAME]
    df = pd.read_csv(dataset_path, sep='\t', header=0)

    # Attributes are all columns after the first one, minus the excluded ones.
    excluded = dataset_config[EXCLUDE_ATTRIBUTES]
    candidate_columns = list(df.columns)[1:]
    dataset_config[ATTRIBUTES] = [
        column for column in candidate_columns if column not in excluded
    ]

    # The sample is the first rows of the table (without the 'Unnamed: 0'
    # column), rendered as left-justified text with no index.
    preview = df.drop(columns=['Unnamed: 0']).head()
    dataset_config[SAMPLE] = preview.to_string(index=False, justify='left')

## Run Experiments

In [None]:
# Experiment Config
OUT_DOMAIN = False
domain = "out_domain" if OUT_DOMAIN else "in_domain"
models: List[BenchmarkModelAPI] = [NL2Pd2LDX(TestClient(model="Test"))] # the models which would be evaluated, should implement the BenchmarkModelAPI interface

In [None]:
# Load the benchmark dataset from the JSON file one directory above the notebook.
test_dataset_name = "NL2LDX-benchmark"
print(test_dataset_name)
# Read explicitly as UTF-8: without it open() falls back to the locale's
# default encoding, which can mis-decode non-ASCII text on some platforms.
with open(f'../{test_dataset_name}.json', encoding="utf-8") as f:
    file_contents = f.read()
# The experiment cell unpacks every entry as (id, task, ldx_expected, dataset).
test_dataset = json.loads(file_contents)

In [None]:
# Run every model over the whole benchmark and store its generated LDX.
# number_of_errors counts the tasks whose generation raised an exception,
# accumulated over all evaluated models.
number_of_errors = 0
with multiprocessing.Pool(processes=2) as pool:

    for model in models:

        # failed is kept in memory only: the evaluation cell reads every *.json
        # file in the results folder as (id, task, expected, generated) rows.
        results_all, results_loo, failed = [], [], []

        # The task id is bound to task_id so the builtin id() is not shadowed.
        for i, (task_id, task, ldx_expected, dataset) in enumerate(test_dataset):

            print(f"iter {i} of {len(test_dataset)}")

            # Configuration of the dataset this task refers to
            # (StopIteration propagates if no configuration matches).
            config = next(filter(lambda conf: conf[FILENAME] == dataset, dataset_configs))

            try:
                # Two generations per task, run in parallel. They differ only in
                # the fifth argument: empty for the "all" run, [task id] for the
                # leave-one-task-out run.
                # NOTE(review): presumably the ids of examples withheld from the
                # model; confirm against the nl2ldx implementation.
                input_values = [
                    (dataset, config[ATTRIBUTES], config[SAMPLE], task, [], OUT_DOMAIN),
                    (dataset, config[ATTRIBUTES], config[SAMPLE], task, [int(task_id)], OUT_DOMAIN),
                ]

                results = pool.starmap(model.nl2ldx, input_values)

                # All
                ldx_all_generated = results[0]
                print(f"{str(model)}_ldx_all_generated:\n{ldx_all_generated}")
                results_all.append((task_id, task, ldx_expected, ldx_all_generated))

                # Leave-one-task-out
                ldx_loo_generated = results[1]
                print(f"{str(model)}_ldx_loo_generated:\n{ldx_loo_generated}")
                results_loo.append((task_id, task, ldx_expected, ldx_loo_generated))

            except Exception as e:
                # Best-effort run: count and record the failed task, then move on.
                number_of_errors += 1
                failed.append((task_id, task, ldx_expected, dataset))
                print(f"new exception: {e}")

        print(f"****** number_of_errors is: {number_of_errors}")

        # save results
        target_path = f"results/{domain}_generalization/{str(model)}"
        os.makedirs(target_path, exist_ok=True)
        with open(f"{target_path}/all.json", "w") as fp:
            json.dump(results_all, fp)
        with open(f"{target_path}/leave_one_out.json", "w") as fp:
            json.dump(results_loo, fp)

## Evaluation

In [None]:
from nl2ldx_benchmark.evaulation.metrics import *
from pathlib import Path
import numpy as np

# Compute the average metrics of every stored results file.
# model is a suffix filter on the results folder name; "" matches every folder.
model = ""
# Collected as (model_name, metrics) pairs, one per results file.
all_metrics = []
for p in Path('.').glob(f'results/{domain}_generalization/*{model}/*.json'):

    # p is results/<domain>_generalization/<model folder>/<file>.json, so
    # parts[2] is the model folder and stem is the results file name.
    model_name = f"{p.parts[2]}-{p.stem}"
    results = json.loads(p.read_text())

    # A file without any result has nothing to average; skipping it avoids
    # dividing by a count that was never incremented.
    if not results:
        print(f"skipping {p}: no results")
        continue

    metrics = Metrics()
    # The task id is bound to task_id so the builtin id() is not shadowed.
    for task_id, text, ldx_expected, ldx_generated in results:
        # Normalise both sides before comparing: drop double spaces, lowercase,
        # and additionally strip quotes and surrounding newlines from the output.
        ldx_generated = ldx_generated.replace("  ", "").replace("\"", "").replace("\'", "").strip('\n').lower()
        ldx_expected = ldx_expected.replace("  ", "").lower()
        nodes_jaccard_score = nodes_jaccard_distance(ldx_expected, ldx_generated)
        nodes_levenshtein_score = nodes_levenshtein_distance(ldx_expected, ldx_generated)
        # NOTE(review): computed but never accumulated below; confirm whether
        # Metrics is meant to track it.
        nodes_number_score = nodes_number_distance(ldx_expected, ldx_generated)
        structure_levenshtein_score = structure_levenshtein_distance(ldx_expected, ldx_generated)
        # The second returned value is not used here.
        xted_score, _ignored = normalized_xted(ldx_expected, ldx_generated)

        metrics.count += 1
        metrics.nodes_jaccard_score += nodes_jaccard_score
        metrics.nodes_levenshtein_score += nodes_levenshtein_score
        metrics.structure_levenshtein_score += structure_levenshtein_score
        metrics.xted_score += xted_score

    # average results
    metrics.nodes_jaccard_score /= metrics.count
    metrics.nodes_levenshtein_score /= metrics.count
    metrics.structure_levenshtein_score /= metrics.count
    metrics.xted_score /= metrics.count
    all_metrics.append((model_name, metrics))

## Export Tables

In [None]:
# Build and print one table with the REACT and F1 scores of every result set.
print(f"{OUT_DOMAIN=}")

# Entries of all_metrics are (name, metrics) pairs whose name is
# "<model folder>-<results file stem>". Matching on the "-all" and
# "-leave_one_out" suffix keeps a model whose own name contains "all"
# from being listed under the wrong setting.
gpt35_all_results = [entry for entry in all_metrics if GPT35 in entry[0] and entry[0].endswith("-all")]
gpt35_loo_results = [entry for entry in all_metrics if GPT35 in entry[0] and entry[0].endswith("-leave_one_out")]
gpt4_all_results = [entry for entry in all_metrics if GPT4 in entry[0] and entry[0].endswith("-all")]
gpt4_loo_results = [entry for entry in all_metrics if GPT4 in entry[0] and entry[0].endswith("-leave_one_out")]

configs = [
    ('all', [(GPT35, gpt35_all_results), (GPT4, gpt4_all_results)]),
    ('leave_one_out', [(GPT35, gpt35_loo_results), (GPT4, gpt4_loo_results)]),
]

# Named tables rather than all, so the builtin all() stays usable.
tables = []
for _setting, results_per_model in configs:

    for _model_id, results in results_per_model:
        y = [entry[1] for entry in results]
        y_react = np.array([strategy.xted_score for strategy in y])
        y_structure = np.array([strategy.structure_levenshtein_score for strategy in y])
        y_nodes = np.array([strategy.nodes_levenshtein_score for strategy in y])

        # Harmonic mean of the structure and nodes scores. Where both are 0 the
        # denominator is 0, so the result is defined as 0 instead of NaN.
        denominator = y_structure + y_nodes
        y_unified = np.divide(
            2 * (y_structure * y_nodes),
            denominator,
            out=np.zeros_like(denominator, dtype=float),
            where=denominator != 0,
        )

        # One row per result set: name, REACT score, unified score.
        result = np.transpose(np.vstack((
            [entry[0] for entry in results],
            np.round(y_react, 2),
            np.round(y_unified, 2),
        )))

        tables.append(result)

vertical_stack = np.vstack(tables)

df = pd.DataFrame(vertical_stack)
df.columns = ['Model', 'REACT', 'F1 Score']

print(df.to_string(index=False, justify='right'))
# df.to_excel(f"{OUT_DOMAIN=}.xlsx",index=False)