In [4]:
import os
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
import torch
import warnings

warnings.filterwarnings('ignore', message='MatMul8bitLt.*')
warnings.filterwarnings('ignore', message='.*use_reentrant.*')
warnings.filterwarnings('ignore', message='.*torch.utils.checkpoint.*')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [5]:
SYSTEM_PROMPT = """
You are an expert PostgreSQL query optimizer specialized in generating pg_hint_plan commands. Your task is to analyze SQL queries along with their database statistics and generate optimal hint commands to improve query execution plans.

pg_hint_plan hints include:
- Scan method hints: SeqScan, IndexScan, BitmapScan, TidScan
- Join method hints: NestLoop, HashJoin, MergeJoin
- Join order hints: Leading(...)
- Row number correction hints: Rows(...)
Generate hints that will result in the most efficient execution plan based on the provided statistics."""

USER_PROMPT_TEMPLATE = """Analyze the following PostgreSQL query and generate optimal pg_hint_plan commands.

**Query:**
{query}
**Table Cardinalities:**
{card_tb}

**Column Statistics (NDV):**
{ndv}

**Most Frequent Values:**
{main_value}

**Value Ranges (Min/Max):**
{min_max}
Generate the optimal pg_hint_plan hints for this query."""

In [6]:
def format_ndv_stats(ndv_string):
    lines = ndv_string.strip().split('\n')
    formatted = [f"  - {x.replace(' : ', ': ')}" for x in lines if ' : ' in x]
    return '\n'.join(formatted)

def format_main_values(main_value_string):
    lines = main_value_string.strip().split('\n')
    formatted = [f"  - {x.replace(' : ', ': ')}" for x in lines if ' : ' in x]
    return '\n'.join(formatted)

def format_min_max(min_max_string):
    lines = min_max_string.strip().split('\n')
    formatted = [f"  - {x.replace(' : ', ': ')}" for x in lines if ' : ' in x]
    return '\n'.join(formatted)

def format_training_data(entry):
    user_prompt = USER_PROMPT_TEMPLATE.format(
        query=entry['query'],
        card_tb=entry['Card_Tb'],
        ndv=format_ndv_stats(entry['NDV']),
        main_value=format_main_values(entry['Main_Value']),
        min_max=format_min_max(entry['Min_Max'])
    )
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT.strip()},
            {"role": "user", "content": user_prompt.strip()},
            {"role": "assistant", "content": entry['best_hints'].strip()}
        ]
    }

def prepare_training_dataset(json_path):
    with open(json_path, 'r') as f:
        raw_data = json.load(f)
    formatted_data = [format_training_data(e) for e in raw_data]
    dataset = Dataset.from_list(formatted_data)
    return dataset

In [7]:
def split_dataset(dataset, test_size=0.1, seed=42):
    return dataset.train_test_split(test_size=test_size, seed=seed)

In [8]:
def get_model_and_tokenizer(model_name):
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.float16,
        bnb_8bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    #tokenizer.model_max_length = 4096
    with open("chat_template.jinja", "r", encoding="utf-8") as f:
        chat_template = f.read()
    tokenizer.chat_template = chat_template
    print(tokenizer.chat_template)
    
    return model, tokenizer

def get_lora_config():
    return LoraConfig(
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type="CAUSAL_LM"
    )

In [9]:
def get_training_args():
    return SFTConfig(
        output_dir="./qwen-pg-hint-optimizer",
        num_train_epochs=3,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=2e-4,
        warmup_ratio=0.05,
        logging_steps=1,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fp16=False,
        optim="paged_adamw_8bit",
        max_grad_norm=0.3,
        report_to="tensorboard",
        assistant_only_loss=True,
        packing=False,
    )

In [10]:
'''from transformers import TrainerCallback
class DebugMaskCallback(TrainerCallback):
    def on_train_batch_begin(self, args, state, control, inputs=None, **kwargs):
        masks = inputs.get("assistant_masks")
        if masks is not None:
            print(f"Batch {state.global_step} assistant_masks:", masks[0].tolist())'''

In [11]:
from transformers import DataCollatorForSeq2Seq
def main(json_path, model_name="Qwen/Qwen2.5-3B-Instruct"):

    dataset = prepare_training_dataset(json_path)
    train_val = split_dataset(dataset)
    train_ds, val_ds = train_val['train'], train_val['test']

    print(f"Training samples: {len(train_ds)}")
    print(f"Validation samples: {len(val_ds)}")
    print(f"Example training data:")

    model, tokenizer = get_model_and_tokenizer(model_name)
    
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    lora_config = get_lora_config()
    training_args = get_training_args()
    
    '''data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding="longest",     
        max_length=None
    )'''

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        peft_config=lora_config,
        processing_class=tokenizer,
        #callbacks=[DebugMaskCallback()]
    )

    print("Starting training")
    trainer.train()
    trainer.save_model("./final_model")
    tokenizer.save_pretrained("./final_model")
    print("Training complete")

In [18]:
main('all_queries_statistics.json')

Training samples: 101
Validation samples: 12
Example training data:




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:43<00:43, 43.93s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [01:09<00:00, 34.78s/it][A[A



{# — Последовательно рендерим user и assistant сообщения #}
{% for message in messages %}
  {% if message.role == "user" %}
<|im_start|>user
{{ message.content }}<|im_end|>
  {% elif message.role == "assistant" %}
<|im_start|>assistant
{% generation %}
{{ message.content }}
{% endgeneration %}<|im_end|>
  {% endif %}
{% endfor %}

{# — Гарантируем открытый ассистентский маркер на конце всегда #}
<|im_start|>assistant





Tokenizing train dataset:   0%|          | 0/101 [00:00<?, ? examples/s][A[A

Tokenizing train dataset:  16%|█▌        | 16/101 [00:00<00:00, 144.36 examples/s][A[A

Tokenizing train dataset:  32%|███▏      | 32/101 [00:00<00:00, 149.35 examples/s][A[A

Tokenizing train dataset:  49%|████▊     | 49/101 [00:00<00:00, 153.85 examples/s][A[A

Tokenizing train dataset:  71%|███████▏  | 72/101 [00:00<00:00, 148.12 examples/s][A[A

Tokenizing train dataset: 100%|██████████| 101/101 [00:00<00:00, 130.38 examples/s][A[A


Truncating train dataset: 100%|██████████| 101/101 [00:00<00:00, 8820.92 examples/s]


Tokenizing eval dataset:   0%|          | 0/12 [00:00<?, ? examples/s][A[A

Tokenizing eval dataset: 100%|██████████| 12/12 [00:00<00:00, 112.28 examples/s][A[A


Truncating eval dataset: 100%|██████████| 12/12 [00:00<00:00, 2002.05 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation c

Starting training...


 10%|▉         | 2/21 [15:07<2:23:37, 453.55s/it]

  0%|          | 0/21 [00:00<?, ?it/s][A
  5%|▍         | 1/21 [00:42<14:15, 42.79s/it][A
                                              
  5%|▍         | 1/21 [03:16<14:04, 42.22s/it][A

{'loss': 2.992, 'grad_norm': 13.2738618850708, 'learning_rate': 0.0, 'entropy': 0.8887458518147469, 'num_tokens': 16384.0, 'mean_token_accuracy': 0.0703125, 'epoch': 0.16}



 10%|▉         | 2/21 [01:25<13:29, 42.62s/it][A
                                              
  5%|▍         | 1/21 [03:58<14:04, 42.22s/it][A

{'loss': 2.1673, 'grad_norm': 9.491329193115234, 'learning_rate': 0.0001, 'entropy': 0.9523928053677082, 'num_tokens': 32757.0, 'mean_token_accuracy': 0.04438405856490135, 'epoch': 0.32}



 14%|█▍        | 3/21 [02:07<12:44, 42.49s/it][A
                                              
  5%|▍         | 1/21 [04:40<14:04, 42.22s/it][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.0002, 'entropy': 0.9136723764240742, 'num_tokens': 49141.0, 'mean_token_accuracy': 0.0, 'epoch': 0.48}



 19%|█▉        | 4/21 [02:49<11:58, 42.26s/it][A
                                              
  5%|▍         | 1/21 [05:22<14:04, 42.22s/it][A

{'loss': 1.0384, 'grad_norm': 8.501145362854004, 'learning_rate': 0.00018947368421052632, 'entropy': 0.9327629059553146, 'num_tokens': 65525.0, 'mean_token_accuracy': 0.048076923936605453, 'epoch': 0.63}



 24%|██▍       | 5/21 [03:31<11:13, 42.12s/it][A
                                              
  5%|▍         | 1/21 [06:04<14:04, 42.22s/it][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.00017894736842105264, 'entropy': 0.9802480526268482, 'num_tokens': 81909.0, 'mean_token_accuracy': 0.0, 'epoch': 0.79}



 29%|██▊       | 6/21 [04:13<10:30, 42.05s/it][A
                                              
  5%|▍         | 1/21 [06:46<14:04, 42.22s/it][A

{'loss': 0.7005, 'grad_norm': 5.804986000061035, 'learning_rate': 0.00016842105263157895, 'entropy': 0.9683047607541084, 'num_tokens': 98281.0, 'mean_token_accuracy': 0.10947651043534279, 'epoch': 0.95}



 33%|███▎      | 7/21 [04:26<07:36, 32.61s/it][A
                                              
  5%|▍         | 1/21 [06:59<14:04, 42.22s/it][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.00015789473684210527, 'entropy': 1.0326840162277222, 'num_tokens': 103401.0, 'mean_token_accuracy': 0.0, 'epoch': 1.0}



 38%|███▊      | 8/21 [05:08<07:44, 35.70s/it][A
                                              
  5%|▍         | 1/21 [07:42<14:04, 42.22s/it][A

{'loss': 0.4759, 'grad_norm': 2.258043050765991, 'learning_rate': 0.00014736842105263158, 'entropy': 1.0143660753965378, 'num_tokens': 119773.0, 'mean_token_accuracy': 0.057065218687057495, 'epoch': 1.16}



 43%|████▎     | 9/21 [05:51<07:33, 37.81s/it][A
                                              
  5%|▍         | 1/21 [08:24<14:04, 42.22s/it][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.0001368421052631579, 'entropy': 0.9706712551414967, 'num_tokens': 136157.0, 'mean_token_accuracy': 0.0, 'epoch': 1.32}



 48%|████▊     | 10/21 [06:33<07:09, 39.07s/it][A
                                               
  5%|▍         | 1/21 [09:06<14:04, 42.22s/it]][A

{'loss': 0.1513, 'grad_norm': 2.266556978225708, 'learning_rate': 0.0001263157894736842, 'entropy': 1.012984674423933, 'num_tokens': 152541.0, 'mean_token_accuracy': 0.23736806213855743, 'epoch': 1.48}



 52%|█████▏    | 11/21 [07:15<06:39, 39.97s/it][A
                                               
  5%|▍         | 1/21 [09:48<14:04, 42.22s/it]][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.00011578947368421053, 'entropy': 1.0012114867568016, 'num_tokens': 168925.0, 'mean_token_accuracy': 0.0, 'epoch': 1.63}



 57%|█████▋    | 12/21 [07:57<06:05, 40.56s/it][A
                                               
  5%|▍         | 1/21 [10:30<14:04, 42.22s/it]][A

{'loss': 0.2194, 'grad_norm': 2.631427049636841, 'learning_rate': 0.00010526315789473685, 'entropy': 0.940270908176899, 'num_tokens': 185298.0, 'mean_token_accuracy': 0.05978260934352875, 'epoch': 1.79}



 62%|██████▏   | 13/21 [08:39<05:27, 41.00s/it][A
                                               
  5%|▍         | 1/21 [11:12<14:04, 42.22s/it]][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 9.473684210526316e-05, 'entropy': 0.9713254682719707, 'num_tokens': 201682.0, 'mean_token_accuracy': 0.0, 'epoch': 1.95}



 67%|██████▋   | 14/21 [08:52<03:48, 32.63s/it][A
                                               
  5%|▍         | 1/21 [11:25<14:04, 42.22s/it]][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 8.421052631578948e-05, 'entropy': 1.0138582944869996, 'num_tokens': 206802.0, 'mean_token_accuracy': 0.0, 'epoch': 2.0}



 71%|███████▏  | 15/21 [09:34<03:33, 35.56s/it][A
                                               
  5%|▍         | 1/21 [12:08<14:04, 42.22s/it]][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 7.368421052631579e-05, 'entropy': 0.9687216058373451, 'num_tokens': 223186.0, 'mean_token_accuracy': 0.0, 'epoch': 2.16}



 76%|███████▌  | 16/21 [10:16<03:07, 37.55s/it][A
                                               
  5%|▍         | 1/21 [12:50<14:04, 42.22s/it]][A

{'loss': 0.1974, 'grad_norm': 1.998416781425476, 'learning_rate': 6.31578947368421e-05, 'entropy': 1.0288858115673065, 'num_tokens': 239570.0, 'mean_token_accuracy': 0.18333333358168602, 'epoch': 2.32}



 81%|████████  | 17/21 [10:58<02:35, 38.86s/it][A
                                               
  5%|▍         | 1/21 [13:32<14:04, 42.22s/it]][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 5.2631578947368424e-05, 'entropy': 0.9828330390155315, 'num_tokens': 255954.0, 'mean_token_accuracy': 0.0, 'epoch': 2.48}



 86%|████████▌ | 18/21 [11:40<01:59, 39.79s/it][A
                                               
  5%|▍         | 1/21 [14:14<14:04, 42.22s/it]][A

{'loss': 0.2336, 'grad_norm': 3.5591208934783936, 'learning_rate': 4.210526315789474e-05, 'entropy': 0.9518195502460003, 'num_tokens': 272326.0, 'mean_token_accuracy': 0.05797101557254791, 'epoch': 2.63}



 90%|█████████ | 19/21 [12:22<01:20, 40.43s/it][A
                                               
  5%|▍         | 1/21 [14:55<14:04, 42.22s/it]][A

{'loss': 0.0723, 'grad_norm': 1.5242300033569336, 'learning_rate': 3.157894736842105e-05, 'entropy': 0.9971456155180931, 'num_tokens': 288699.0, 'mean_token_accuracy': 0.06159420311450958, 'epoch': 2.79}



 95%|█████████▌| 20/21 [13:04<00:40, 40.96s/it][A
                                               
  5%|▍         | 1/21 [15:38<14:04, 42.22s/it]][A

{'loss': 0.0058, 'grad_norm': 0.16505832970142365, 'learning_rate': 2.105263157894737e-05, 'entropy': 0.958176739513874, 'num_tokens': 305083.0, 'mean_token_accuracy': 0.0625, 'epoch': 2.95}



100%|██████████| 21/21 [13:18<00:00, 32.62s/it][A
                                               
  5%|▍         | 1/21 [15:51<14:04, 42.22s/it]][A

{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 1.0526315789473684e-05, 'entropy': 1.050365114212036, 'num_tokens': 310203.0, 'mean_token_accuracy': 0.0, 'epoch': 3.0}


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: caa3216c-c793-4f4c-a124-1eab8fd2366e)')' thrown while requesting HEAD https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/resolve/main/config.json
Retrying in 1s [Retry 1/5].

                                               
100%|██████████| 21/21 [14:06<00:00, 40.33s/it][A

{'train_runtime': 846.9306, 'train_samples_per_second': 0.358, 'train_steps_per_second': 0.025, 'train_loss': 0.39304245658041465, 'epoch': 3.0}
Saving model...





Training complete!


In [16]:
with open("chat_template.jinja", "r", encoding="utf-8") as f:
    chat_template = f.read()
tokenizer.chat_template = chat_template
print(tokenizer.chat_template)

{% if messages[0].role == "system" %}
<|im_start|>system
{{ messages[0].content }}<|im_end|>
{% else %}
<|im_start|>system
You are a helpful assistant.<|im_end|>
{% endif %}


{% for message in messages %}
  {% if message.role == "user" %}
<|im_start|>user
{{ message.content }}<|im_end|>
  {% elif message.role == "assistant" %}
<|im_start|>assistant
{% generation %}
{{ message.content }}
{% endgeneration %}<|im_end|>
  {% endif %}
{% endfor %}


<|im_start|>assistant



In [17]:
tokenizer.model_max_length

131072

In [19]:
example = {
    "messages": [
        prepare_training_dataset('all_queries_statistics.json')[0]['messages'][1],
        prepare_training_dataset('all_queries_statistics.json')[0]['messages'][2]
    ]
}
out = tokenizer.apply_chat_template(
    example["messages"],
    tokenize=True,
    return_dict=True,
    return_assistant_tokens_mask=True
)
print("assistant_masks:", out['assistant_masks'])

assistant_masks: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [21]:
def generate_hints_for_query(query, statistics, model, tokenizer):
    user_prompt = USER_PROMPT_TEMPLATE.format(
        query=query,
        card_tb=statistics["Card_Tb"],
        ndv=format_ndv_stats(statistics["NDV"]),
        main_value=format_main_values(statistics["Main_Value"]),
        min_max=format_min_max(statistics["Min_Max"])
    )
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT.strip()},
        {"role": "user", "content": user_prompt.strip()},
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [2]:
from peft import AutoPeftModelForCausalLM
model = AutoPeftModelForCausalLM.from_pretrained("./final_model")
tokenizer = AutoTokenizer.from_pretrained("./final_model")

Loading checkpoint shards: 100%|██████████| 2/2 [00:48<00:00, 24.20s/it]


In [25]:
test_statistics = {"Card_Tb": "rt : 1 (12)\nci : 2746 (63439712)\nt : 1 (3298978)\nmc : 6 (4954445)\ncn : 1 (362131)\nchn : 1 (4314880)\nct : 4 (4)",
    "NDV": "char_name.name_pcode_nf : 12420.0\nchar_name.id : -1.0\nchar_name.name : -1.0\nchar_name.imdb_index : 0.0\nchar_name.imdb_id : 0.0\nchar_name.md5sum : -1.0\nchar_name.surname_pcode : 4954.0\ncast_info.person_role_id : 20143.0\ncast_info.person_id : 65346.0\ncast_info.movie_id : 1719595.0\ncast_info.note : 4263.0\ncast_info.id : -1.0\ncast_info.nr_order : 145.0\ncast_info.role_id : 11.0\ncompany_name.name_pcode_sf : 11132.0\ncompany_name.md5sum : -1.0\ncompany_name.id : -1.0\ncompany_name.imdb_id : 0.0\ncompany_name.country_code : 173.0\ncompany_name.name : -0.87158513\ncompany_name.name_pcode_nf : 11834.0\ncompany_type.id : -1.0\ncompany_type.kind : -1.0\nmovie_companies.company_id : 20333.0\nmovie_companies.note : 6353.0\nmovie_companies.movie_id : -0.16448805\nmovie_companies.id : -1.0\nmovie_companies.company_type_id : 2.0\nrole_type.id : -1.0\nrole_type.role : -1.0\ntitle.imdb_index : 12.0\ntitle.kind_id : 6.0\ntitle.imdb_id : 0.0\ntitle.id : -1.0\ntitle.title : 78695.0\ntitle.episode_nr : 1917.0\ntitle.episode_of_id : 20982.0\ntitle.series_years : 271.0\ntitle.md5sum : -1.0\ntitle.phonetic_code : 10595.0\ntitle.season_nr : 74.0\ntitle.production_year : 118.0",
    "Main_Value": "char_name.name_pcode_nf : ['H5241', 143110]\nchar_name.surname_pcode : ['M5', 36245]\ncast_info.person_role_id : ['2', 2159065]\ncast_info.person_id : ['2554625', 27491]\ncast_info.note : ['(producer)', 2634863]\ncast_info.nr_order : ['1', 2087167]\ncast_info.role_id : ['1', 20427587]\ncompany_name.name_pcode_sf : ['L2145', 1123]\ncompany_name.country_code : ['[us]', 139312]\ncompany_name.name : ['AXN', 72]\ncompany_name.name_pcode_nf : ['P6325', 1171]\nmovie_companies.company_id : ['67', 75142]\nmovie_companies.note : ['(in association with)', 47728]\nmovie_companies.company_type_id : ['2', 2889102]\ntitle.kind_id : ['7', 3025933]\ntitle.title : ['(#1.1)', 25622]\ntitle.episode_nr : ['1', 143616]\ntitle.episode_of_id : ['628404', 10887]\ntitle.series_years : ['2015-????', 8467]\ntitle.phonetic_code : ['A1416', 9567]\ntitle.season_nr : ['1', 1308265]\ntitle.production_year : ['2014', 189251]",
    "Min_Max": "char_name.id : [1, 4314864]\ncast_info.id : [1, 63475835]\ncast_info.person_id : [1, 6226526]\ncast_info.movie_id : [1, 4730370]\ncast_info.person_role_id : [1, 4314864]\ncast_info.nr_order : [-2068070866, 1776839230]\ncast_info.role_id : [1, 11]\ncompany_name.id : [1, 362131]\ncompany_type.id : [1, 4]\nmovie_companies.id : [1, 4958296]\nmovie_companies.movie_id : [2, 4698791]\nmovie_companies.company_id : [1, 362131]\nmovie_companies.company_type_id : [1, 2]\nrole_type.id : [1, 12]\ntitle.id : [100000, 3399999]\ntitle.kind_id : [1, 8]\ntitle.production_year : [1888, 2115]\ntitle.episode_of_id : [99685, 3300011]\ntitle.season_nr : [1, 2015]\ntitle.episode_nr : [1, 91334]"}
sql = "SELECT MIN(chn.name) AS uncredited_voiced_character,\n       MIN(t.title) AS russian_movie\nFROM char_name AS chn,\n     cast_info AS ci,\n     company_name AS cn,\n     company_type AS ct,\n     movie_companies AS mc,\n     role_type AS rt,\n     title AS t\nWHERE ci.note LIKE '%(voice)%'\n  AND ci.note LIKE '%(uncredited)%'\n  AND cn.country_code = '[ru]'\n  AND rt.role = 'actor'\n  AND t.production_year > 2005\n  AND t.id = mc.movie_id\n  AND t.id = ci.movie_id\n  AND ci.movie_id = mc.movie_id\n  AND chn.id = ci.person_role_id\n  AND rt.id = ci.role_id\n  AND cn.id = mc.company_id\n  AND ct.id = mc.company_type_id;"
hints = generate_hints_for_query(sql, test_statistics, model, tokenizer)
print(hints)

system
You are an expert PostgreSQL query optimizer specialized in generating pg_hint_plan commands. Your task is to analyze SQL queries along with their database statistics and generate optimal hint commands to improve query execution plans.

pg_hint_plan hints include:
- Scan method hints: SeqScan, IndexScan, BitmapScan, TidScan
- Join method hints: NestLoop, HashJoin, MergeJoin
- Join order hints: Leading(...)
- Row number correction hints: Rows(...)
Generate hints that will result in the most efficient execution plan based on the provided statistics.


user
Analyze the following PostgreSQL query and generate optimal pg_hint_plan commands.

**Query:**
SELECT MIN(chn.name) AS uncredited_voiced_character,
       MIN(t.title) AS russian_movie
FROM char_name AS chn,
     cast_info AS ci,
     company_name AS cn,
     company_type AS ct,
     movie_companies AS mc,
     role_type AS rt,
     title AS t
WHERE ci.note LIKE '%(voice)%'
  AND ci.note LIKE '%(uncredited)%'
  AND cn.country_co