In [None]:
!pip install together

Collecting together
  Downloading together-1.5.13-py3-none-any.whl.metadata (15 kB)
Collecting eval-type-backport<0.3.0,>=0.1.3 (from together)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting typer<0.16,>=0.9 (from together)
  Downloading typer-0.15.4-py3-none-any.whl.metadata (15 kB)
Collecting click<9.0.0,>=8.1.7 (from together)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading together-1.5.13-py3-none-any.whl (90 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.7/90.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Downloading typer-0.15.4-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m7.5 MB/s[0m eta [36m

In [43]:
import ast
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor
from itertools import product
from pathlib import Path
from typing import Literal

import pandas as pd
from tabulate import tabulate
from together import Together
from tqdm import tqdm

In [None]:
def encode_labels(m: int) -> str:
    """Translate a numeric class id of the news dataset into its category name.

    Args:
        m: Class id; 1 = world, 2 = sports, 3 = business, 4 = sci/tech.

    Returns:
        The lowercase category name for ``m``.

    Raises:
        ValueError: If ``m`` is not one of the four known ids. Previously the
            function fell through and returned ``None``, which silently
            produced targets that no prediction could ever match.
    """
    names = {1: "world", 2: "sports", 3: "business", 4: "sci/tech"}
    try:
        return names[m]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs; both cases mean "not a valid id".
        raise ValueError(f"Unknown label id: {m!r}") from None

In [58]:
# The CSV has no header row, so the three column names are supplied explicitly.
url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv"
df = pd.read_csv(url, header=None, names=["label","title","description"])

# Join title and description into one text field, then draw a fixed 500-row sample.
df = (
    df.assign(text=df["title"] + ". " + df["description"])
    .sample(n=500, random_state=42)
    .reset_index(drop=True)
)

# Replace numeric class ids with category names.
df = df.assign(label=df["label"].map(encode_labels))

texts = df["text"].to_list()
df = df.loc[:, ["label", "text"]]

In [59]:
# Keep at most `size` rows per label (in current row order), then reshuffle with a fixed seed.
size = 200
df_lim = df.groupby("label", sort=False).head(size)
df_lim = df_lim.reset_index(drop=True)
df_shuffled = df_lim.sample(frac=1, random_state=42)
df_shuffled = df_shuffled.reset_index(drop=True)

In [60]:
# Category name -> short description. The keys are the same four strings that
# encode_labels() produces, and they are rendered into SYSTEM_PROMPT below.
NEWS_CATEGORIES = {
    "sports":   "Sports events, athletes, competitions",
    "world":    "Global news, international affairs",
    "sci/tech": "Science and technology, research, innovations",
    "business": "Economy, markets, finance, companies"
}

# System message used for every request. The {categories_list} placeholder is
# filled once, at definition time, with one "- name: description" line per
# entry of NEWS_CATEGORIES; the surrounding blank lines are stripped first.
SYSTEM_PROMPT = """
You are a news categorization expert. Your task is to classify news texts into predefined categories.

Available categories:
{categories_list}

Guidelines:
- Be precise and consistent in your categorization
- Consider the main theme and context of the text
- If a text could fit multiple categories, choose the most dominant one
- For soft classification, provide probability scores that sum to 1
- For hard classification, select the single most appropriate category
""".strip().format(
    categories_list="\n".join(f"- {cat}: {desc}" for cat, desc in NEWS_CATEGORIES.items())
)

# User prompt for "soft" mode: asks for one probability per category, wrapped
# in <answer> tags. {description} is the only placeholder; the doubled braces
# become literal braces when the template is passed through str.format().
# Note that the four category keys are spelled out here by hand rather than
# derived from NEWS_CATEGORIES.
CLASSIFY_SOFT_PROMPT_TEMPLATE = """
Assign a probability score (0 < score < 1) to each category so they sum to 1.
Wrap your response in <answer></answer> tags.

# Expected format:
<answer>
{{
    "sports":    <probability>,
    "world":     <probability>,
    "sci/tech":  <probability>,
    "business":  <probability>
}}
</answer>

News Text:
{description}

Provide only the JSON response without any additional text or explanations.
""".strip()

# User prompt for "hard" mode: asks for a single category under the "Category"
# key, wrapped in <answer> tags. {description} is the only placeholder; the
# doubled braces become literal braces when the template is passed through
# str.format().
CLASSIFY_HARD_PROMPT_TEMPLATE = """
Select the most fitting category (among provided) for the given news text.
Wrap your response in <answer></answer> tags.

# Expected format:
<answer>
{{
    "Category": "<selected category>"
}}
</answer>

News Text:
{description}

Provide only the JSON response without any additional text or explanations.
""".strip()

In [61]:
# The API key is read from the environment so that it never lands in the notebook
# file or its version history.
# NOTE(review): a literal key was previously committed in this cell; treat that key
# as compromised and rotate it.
API_KEY = os.environ.get("TOGETHER_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the TOGETHER_API_KEY environment variable before running this notebook."
    )
client = Together(api_key=API_KEY)

# Number of prompts sent concurrently; also the thread-pool size in run_evaluation().
batch_size = 16

In [64]:
def _parse_answer(content, mode: str) -> str:
    """Turn one raw model reply into a predicted category name.

    Args:
        content: Message text returned by the model; may be ``None`` or empty.
        mode: ``"soft"`` selects the key with the highest score; any other
            value reads the ``"Category"`` field of the answer.

    Returns:
        The predicted category, or the sentinel ``"ERROR"`` when the reply has
        no ``<answer>...</answer>`` block, cannot be parsed, or does not contain
        a usable mapping.
    """
    if not content:
        return "ERROR"
    match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL)
    if not match:
        return "ERROR"

    raw = match.group(1)
    # Strip single-line and block comments and trailing commas, none of which
    # are valid JSON.
    raw = re.sub(r"//.*", "", raw)
    raw = re.sub(r"/\*.*?\*/", "", raw, flags=re.DOTALL)
    raw = re.sub(r",\s*([}\]])", r"\1", raw).strip()

    try:
        # The prompt asks for JSON, so parse it as JSON first.
        result = json.loads(raw)
    except (ValueError, RecursionError):
        try:
            # Fallback for Python-style literals (e.g. single-quoted keys).
            result = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "ERROR"

    # Anything other than a non-empty dict cannot yield a category.
    if not isinstance(result, dict) or not result:
        return "ERROR"
    if mode == "soft":
        try:
            return max(result, key=result.get)
        except TypeError:
            # Scores that cannot be compared with each other (e.g. str vs float).
            return "ERROR"
    return result.get("Category", "ERROR")


def run_evaluation(model_name: str, mode: Literal["soft", "hard"]) -> float:
    """Classify every row of ``df_shuffled`` with one model and report accuracy.

    Each row's ``text`` is inserted into the soft or hard prompt template and
    sent to ``client`` together with ``SYSTEM_PROMPT``. Requests are issued in
    batches of ``batch_size`` concurrent calls, and predictions are compared
    with the row labels in order.

    Args:
        model_name: Model identifier passed to the chat-completions endpoint.
        mode: ``"soft"`` uses the probability prompt and takes the arg-max
            category; any other value uses the single-category prompt.

    Returns:
        Fraction of rows whose prediction equals the label. Replies that
        cannot be parsed count as wrong.

    Raises:
        ValueError: If ``df_shuffled`` contains no rows.
    """
    template = (
        CLASSIFY_SOFT_PROMPT_TEMPLATE if mode == "soft" else CLASSIFY_HARD_PROMPT_TEMPLATE
    )
    prompts = [template.format(description=row.text) for row in df_shuffled.itertuples()]
    targets = list(df_shuffled.label)
    if not targets:
        raise ValueError("df_shuffled is empty; nothing to evaluate")

    def _worker(text_prompt: str) -> str:
        # API errors are intentionally not caught: a bad key or an outage
        # should stop the run instead of being scored as wrong answers.
        resp = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": text_prompt},
            ],
        )
        return _parse_answer(resp.choices[0].message.content, mode)

    predictions = []
    # One pool for the whole run; each batch is fully consumed before the next
    # one starts, so at most `batch_size` requests are in flight at a time.
    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        for start in tqdm(range(0, len(prompts), batch_size), desc="Processing Batches"):
            batch = prompts[start : start + batch_size]
            predictions.extend(executor.map(_worker, batch))

    accuracy = sum(p == t for p, t in zip(predictions, targets)) / len(targets)
    print(f"[{Path(model_name).name}][{mode}] Accuracy: {accuracy:.4f}")
    return accuracy


In [65]:
# Evaluate every model in both modes; all "hard" runs come before the "soft" ones.
models = ["lgai/exaone-3-5-32b-instruct"]
results = []
for mode in ("hard", "soft"):
    for model in models:
        acc = run_evaluation(model, mode)
        results.append([mode, model, acc])

Processing Batches: 100%|██████████| 32/32 [00:29<00:00,  1.08it/s]


[exaone-3-5-32b-instruct][hard] Accuracy: 0.8320


Processing Batches: 100%|██████████| 32/32 [00:55<00:00,  1.74s/it]

[exaone-3-5-32b-instruct][soft] Accuracy: 0.8480





In [66]:
# Stored under its own name: rebinding `df` here would overwrite the dataset frame
# that the sampling cell reads, breaking any re-run of that cell.
results_df = pd.DataFrame(results, columns=["Mode", "Model", "Accuracy"])
print(results_df.to_string(index=False))

Mode                        Model  Accuracy
hard lgai/exaone-3-5-32b-instruct     0.832
soft lgai/exaone-3-5-32b-instruct     0.848


## Выводы

Soft-режим немного обходит hard: точность 0.848 против 0.832 на 500 примерах (привет, pirelli).

1.	Гибкость формата
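- В soft-режиме модель явно оценивает вероятность каждой категории, а итоговая метка берётся как аргмакс этого распределения. В подсчёт точности попадает только категория с наибольшей вероятностью, поэтому выигрыш, по-видимому, связан с тем, что модель взвешивает все варианты, прежде чем выбрать один.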

2.	Сложности парсинга
- В hard-режиме модель должна чётко придерживаться JSON-формата и вернуть именно поле "Category". Любая мелкая неточность или дополнительный текст (комментарий, лишняя запятая) приводит к сбою и ошибкам парсинга, что снижает надёжность.