In [1]:
pip install "huggingface_hub[hf_xet]"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
INPUT_PATH = "datasets/reviews.xlsx"
OUTPUT_PATH = "datasets/reviews_out.xlsx"

# Load the raw reviews workbook with pandas' default read_excel settings.
df = pd.read_excel(INPUT_PATH)

# A single-column sheet is treated as the review text, whatever its header says.
if df.shape[1] == 1:
    df.columns = ["review"]

# Prefer a column literally named "review"; otherwise fall back to the first one.
REVIEW_COLUMN = "review"
if REVIEW_COLUMN not in df.columns:
    REVIEW_COLUMN = df.columns[0]

# Blank cells are read as NaN. A bare astype(str) would turn them into the
# literal text "nan", which would then be sent to the model as if it were a
# review. Fill them with "" first so analyze_review() returns its neutral
# default for them without calling the model.
df[REVIEW_COLUMN] = df[REVIEW_COLUMN].fillna("").astype(str)
df.head()


Unnamed: 0,review
0,"Ю Юлия А. Цвет товара: синий, Российский разме..."
1,"С Сергей Ш. Цвет товара: синий, Российский раз..."
2,"Цвет товара: серый, Российский размер: 50, Раз..."
3,"Ф Флера М. Цвет товара: белый, Российский разм..."
4,"С Сергей Т. изменен Цвет товара: черный, Росси..."


In [4]:
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

# Tokenizer and causal-LM weights are resolved by model id (downloaded on first
# use, then served from the local Hugging Face cache).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # `torch_dtype` is deprecated in the installed transformers release (it
    # warns "Use `dtype` instead!"); "auto" takes the dtype from the checkpoint.
    dtype="auto",
    # Automatic placement: layers that do not fit on the GPU may be offloaded
    # to the CPU, which slows generation considerably.
    device_map="auto",
)

print("cuda:", torch.cuda.is_available())
# NOTE: this reports the device of the first parameter only; with
# device_map="auto" other layers can still live on the CPU.
print("model device:", next(model.parameters()).device)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 2 files: 100%|█████████████████████████████████████████████████████████████████| 2/2 [07:41<00:00, 230.73s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.34s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


cuda: True
model device: cuda:0


In [5]:
# System instruction sent with every review (see analyze_review). It is written
# in Russian and asks the model for a bare JSON object with three fields:
# sentiment, aspect and a short summary. analyze_review() validates sentiment
# and aspect against the same value lists that are spelled out here, so keep
# the two in sync when editing either.
SYSTEM_PROMPT = (
    "Ты аналитик отзывов маркетплейса. "
    "Выдай строго JSON без пояснений и без markdown. "
    "Поля: sentiment (positive|neutral|negative), aspect (quality|price|delivery|service|other), summary (коротко 5-15 слов). "
    "Игнорируй мусор, артикулы, случайные символы, обрывки."
)

def _extract_json(text: str) -> dict:
    m = re.search(r"\{.*\}", text, flags=re.S)
    if not m:
        return {"sentiment": "neutral", "aspect": "other", "summary": ""}
    try:
        return json.loads(m.group(0))
    except Exception:
        return {"sentiment": "neutral", "aspect": "other", "summary": ""}

def _normalize_label(value, allowed, default):
    """Return ``value`` stripped and lower-cased if it is an allowed label, else ``default``."""
    # Model JSON can carry non-string values (lists, dicts, numbers). A list or
    # dict is unhashable and would raise TypeError in a set-membership test.
    if not isinstance(value, str):
        return default
    label = value.strip().lower()
    return label if label in allowed else default


@torch.inference_mode()
def analyze_review(review: str, max_chars: int = 4000, max_new_tokens: int = 200) -> dict:
    """Classify one review with the chat model and return a validated result.

    Args:
        review: Raw review text. ``None``, NaN and blank strings are treated as
            "no review" and are not sent to the model.
        max_chars: Maximum number of characters of the review that are placed
            in the prompt.
        max_new_tokens: Generation budget for the model's answer.

    Returns:
        A dict with exactly three string fields:
        ``sentiment`` (positive|neutral|negative), ``aspect``
        (quality|price|delivery|service|other) and ``summary``. Any field the
        model omits or fills with an unexpected value falls back to
        ``"neutral"`` / ``"other"`` / ``""`` respectively.
    """
    # Missing values would otherwise be stringified to "None"/"nan" and
    # analysed as if they were real reviews.
    if review is None or (isinstance(review, float) and review != review):
        return {"sentiment": "neutral", "aspect": "other", "summary": ""}

    review = str(review).strip()
    if not review:
        return {"sentiment": "neutral", "aspect": "other", "summary": ""}

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": review[:max_chars]},
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Greedy decoding. `temperature` is ignored when do_sample=False (and
    # triggers an "invalid generation flags" warning), so it is not passed.
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_len = inputs["input_ids"].shape[1]
    gen = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    data = _extract_json(gen)

    sentiment = _normalize_label(
        data.get("sentiment"), {"positive", "neutral", "negative"}, "neutral"
    )
    aspect = _normalize_label(
        data.get("aspect"), {"quality", "price", "delivery", "service", "other"}, "other"
    )

    summary = data.get("summary", "")
    if summary is None:
        # A JSON null must not end up as the text "None" in the output sheet.
        summary = ""
    elif not isinstance(summary, str):
        summary = str(summary)

    return {"sentiment": sentiment, "aspect": aspect, "summary": summary}


In [6]:
# Register Series.progress_apply so the long loop below shows a progress bar.
tqdm.pandas()

# One model.generate() call per row. This is the expensive step of the notebook
# (the recorded run took roughly 33 s per review).
res = df[REVIEW_COLUMN].progress_apply(analyze_review)

# Build the result frame on the Series' own index so the column assignments
# below align row-for-row even if df does not have a default 0..n-1 index.
# Naming the columns also keeps this working when df is empty.
RESULT_COLUMNS = ["sentiment", "aspect", "summary"]
res_df = pd.DataFrame(res.tolist(), index=res.index, columns=RESULT_COLUMNS)

for col in RESULT_COLUMNS:
    df[col] = res_df[col]

df.head()


  0%|                                                                                           | 0/71 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████████████████████████████████████████████████████████████████████████████| 71/71 [39:26<00:00, 33.33s/it]


Unnamed: 0,review,sentiment,aspect,summary
0,"Ю Юлия А. Цвет товара: синий, Российский разме...",neutral,other,
1,"С Сергей Ш. Цвет товара: синий, Российский раз...",negative,quality,Ткань не соответствует ожиданиям
2,"Цвет товара: серый, Российский размер: 50, Раз...",neutral,other,
3,"Ф Флера М. Цвет товара: белый, Российский разм...",positive,quality,Худи хорошего качества
4,"С Сергей Т. изменен Цвет товара: черный, Росси...",neutral,other,


In [7]:
# Make sure the destination folder exists, so the long inference run is not
# lost to a missing-directory error at save time.
out_dir = os.path.dirname(OUTPUT_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Choose the writer from the file extension: Excel for .xlsx/.xls, CSV otherwise.
ext = os.path.splitext(OUTPUT_PATH)[1].lower()
if ext in [".xlsx", ".xls"]:
    df.to_excel(OUTPUT_PATH, index=False)
else:
    # utf-8-sig prepends a BOM, which helps spreadsheet tools detect UTF-8
    # (relevant here because the reviews are Cyrillic text).
    df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

OUTPUT_PATH


'datasets/reviews_out.xlsx'