## 总结item的关键词

In [1]:
from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import GenerationConfig
import os
import torch

In [2]:
# 设置GPU
torch.cuda.set_device(0)

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
device

device(type='cuda', index=0)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("damo/nlp_polylm_qwen_7b_text_generation", revision = 'v1.0.1', trust_remote_code=True)

Downloading Model to directory: /home/yuqihang/.cache/modelscope/hub/models/damo/nlp_polylm_qwen_7b_text_generation


2025-02-24 11:37:46,577 - modelscope - INFO - Use user-specified model revision: v1.0.1


In [6]:
model = AutoModelForCausalLM.from_pretrained("/data/yuqihang/model/DeepSeek-R1-Distill-Qwen-1.5B").eval().to(device)

In [5]:
model = AutoModelForCausalLM.from_pretrained("/home/yuqihang/.cache/modelscope/hub/models/damo/nlp_polylm_qwen_7b_text_generation", revision = 'v1.0.1',trust_remote_code=True, bf16=True).eval().to(device)

KeyError: 'qwen2'

In [83]:
# 创建 GenerationConfig 对象
generation_config = GenerationConfig.from_pretrained(
    "damo/nlp_polylm_qwen_7b_text_generation",
    revision='v1.0.1',
    trust_remote_code=True
)

# 更新配置参数
generation_config.max_new_tokens = 32
generation_config.min_length = 1
generation_config.do_sample = False  # 禁用采样，启用贪婪搜索
generation_config.num_beams = 4      # 使用束搜索，宽度为 4
generation_config.num_return_sequences = 1  # 返回一个序列
generation_config.repetition_penalty = 1.2  # 重复惩罚系数
generation_config.no_repeat_ngram_size = 2  # 不重复的 n-gram 大小
generation_config.early_stopping = True

# 将配置应用到模型
model.generation_config = generation_config

Downloading Model to directory: /home/yuqihang/.cache/modelscope/hub/models/damo/nlp_polylm_qwen_7b_text_generation


2025-02-24 12:52:11,985 - modelscope - INFO - Use user-specified model revision: v1.0.1


In [13]:
import numpy as np
import pandas as pd
import json

In [88]:
json_path = '/home/yuqihang/projects/CoLLM/collm-datasets/booknew/id2title.json'
rawdict = json.load(open(json_path, 'r'))

In [80]:
data_types = ['book','movie']
data_type_id = 0
prompt_prefix = f'You are a skilled text summarizer. Your task is to extract up to ten key words from the given profile of the {data_types[data_type_id]} above. Do not include the word "book" in your summary. Answers should contain only keywords, which should be separated by commas.\nKeywords:'

In [94]:
def get_desc(meta:dict)->str:
    description = ''
    candidates = ['category','brand','price','description']
    for item in candidates:
        if item in meta:
            description = f'{description}"{item.capitalize()}":{" ".join(meta[item].replace("...",".").replace("&","and").replace("Book","").replace("book","").split()[:100])}.'
    return description

In [71]:
testdic = {
    'description':"\"I didn't want to put it down the entire time I was reading ...I still can't stop thinking about it.\" --Megan, Starky Reviews \"Great storytelling ... This isNOT a cookie-cutter dystopian read.\" --Dianne, Tome Tender Book Blog \"This book was AWESOME!... An epic read ... If you ... love books like The Hunger Games, Divergent, and The Maze Runner, this book is right up your alley ...Five stars!\" --Stephanie, TeacherofYA's Book Blog \"This book is incredible! ... Imaginative ... fast paced ... full of action ... filled with a lot of twists and turns ... and the world building is truly fantastic ... 5 Brilliant Stars.\" --Karen Jo, Sincerely Karen Jo \"5 Stars!!!!!! Unputdownable!! ... My new favorite book ... If YOU love a good dystopian + romance ... Then you need this book in your life.\" --Megan,i fall in love book blog \"An intensely awesome read ... The suspense and the thrilling scenarios kept me on edge ... Very creative and brilliant ... Highly recommended.\" --Denise, Goodreads \"This bookoffers everything from action to romance with twists that keep you guessing. If you enjoyedthe Divergent Seriesyou will definitely enjoy this! A must-read.\" --Ashley, Goodreads \"This post-apocalyptic, young adult, science fiction is a page turner from start to finish!Similar to The Giver in the beginning and things just get more and more interesting ... Highly recommend!\" -- Susie, Goodreads \"An amazing read .. Most stories seem predictable at some point but this one surprised me. At only one single point was I able to say, yep I see what's going to happen next. Yeah, right. I was totally wrong.\" --Amanda, Goodreads",
    'brand':'Yilin Publisher',
    'price':'$87.1'
}
print(get_desc(testdic))

"Description":"I didn't want to put it down the entire time I was reading .I still can't stop thinking about it." --Megan, Starky Reviews "Great storytelling . This isNOT a cookie-cutter dystopian read." --Dianne, Tome Tender Book Blog "This book was AWESOME!. An epic read . If you . love books like The Hunger Games, Divergent, and The Maze Runner, this book is right up your alley .Five stars!" --Stephanie, TeacherofYA's Book Blog "This book is incredible! . Imaginative . fast paced . full of action . filled with a lot of twists and turns . and the world building is."Brand":Yilin Publisher."Price":$87.1.


In [86]:
inputs = tokenizer(f'{get_desc(testdic)}\n{prompt_prefix}', return_tensors='pt')
inputs = inputs.to(device)
pred = model.generate(**inputs)
raw = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
print(raw)
keywords = ', '.join(raw.split('Keywords:')[-1].strip().replace('"','').split(', ')[:10])
print(keywords)

"Description":"I didn't want to put it down the entire time I was reading .I still can't stop thinking about it." --Megan, Starky Reviews "Great storytelling . This isNOT a cookie-cutter dystopian read." --Dianne, Tome Tender Book Blog "This book was AWESOME!. An epic read . If you . love books like The Hunger Games, Divergent, and The Maze Runner, this book is right up your alley .Five stars!" --Stephanie, TeacherofYA's Book Blog "This book is incredible! . Imaginative . fast paced . full of action . filled with a lot of twists and turns . and the world building is."Brand":Yilin Publisher."Price":$87.1.
You are a skilled text summarizer. Your task is to extract up to ten key words from the given profile of the book above. Do not include the word "book" in your summary. Answers should contain only keywords, which should be separated by commas.
Keywords: dystopia, post-apocalyptic, science fiction, survival, young adult, action, adventure, romance, love, family, friendship, betrayal, lo

In [100]:
from tqdm import tqdm

In [None]:
sbar = tqdm(total=5)
for idx,meta in list(rawdict.items())[:5]:
    inputs = tokenizer(f'{get_desc(meta)}\n{prompt_prefix}:', return_tensors='pt')
    inputs = inputs.to(device)
    pred = model.generate(**inputs)
    raw = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
    raw_words = raw.split('Keywords:')[-1].strip()
    cleaned_words = raw_words.replace('"','').replace(':','').replace('  ',' ').split(', ')
    filer_words = list(filter(lambda x: x not in ['book','books','Book','Books'], cleaned_words))[:10]
    # print(raw)
    keywords = ', '.join(filer_words)
    # rawdict[idx]['keywords'] = keywords
    torch.cuda.empty_cache()
    sbar.set_postfix(keywords=keywords)
    sbar.update()

 20%|██        | 1/5 [00:06<00:24,  6.04s/it]

 literature, fiction, dramas, plays


 40%|████      | 2/5 [00:12<00:18,  6.28s/it]

 literature, fiction, classics, brand, visit, amazon, robert, louis, stevenson, page


 60%|██████    | 3/5 [00:15<00:09,  4.93s/it]

 Shakespeare, William, Biography, Drama, History, Literature, Poetry, Plays, Sonnets


 80%|████████  | 4/5 [00:19<00:04,  4.46s/it]

 fiction, literature, piper, watty, price, description, when, other, engines, refuses


100%|██████████| 5/5 [00:28<00:00,  5.62s/it]

 book, children's, classic, dr, seuss, page, price, summary, text, title



