## Prepare Dependencies

In [1]:
!pip install langchain
!pip install -U langchain-community
!pip install tiktoken
!pip install -U qdrant-client
!pip install langchain_experimental
!pip install "langchain[docarray]"
!pip install langchain-huggingface text-generation transformers google-search-results numexpr langchainhub sentencepiece jinja2

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [2]:
from langchain.document_loaders import TextLoader
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import ChatHuggingFace
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-06-22 22:40:48.891279: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from langchain_huggingface.llms import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="Aaron080108/Chinese-Poetry-Generation",
    device=0,
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 300},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading shards: 100%|██████████| 4/4 [00:01<00:00,  3.25it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.18it/s]


In [4]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chains import RetrievalQA

## Data Preparing

In [8]:
#put in the original json file
import json
with open("poet.tang.1000.json", "r") as f:
    RAG_data = json.load(f)
with open("唐诗——简.txt", "a") as f:
    for each in RAG_data:
        for i in each["paragraphs"]:
            f.write(i)

In [9]:
#Prepare the text of Peter Pan and load it
txt_file_path = '唐诗——简.txt'
loader = TextLoader(file_path=txt_file_path, encoding="utf-8")
doc = loader.load()

In [10]:
#Chunk the text into pieces of "chunks", or "nodes" in the llama index
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50)
chunks = text_splitter.split_documents(doc)
print('done.')

done.


In [11]:
# Take a look of your documents
print (f'Now you have {len(chunks)} documents')
print(chunks[5].page_content)

Now you have 4388 documents
抵长城，金徽暎高阙。遥心万余里，直望三边月。霜静影逾悬，露晞光渐没。思君不可见，空叹将焉歇。塞北狂胡旅，城南敌汉围。巉岩一鼓气，拔利五兵威。虏骑瞻山哭，王师拓地飞。不应须宠战，当遂勒金徽。亡国秦韩代，


In [12]:
#Prepare different embedding model for LOTR. 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
!pip install sentence-transformers

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [13]:
h_embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-large-zh")
embedding = h_embedding

story_h = Qdrant.from_documents(
    chunks,
    embedding,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="my_documents",
)

  warn_deprecated(


In [14]:
#Build a retriever
retriever = story_h.as_retriever(search_type="mmr") # Maximal marginal relevance 

## Base line of wenbopan/Faro-Qwen-1.8B

In [15]:
# Define a LLM
llm = ChatHuggingFace(llm=llm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [78]:
# Define prompt template
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
system_prompt = (
    """
    你是一位诗歌专家，请根据给定的文本围绕给定的题目写一首诗。

    "\n\n"
    "文本：{context}
    主题：{input}

    """
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [80]:
question = "柳树"
response = rag_chain.invoke({"input":question})
response["answer"]

'<|im_start|>system\n\n    你是一位诗歌专家，请根据给定的文本围绕给定的题目写一首诗。\n\n    "\n\n"\n    "文本：。花惊燕地雪，叶映楚池波。谁堪别离此，征戍在交河。东城攀柳叶，柳叶低着草。少壮莫轻年，轻年有人老。柳发遍川岗，登高堪断肠。雨烟轻漠漠，何树近君乡？赠君折杨柳，颜色岂能久。上客莫霑巾，佳人正回首。新柳送\n\n，万物无态。唯有吾庭前杉松树枝，枝枝健在。冰峰撑空寒矗矗，雪凝水冻埋海陆。杀物之性，伤人之欲。既不能断绝蒺藜荆棘之根株，又不能展凤皇麒麟之拳跼。如此则何如为和煦，为膏雨。自然天下之荣枯，融融于万户。北\n\n鸦。树叶无声神去后，纸钱灰出木绵花。可怜杨叶复杨花，雪净烟深碧玉家。乌栖不定枝条弱，城头夜半声哑哑。浮萍摇荡门前水，任罥芙蓉莫堕沙。黄云城边乌欲栖，归飞哑哑枝上啼。机中织锦秦川女，碧纱如烟隔窗语。停梭\n\n，翠叶贯寒霜。拂牖分龙影，临池待凤翔。散影玉阶柳，含翠隐鸣蝉。微形藏叶里，乱响出风前。盘根直盈渚，交干横倚天。舒华光四海，卷叶荫三川。近谷交萦橤，遥峰对出莲。径细无全磴，松小未含烟。疾风知劲草，板荡识\n    主题：柳树\n\n    <|im_end|>\n<|im_start|>user\n柳树<|im_end|>\n<|im_start|>assistant\n花惊燕地雪，叶映楚池波。 谁堪别离此，征戍在交河。 东城攀柳叶，柳叶低着草。 少壮莫轻年，轻年有人老。 柳发遍川岗，登高堪断肠。 雨烟轻漠漠，何树近君乡？ 赠君折杨柳，颜色岂能久。 上客莫霑巾，佳人正回首。 新柳送，万物无态。 唯有吾庭前杉松树枝，枝枝健在。'

In [32]:
!pip install langchain-openai

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting langchain-openai
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/15/bb/e8f080c8408673609f15436237ec8f9d8c39ab67986807d0ca2663acd7a0/langchain_openai-0.1.9-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m360.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: langchain-openai
Successfully installed langchain-openai-0.1.9


In [76]:
import os
import getpass
os.environ['OPENAI_API_KEY'] = getpass.getpass()

 ···················································


In [77]:
from langchain_openai import ChatOpenAI
grade_model = ChatOpenAI(model="gpt-4", base_url="https://api.chatanywhere.tech/v1")
grade_system_template = "你是一个中国古代诗歌评论家，接下来我会给你一首诗歌，请你依据以下标准来评价这首诗歌的得分。评分标准为：①格式。生成的唐诗是否遵循写作规则,尤其注意对仗、句数、字数、平仄（0至5分）；②创新性。生成的唐诗是否抄袭现成的诗句，是否具有创新性（0至5分）；③相关性。生成的唐诗与给定主题是否相关（0至5分）；④美学。生成的唐诗是否符合中国诗词的审美，包括寓情于物等（0至5分）；⑤整体。从整体情况评价生成的唐诗（0至10分）.请你在最后给出这几项的得分，并且你不需要给出评判的过程和任何文字只需要输出最后的5个数字。"
grade_prompt_template = ChatPromptTemplate.from_messages(
    [("system", grade_system_template), ("user", "{text}")]
)
grade_chain = grade_prompt_template | grade_model | parser

In [82]:
theme = ["青春", "梦想", "高山", "深海", "星空", "失落", "爱情", "孤独", "希望", "迷茫",
    "成长", "离别", "重逢", "夕阳", "清晨", "冬雪", "春花", "夏雨", "秋叶", "往事",
    "忧伤", "欢乐", "旅行", "归宿", "朋友", "家庭", "传统", "创新", "自由", "约束",
    "城市", "乡村", "河流", "山谷", "荒漠", "林间", "夜晚", "黎明", "阴雨", "晴天",
    "风景", "人群", "孤单", "热闹", "平静", "激情", "压力", "解脱", "寻找", "发现",
    "失去", "获得", "旧时光", "新生", "悲伤", "喜悦", "分手", "团聚", "哲理", "纷争",
    "和平", "冒险", "安全", "学习", "教育", "成就", "失败", "胜利", "竞争", "合作",
    "梦魇", "幻想", "现实", "逃避", "面对", "变化", "恒常", "追忆", "遗忘", "诗歌",
    "绘画", "音乐", "舞蹈", "雕塑", "建筑", "文学", "历史", "未来", "科技", "自然",
    "灾害", "恢复", "衰退", "繁荣", "衰弱", "强健", "疾病", "治愈", "传说", "神话"]
result = []
score = []
for each in theme:
    result.append(rag_chain.invoke({"input":each}))
print(result)

[{'input': '青春', 'context': [Document(page_content='风。年年岁岁花相似，岁岁年年人不同。寄言全盛红颜子，须怜半死白头翁。此翁白头真可怜，伊昔红颜美少年。公子王孙芳树下，清歌妙舞落花前。光禄池台文锦绣，将军楼阁画神仙。一朝卧病无人识，三春行乐在谁边。宛转', metadata={'source': '唐诗——简.txt', '_id': 'a40cb5f7c5b54be6b4bce6c2ca19138c', '_collection_name': 'my_documents'}), Document(page_content='而已波，笋在苞兮高不见节。矧桃李之当春，竞众人之攀折。我自顾悠悠而若云，又安能保君皓皓之如雪。感破镜之分明，睹泪痕之余血。幸他人之既不我先，又安能使他人之终不我夺。已焉哉！织女别黄姑，一年一度暂相见，', metadata={'source': '唐诗——简.txt', '_id': '26e869633fd84cf49500876274e3021f', '_collection_name': 'my_documents'}), Document(page_content='衰。白露霑长早，青春每到迟。秋之水兮其色幽幽，我将济兮不得其由。涉其浅兮石啮我足，乘其深兮龙入我舟，我济而悔兮将安归尤？归乎归乎，无与石鬬兮无应龙求。龟之气兮不能云雨，龟之枿兮不中梁柱，龟之大兮祗以奄', metadata={'source': '唐诗——简.txt', '_id': '2c0fc53970054f81aa5b875e9e617ac8', '_collection_name': 'my_documents'}), Document(page_content='情。独思作霖雨，流润及生灵。幽人惜春暮，潭上折芳草。佳期何时还，欲寄千里道。曲阜国，尼丘山。周公邈难问，夫子犹启关。履风雩兮若见，游夏兴兮鲁颜。天孙天孙，何为今兮学且难，负星明而东游闲闲。君不见渔阳八', metadata={'source': '唐诗——简.txt', '_id': 'aa47027846014e52919d374e485e3399', '_collection_name': 'my_documents'})], 'answer': 

TypeError: list indices must be integers or slices, not str