In [None]:
!pip install pypdf unstructured markdown faiss-cpu > /dev/null

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Knowledge-base sources: each file path is declared next to the loader
# that is constructed for it.

# Graduate training programme (PDF).
pdf_file = "./data/2023级计算机科学与技术学术硕士研究生培养方案.pdf"
pdf_loader = PyPDFLoader(file_path=pdf_file)

# First Markdown source.
md_file1 = "./data/历史沿革.md"
md_loader1 = UnstructuredMarkdownLoader(file_path=md_file1)

# Second Markdown source.
md_file2 = "./data/学院简介.md"
md_loader2 = UnstructuredMarkdownLoader(file_path=md_file2)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split(chunk_size, doc, chunk_overlap=None):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        chunk_size: Value passed to the splitter as ``chunk_size``.
        doc: The documents to split; forwarded unchanged to
            ``split_documents``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            When omitted (``None``) it defaults to ``chunk_size // 10``,
            i.e. the overlap the notebook has always used.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``doc``.
    """
    if chunk_overlap is None:
        # Historical default: one tenth of the chunk size (integer division).
        chunk_overlap = chunk_size // 10
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(doc)

In [None]:
# Chunk every source with its own chunk size (250 for the PDF, 50 and 100
# for the two Markdown files), then merge the chunks into a single list.
pdf_documents = split(250, pdf_loader.load())
md_documents1 = split(50, md_loader1.load())
md_documents2 = split(100, md_loader2.load())
documents = [*pdf_documents, *md_documents1, *md_documents2]
print(f'chunk nums = {len(documents)}')

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import DashScopeEmbeddings
import os

# Embedding client used to vectorise the chunks. The API key is read from
# the DASHSCOPE_API_KEY environment variable; a missing variable raises
# KeyError here rather than later.
embeddings = DashScopeEmbeddings(
    model="text-embedding-v1",
    dashscope_api_key=os.environ["DASHSCOPE_API_KEY"],
)


In [None]:
# Build the vector database from the chunks and their embeddings.
db = FAISS.from_documents(documents, embeddings)

# Wrap the database as a retriever that returns the k most relevant documents.
retriever = db.as_retriever(search_kwargs={'k': 10})


In [None]:
# Retrieval smoke test: fetch the documents the retriever considers relevant
# to a sample question. `invoke` is the Runnable entry point and replaces the
# deprecated `get_relevant_documents`, matching the `.invoke` calls used on
# the LLM and QA chain elsewhere in this notebook.
result = retriever.invoke("2019年7月6日，华东师范大学成立了哪个学院？")
result

In [None]:
# from langchain_community.llms import Tongyi
# llm = Tongyi( model_name="qwen2-72b-instruct", temperature=0.7, top_p=0.7 )
# print( llm.invoke("你好吗？") )

from langchain_community.llms import Tongyi
# Model used by the QA and evaluation chains below.
# NOTE(review): confirm that this Tongyi version accepts `temperature` and
# `max_tokens` as constructor fields (rather than via `model_kwargs`);
# if they are ignored, the 10-token cap is not actually applied.
llm = Tongyi(
    model_name="qwen1.5-1.8b-chat",
    temperature=0.95,
    top_p=0.7,
    max_tokens=10,
)

# Quick connectivity check: ask the model to greet as the knowledge-base assistant.
print(
    llm.invoke("你是一个基于华东师范大学计算机科学与技术学院知识库的问答助手，请打个招呼")
)

In [None]:
import langchain
from langchain.chains import RetrievalQA
from langchain.globals import set_debug
from langchain_core.prompts import PromptTemplate

# Keep LangChain's global debug tracing off while the QA chain is built and
# first queried. `set_debug` is the supported setter for this flag.
set_debug(False)

# Prompt for the "stuff" chain. It is runtime text sent to the model and must
# keep the {context} and {question} placeholders.
QA_PROMPT_TEMPLATE = '''
Use the following pieces of context to answer the question at the end by a sentence without any additional information. if you don't know the answer, just say that you don't know, don't try to make up an answer. 

{context}

Question: {question}
Answer:
'''

# Pass the prompt through `chain_type_kwargs` instead of overwriting
# `qa.combine_documents_chain.llm_chain.prompt.template` afterwards: that
# relied on the chain's internal attribute layout and modified the prompt
# object in place (which may be shared with other chains).
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    chain_type_kwargs={"prompt": PromptTemplate.from_template(QA_PROMPT_TEMPLATE)},
)


In [None]:
# Ask the QA chain a sample question. The chain's input is passed explicitly
# under its "query" key; the cell's output is the chain's response.
query = '2019年7月6日，华东师范大学成立了哪个学院？'
qa.invoke({"query": query})

In [None]:
from langchain.evaluation.qa import QAEvalChain
# Grading chain: the same `llm` that answers questions is also used as the grader.
eval_chain = QAEvalChain.from_llm(llm=llm)

import langchain
from langchain.globals import set_debug

# Turn on LangChain's global debug tracing for the evaluation run.
# `set_debug` is the supported setter for this flag.
set_debug(True)

# Evaluation set: each entry pairs a question ('query') with its reference
# answer ('answer'). Only the second case is currently enabled; the others
# are kept below for reference.
examples = [
    # {
    #     'query':'学院在未来的发展目标是什么？',
    #     'answer': '学院致力于建设成为世界一流的计算机科学与技术学院'
    # },
    {
        'query': '2019年7月6日，华东师范大学成立了哪个学院？',
        'answer': '计算机科学与技术学院',
    },
    # {
    #     'query':'硕士研究生的总学分要求是多少？',
    #     'answer': '硕士研究生的总学分要求是23学分'
    # }
]

# Run the QA chain over every example in one batch call.
predictions = qa.batch(inputs=examples)

In [None]:
# print(eval_chain.prompt.template)
# eval_chain.prompt.template = '''


# Instructions:
# You are a teacher grading a quiz.
# You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.

# Example Format:
# QUESTION: question here
# STUDENT ANSWER: student's answer here
# TRUE ANSWER: true answer here
# GRADE: CORRECT or INCORRECT here

# Guidelines:

# - Ensure that the number of main entities in the student's answer matches the number of main entities in the question.
# - Ignore minor differences in wording and punctuation.
# - Ensure that there are no conflicting statements in the student's answer. If the answer contains conflicting information or incorrect entities, it should be marked INCORRECT.
# - Additional information is acceptable only if it does not conflict with the true answer and does not introduce additional main entities.

# QUESTION: {query}
# STUDENT ANSWER: {result}
# TRUE ANSWER: {answer}
# GRADE:
# '''

In [None]:
# Grade each prediction against the reference answer of its example.
graded_outputs = eval_chain.evaluate(examples=examples, predictions=predictions)

In [None]:
# Print a per-example report: question, reference answer, model answer and
# the grade assigned by the evaluation chain. Predictions and grades are
# walked in lockstep; f-strings are used so a non-string value is formatted
# instead of raising TypeError on concatenation.
for i, (prediction, graded) in enumerate(zip(predictions, graded_outputs)):
    print(f"Example {i}:")
    print(f"Question: {prediction['query']}")
    print(f"Real Answer: {prediction['answer']}")
    print(f"Predicted Answer: {prediction['result']}")
    print(f"Predicted Grade: {graded['results']}")
    print()


In [None]:
from langchain_core.prompts import PromptTemplate

# Reproduce by hand what the grader sees for the first prediction: rebuild
# the evaluation prompt from the chain's template, fill it in, print it,
# and send the resulting text straight to the model.
prompt = PromptTemplate.from_template(eval_chain.prompt.template)
query, answer, result = (
    predictions[0][key] for key in ('query', 'answer', 'result')
)

prompt_txt = prompt.format(query=query, answer=answer, result=result)
print(prompt_txt)

llm.invoke(prompt_txt)

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Tongyi

# Initialise the model with streaming enabled. Streaming is switched on via
# the constructor's `streaming` field rather than a per-call `stream=True`
# keyword argument.
llm = Tongyi(
    model_name="qwen1.5-1.8b-chat",
    temperature=0.95,
    top_p=0.7,
    streaming=True,
)

# Callback handler that writes each streamed token to stdout as it arrives.
callbacks = [StreamingStdOutCallbackHandler()]

# Run the model with streaming output. `invoke` replaces the deprecated
# direct call `llm(...)`; callbacks are supplied through the run config.
response = llm.invoke("你的问题或者提示", config={"callbacks": callbacks})