In [1]:
import os

# Read the Hugging Face token from the environment so it is never hard-coded
# in the notebook; the value is None when the variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [3]:
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

In [4]:
# Create the prompt template: a single {question} slot followed by a
# "think step by step" cue for the model.
question = "Where is the capital of China? "

template = "Question: {question}\n\nAnswer: Let's think step by step."

prompt = PromptTemplate(input_variables=["question"], template=template)

In [18]:
# Hugging Face Hub repository of the hosted model to query. For other options see
# https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads
repo_id = "google/flan-t5-base"

In [None]:
# Wrap the Hub-hosted model selected by `repo_id` as a LangChain LLM.
llm = HuggingFaceHub(repo_id=repo_id)

# Chain the step-by-step prompt with the LLM; `llm_kwargs` carries the
# generation settings passed along with each call.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    llm_kwargs={"temperature": 0, "max_length": 512},
)

# Run the chain on the sample question and show the model's answer.
cot_answer = llm_chain.run(question)
print(cot_answer)

In [22]:
# Imports for this step are grouped at the top of the cell. PyPDFLoader is
# taken from `langchain_community`, matching the other integrations used in
# this notebook (the `langchain.document_loaders` path is deprecated).
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDF, producing one document per page.
# NOTE: PyPDFLoader needs the optional `pypdf` package; without it this cell
# fails with "ImportError: pypdf package not found" (`pip install pypdf`).
loader = PyPDFLoader("../data/baichuan.pdf")
pages = loader.load()

# Split the text into chunks of at most 300 characters, with a 50-character
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# Only the first four pages are chunked and used downstream.
docs = text_splitter.split_documents(pages[:4])

ImportError: pypdf package not found, please install it with `pip install pypdf`

In [None]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS


# Embedding client backed by the Hugging Face Inference API, authenticated
# with the token read from the environment.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    api_key=HUGGINGFACEHUB_API_TOKEN,
)

# Embed the PDF chunks and index them in a FAISS vector store.
db = FAISS.from_documents(documents=docs, embedding=embeddings)

# Retrieve the three chunks most similar to the question.
query = "How large is the baichuan2 vocabulary size?"
result_simi = db.similarity_search(query=query, k=3)

In [None]:
# Merge the text of the retrieved chunks into one newline-separated context string.
source_knowledge = "\n".join(chunk.page_content for chunk in result_simi)

In [None]:
# Retrieval-augmented prompt template. {source_knowledge} and {query} are left
# as placeholders to be filled in later (this is a plain string, not an f-string).
augmented_prompt = (
    "Using the contexts below, answer the query.\n"
    "\n"
    "contexts:\n"
    "{source_knowledge}\n"
    "\n"
    "query: {query}"
)

In [None]:
# Build the retrieval-augmented prompt and chain.
# NOTE: this rebinds `prompt` and `llm_chain`, replacing the earlier
# step-by-step versions for any cell executed after this one.
prompt = PromptTemplate(
    input_variables=["source_knowledge", "query"],
    template=augmented_prompt,
)

llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    llm_kwargs={"temperature": 0, "max_length": 1024},
)

# Fill both template slots and print the model's answer.
rag_inputs = {"source_knowledge": source_knowledge, "query": query}
print(llm_chain.run(rag_inputs))

In [None]:
# Fully rendered variant of the retrieval prompt: the retrieved context and
# the query are interpolated immediately instead of being left as placeholders.
augmented_prompt_2 = (
    "Using the contexts below, answer the query.\n"
    "\n"
    "contexts:\n"
    f"{source_knowledge}\n"
    "\n"
    f"query: {query}"
)

In [None]:
# Show the fully rendered prompt so the interpolated context can be inspected.
print(augmented_prompt_2)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

In [None]:
import torch
from modelscope import snapshot_download, Model
# Fetch the Baichuan2-7B-Chat snapshot via ModelScope and get its local directory.
model_dir = snapshot_download("baichuan-inc/Baichuan2-7B-Chat", revision='master')

# Load the model in float16 with automatic device placement.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository run locally - only use with a trusted source.
model = Model.from_pretrained(
    model_dir,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Smoke test: send a single user message and print the reply.
messages = [{"role": "user", "content": "讲解一下“温故而知新”"}]
response = model(messages)
print(response)

In [None]:
# Hard-coded retrieval-augmented prompt with the same layout as
# `augmented_prompt`: an instruction line, a "contexts:" section, and the query.
# The context text is presumably pasted from the earlier retrieval output
# (PDF extraction artifacts such as "V ocab" are kept verbatim) - verify if
# the source PDF or the chunking settings change.
content = '''Using the contexts below, answer the query.

contexts:
have taken both these aspects into account. We
have expanded the vocabulary size from 64,000
in Baichuan 1 to 125,696, aiming to strike a
balance between computational efficiency and
model performance.
Tokenizer V ocab Size Compression Rate ↓
LLaMA 2 32,000 1.037
Bloom 250,680 0.501
improve after training on more than 2.6 trillion
tokens. By sharing these intermediary results,
we hope to provide the community with greater
insight into the training dynamics of Baichuan 2.
Understanding these dynamics is key to unraveling
the inner working mechanism of large language
Baichuan 2: Open Large-scale Language Models
Aiyuan Yang, Bin Xiao, Bingning Wang, Borong Zhang, Chao Yin, Chenxu Lv, Da Pan
Dian Wang, Dong Yan, Fan Yang, Fei Deng, Feng Wang, Feng Liu, Guangwei Ai
Guosheng Dong, Haizhou Zhao, Hang Xu, Haoze Sun, Hongda Zhang, Hui Liu, Jiaming Ji

query: How large is the baichuan2 vocabulary size?
'''

In [None]:
# Send the hard-coded retrieval prompt to the locally loaded chat model as a
# single user turn, then print the reply.
messages = [{"role": "user", "content": content}]
response = model(messages)
print(response)