In [1]:
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    "llm-jp/llm-jp-13b-instruct-full-jaster-v1.0",
)
model = AutoModelForCausalLM.from_pretrained(
    "llm-jp/llm-jp-13b-instruct-full-jaster-v1.0",
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# パイプラインの準備
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256
)
# llm取得
llm = HuggingFacePipeline(pipeline=pipe)

In [4]:
# 埋め込みモデルの準備
from langchain.embeddings import HuggingFaceEmbeddings
# from llama_index import LangchainEmbedding
from llama_index.embeddings.langchain import LangchainEmbedding
from typing import Any, List

# 埋め込みクラスにqueryを付加
class HuggingFaceQueryEmbeddings(HuggingFaceEmbeddings):
    def __init__(self, **kwargs: Any):
        super().__init__(**kwargs)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return super().embed_documents(["query: " + text for text in texts])

    def embed_query(self, text: str) -> List[float]:
        return super().embed_query("query: " + text)

# 埋め込みモデルの準備
embed_model = LangchainEmbedding(
    HuggingFaceQueryEmbeddings(model_name="intfloat/multilingual-e5-large")
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: intfloat/multilingual-e5-large
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda


In [11]:
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_properties(0).total_memory / (1024*1024*1024))

t = torch.cuda.get_device_properties(0).total_memory / (1024*1024)
r = torch.cuda.memory_reserved(0) / (1024*1024)
a = torch.cuda.memory_allocated(0) / (1024*1024)
f = r-a  # free inside reserved

print(t)
print(r)
print(a)
print(f)

2.3.0+cu121
True
14.74810791015625
15102.0625
14926.0
14906.5732421875
19.4267578125


In [6]:
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter
import tiktoken

Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = TokenTextSplitter(separator="。 ", chunk_size=512, chunk_overlap=100, tokenizer=tiktoken.get_encoding("cl100k_base").encode)

In [7]:
from llama_index.core import SimpleDirectoryReader

# ドキュメントの読み込み
documents = SimpleDirectoryReader(
    input_dir="data"
).load_data()

In [8]:
from llama_index.core import VectorStoreIndex

# インデックスの作成
index = VectorStoreIndex.from_documents(documents)

OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 