forked from langchain-ai/langchain
/
huggingface.py
164 lines (126 loc) · 5.57 KB
/
huggingface.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Wrapper around HuggingFace embedding models."""
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Extra, Field
from langchain.embeddings.base import Embeddings
DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: "
DEFAULT_QUERY_INSTRUCTION = (
"Represent the question for retrieving supporting documents: "
)
class HuggingFaceEmbeddings(BaseModel, Embeddings):
"""Wrapper around sentence_transformers embedding models.
To use, you should have the ``sentence_transformers`` python package installed.
Example:
.. code-block:: python
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
"""
client: Any #: :meta private:
model_name: str = DEFAULT_MODEL_NAME
"""Model name to use."""
cache_folder: Optional[str] = None
"""Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME enviroment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model."""
def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer."""
super().__init__(**kwargs)
try:
import sentence_transformers
except ImportError as exc:
raise ValueError(
"Could not import sentence_transformers python package. "
"Please install it with `pip install sentence_transformers`."
) from exc
self.client = sentence_transformers.SentenceTransformer(
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
def embed_documents(self, texts: List[str], **encode_kwargs) -> List[List[float]]:
"""Compute doc embeddings using a HuggingFace transformer model.
Args:
texts: The list of texts to embed.
encode_kwargs: Additional kwargs to pass into the `encode` method of the
SentenceTransformer.
Returns:
List of embeddings, one for each text.
"""
texts = list(map(lambda x: x.replace("\n", " "), texts))
embeddings = self.client.encode(texts)
return embeddings.tolist()
def embed_query(self, text: str, **encode_kwargs) -> List[float]:
"""Compute query embeddings using a HuggingFace transformer model.
Args:
text: The text to embed.
encode_kwargs: Additional kwargs to pass into the `encode` method of the
SentenceTransformer.
Returns:
Embeddings for the text.
"""
text = text.replace("\n", " ")
embedding = self.client.encode(text)
return embedding.tolist()
class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
"""Wrapper around sentence_transformers embedding models.
To use, you should have the ``sentence_transformers``
and ``InstructorEmbedding`` python package installed.
Example:
.. code-block:: python
from langchain.embeddings import HuggingFaceInstructEmbeddings
model_name = "hkunlp/instructor-large"
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceInstructEmbeddings(
model_name=model_name, model_kwargs=model_kwargs
)
"""
client: Any #: :meta private:
model_name: str = DEFAULT_INSTRUCT_MODEL
"""Model name to use."""
cache_folder: Optional[str] = None
"""Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME enviroment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model."""
embed_instruction: str = DEFAULT_EMBED_INSTRUCTION
"""Instruction to use for embedding documents."""
query_instruction: str = DEFAULT_QUERY_INSTRUCTION
"""Instruction to use for embedding query."""
def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer."""
super().__init__(**kwargs)
try:
from InstructorEmbedding import INSTRUCTOR
self.client = INSTRUCTOR(
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)
except ImportError as e:
raise ValueError("Dependencies for InstructorEmbedding not found.") from e
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Compute doc embeddings using a HuggingFace instruct model.
Args:
texts: The list of texts to embed.
Returns:
List of embeddings, one for each text.
"""
instruction_pairs = [[self.embed_instruction, text] for text in texts]
embeddings = self.client.encode(instruction_pairs)
return embeddings.tolist()
def embed_query(self, text: str) -> List[float]:
"""Compute query embeddings using a HuggingFace instruct model.
Args:
text: The text to embed.
Returns:
Embeddings for the text.
"""
instruction_pair = [self.query_instruction, text]
embedding = self.client.encode([instruction_pair])[0]
return embedding.tolist()