
fix(serialisation): vllm ignore #324

Merged · 4 commits · Sep 12, 2023
3 changes: 3 additions & 0 deletions changelog.d/324.fix.md
@@ -0,0 +1,3 @@
+vLLM now supports the safetensors loading format, so `--serialisation` is now agnostic of the backend
+
+Removed some legacy checks and default behaviour
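For context on the premise (not part of this PR's diff): recent vLLM releases can load safetensors checkpoints directly, which is why OpenLLM no longer needs to force the legacy format for the vLLM backend. A minimal sketch, assuming `vllm>=0.1.6` (as pinned below) and that its `load_format` argument accepts `'safetensors'`:

```python
# Illustrative sketch only. Assumptions: vllm>=0.1.6, the model repo ships
# .safetensors weights, and load_format='safetensors' is available.
from vllm import LLM, SamplingParams

engine = LLM(model='facebook/opt-125m', load_format='safetensors')
out = engine.generate(['Hello, my name is'], SamplingParams(max_tokens=32))
print(out[0].outputs[0].text)
```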
2 changes: 1 addition & 1 deletion openllm-python/pyproject.toml
@@ -115,7 +115,7 @@ openai = ["openai", "tiktoken"]
opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
starcoder = ["bitsandbytes"]
vllm = ["vllm>=0.1.4", "ray"]
vllm = ["vllm>=0.1.6", "ray"]

[tool.hatch.version]
fallback-version = "0.0.0"
18 changes: 13 additions & 5 deletions openllm-python/src/openllm/_llm.py
@@ -281,8 +281,8 @@ class LLM(LLMInterface[M, T], ReprMixin):

def __attrs_init__(self,
config: LLMConfig,
-quantize: t.Optional[LiteralQuantise],
quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig]],
+quantize: t.Optional[LiteralQuantise],
model_id: str,
model_decls: TupleAny,
model_attrs: DictStrAny,
@@ -446,8 +446,6 @@ def from_pretrained(cls,
# in case users input `tokenizer` to __init__, default to the _model_id
if quantize == 'gptq': attrs.setdefault('tokenizer', _model_id)
quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
-if quantize == 'gptq': serialisation = 'safetensors'
-elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress

# NOTE: LoRA adapter setup
if adapter_map and adapter_id:
@@ -534,12 +532,12 @@ def __init__(self,
model_id: str,
llm_config: LLMConfig,
quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None,
-_adapters_mapping: AdaptersMapping | None,
-_tag: bentoml.Tag,
_quantize: LiteralQuantise | None,
_model_version: str,
+_tag: bentoml.Tag,
_serialisation: t.Literal['safetensors', 'legacy'],
_local: bool,
+_adapters_mapping: AdaptersMapping | None,
**attrs: t.Any,
):
'''Initialize the LLM with given pretrained model.
@@ -941,6 +939,16 @@ def __call__(self, prompt: str, **attrs: t.Any) -> t.Any:
prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **attrs)
return self.postprocess_generate(prompt, self.generate(prompt, **generate_kwargs), **postprocess_kwargs)

+def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
+stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
+result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+# Inference API returns the stop sequence
+for stop_seq in stop:
+if result.endswith(stop_seq): result = result[:-len(stop_seq)]
+return [{'generated_text': result}]
+
def generate(self, prompt: str, **attrs: t.Any) -> t.List[t.Any]:
# TODO: support different generation strategies, similar to self.model.generate
for it in self.generate_iterator(prompt, **attrs):
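The trailing stop-sequence handling in the hoisted `generate_one` above is easiest to see in isolation; a standalone sketch (not code from this PR) of the same trimming loop:

```python
def trim_stop_sequences(text: str, stop: list[str]) -> str:
  # Mirrors the final loop of generate_one: if the decoded text ends with one
  # of the stop strings, drop that suffix before returning the generated text.
  for stop_seq in stop:
    if text.endswith(stop_seq):
      text = text[:-len(stop_seq)]
  return text

# The Inference-API-style output keeps everything up to, but not including, the stop string.
assert trim_stop_sequences('1 + 1 = 2\n###', ['###', '</s>']) == '1 + 1 = 2\n'
```

Moving this into the base `LLM` class lets the PR delete the three identical copies that the per-model files below used to carry.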
10 changes: 0 additions & 10 deletions openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -20,13 +20,3 @@ def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
attention_mask=inputs['attention_mask'],
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
skip_special_tokens=True)

-def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
-src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
-stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
-# Inference API returns the stop sequence
-for stop_seq in stop:
-if result.endswith(stop_seq): result = result[:-len(stop_seq)]
-return [{'generated_text': result}]
10 changes: 0 additions & 10 deletions openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -24,13 +24,3 @@ def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
masked_embeddings = data * mask
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item()))

-def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
-src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
-stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
-# Inference API returns the stop sequence
-for stop_seq in stop:
-if result.endswith(stop_seq): result = result[:-len(stop_seq)]
-return [{'generated_text': result}]
Original file line number Diff line number Diff line change
@@ -44,13 +44,3 @@ def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

-def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
-src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
-stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
-# Inference API returns the stop sequence
-for stop_seq in stop:
-if result.endswith(stop_seq): result = result[:-len(stop_seq)]
-return [{'generated_text': result}]
Original file line number Diff line number Diff line change
@@ -66,8 +66,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
_, tokenizer_attrs = llm.llm_parameters
quantize = llm._quantize
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors')
-# Disable safe serialization with vLLM
-if llm.__llm_backend__ == 'vllm': safe_serialisation = False
metadata: DictStrAny = {'safe_serialisation': safe_serialisation}
if quantize: metadata['_quantize'] = quantize
architectures = getattr(config, 'architectures', [])
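With the vLLM special case removed, `safe_serialisation` follows `--serialisation` alone. A minimal sketch of what that flag controls downstream, assuming it is ultimately forwarded to transformers' `save_pretrained` (and using `gpt2` only as a small stand-in model):

```python
from transformers import AutoModelForCausalLM

serialisation = 'safetensors'                        # or 'legacy'
safe_serialisation = serialisation == 'safetensors'  # no backend-specific override anymore

model = AutoModelForCausalLM.from_pretrained('gpt2')
# safe_serialization=True writes model.safetensors; False writes pytorch_model.bin
model.save_pretrained('/tmp/gpt2-export', safe_serialization=safe_serialisation)
```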
Original file line number Diff line number Diff line change
@@ -24,17 +24,21 @@ class HfIgnore:

@classmethod
def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
-if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors, cls.gguf]
+if llm.__llm_backend__ == 'vllm':
+base = [cls.tf, cls.flax, cls.gguf]
+if has_safetensors_weights(llm.model_id) or llm._serialisation == 'safetensors': base.append(cls.pt)
+else: base.append(cls.safetensors)
elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt, cls.gguf]
elif llm.__llm_backend__ == 'flax':
base = [cls.tf, cls.pt, cls.safetensors, cls.gguf] # as of current, safetensors is not supported with flax
elif llm.__llm_backend__ == 'pt':
base = [cls.tf, cls.flax, cls.gguf]
-if has_safetensors_weights(llm.model_id): base.append(cls.pt)
+if has_safetensors_weights(llm.model_id) or llm._serialisation == 'safetensors': base.append(cls.pt)
else: base.append(cls.safetensors)
elif llm.__llm_backend__ == 'ggml':
base = [cls.tf, cls.flax, cls.pt, cls.safetensors]
else:
raise ValueError('Unknown backend (should never happen at all.)')
# filter out these files, since we probably don't need them for now.
-base.extend(['*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
+base.extend(['*.pdf', '*.md', '.gitattributes', 'LICENSE.txt', 'Notice'])
return base
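The returned patterns are the kind of globs huggingface_hub's downloader understands. A rough sketch of the new vLLM branch's effect, with the assumptions called out: the concrete glob strings behind `cls.tf`/`cls.flax`/`cls.pt`/`cls.safetensors`/`cls.gguf`, the idea that `has_safetensors_weights` amounts to listing repo files, and that OpenLLM hands the list to `snapshot_download`:

```python
from huggingface_hub import list_repo_files, snapshot_download

def has_safetensors_weights(model_id: str) -> bool:
  # Hypothetical stand-in for OpenLLM's helper of the same name.
  return any(f.endswith('.safetensors') for f in list_repo_files(model_id))

model_id, serialisation = 'facebook/opt-125m', 'safetensors'
ignore = ['*.h5', '*.msgpack', '*.gguf']          # assumed globs for tf, flax, gguf weights
if has_safetensors_weights(model_id) or serialisation == 'safetensors':
  ignore.append('*.bin')                          # keep safetensors, skip pickled torch weights
else:
  ignore.append('*.safetensors')
ignore.extend(['*.pdf', '*.md', '.gitattributes', 'LICENSE.txt', 'Notice'])

snapshot_download(model_id, ignore_patterns=ignore)
```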
4 changes: 2 additions & 2 deletions openllm-python/src/openllm/utils/dummy_pt_objects.py

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions openllm-python/src/openllm/utils/dummy_vllm_objects.py

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion tools/dependencies.py
@@ -45,7 +45,7 @@ def create_classifier(identifier: str, *decls: t.Any) -> str:
def create_python_classifier(implementation: list[str] | None = None, supported_version: list[str] | None = None) -> list[str]:
if supported_version is None: supported_version = ['3.8', '3.9', '3.10', '3.11', '3.12']
if implementation is None: implementation = ['CPython', 'PyPy']
-base = [Classifier.create_classifier('language', 'Python'), Classifier.create_classifier('language', 'Python', '3'),]
+base = [Classifier.create_classifier('language', 'Python'), Classifier.create_classifier('language', 'Python', '3')]
base.append(Classifier.create_classifier('language', 'Python', '3', 'Only'))
base.extend([Classifier.create_classifier('language', 'Python', version) for version in supported_version])
base.extend([Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation])