In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# hugging face model repo
MODEL_REPO = "lmsys/vicuna-7b-v1.5"
# cuda device
CUDA_DEVICE = "cuda:0"
# local cache dir
MODEL_CACHE_DIR = Path("./tmp/models/")
# create cache dir
MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
# max token length
MAX_LENGTH = 2048

# 1. Huggingface

In [3]:
# load model, if not exist, download from model repo
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO, cache_dir=MODEL_CACHE_DIR)
    model = AutoModelForCausalLM.from_pretrained(MODEL_REPO, cache_dir=MODEL_CACHE_DIR, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    model.to(CUDA_DEVICE)
    model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [5]:
query = "What is Ghent University?"

In [6]:
# This code is used to generate text based on an initial input query using a language model.

# Disable gradient computation. This is done to save memory and speed as we only want inference (no backpropagation).
with torch.no_grad():

    # Get the length of the initial input query.
    l_prompt = len(query)

    # Set the temperature for sampling. A lower value makes the output more deterministic.
    temperature = 0.0

    # Set the maximum number of tokens the model can generate in one forward pass.
    max_new_tokens = MAX_LENGTH

    # Tokenize the input query to get the input IDs (numeric representation of the input text).
    input_ids = tokenizer(query).input_ids

    # Initialize the list of output IDs with the input IDs.
    output_ids = list(input_ids)

    # Calculate the maximum source length to ensure the input doesn't exceed the model's max sequence length.
    max_src_len = 4096 - max_new_tokens - 8
    input_ids = input_ids[-max_src_len:]

    # Loop over the maximum number of tokens to generate.
    for i in range(max_new_tokens):

        # If it's the first token to be generated.
        if i == 0:
            # Pass the input IDs to the model to get logits and past key values.
            out = model(
                torch.as_tensor([input_ids]).to(CUDA_DEVICE), use_cache=True)
            logits = out.logits
            past_key_values = out.past_key_values

        # For subsequent tokens.
        else:
            # Create an attention mask to pay attention to the current tokens.
            attention_mask = torch.ones(
                1, past_key_values[0][0].shape[-2] + 1, device=CUDA_DEVICE)
            
            # Pass the last generated token and past key values to the model.
            out = model(input_ids=torch.as_tensor([[token]], device=CUDA_DEVICE),
                        use_cache=True,
                        attention_mask=attention_mask,
                        past_key_values=past_key_values)
            logits = out.logits
            past_key_values = out.past_key_values

        # Extract the logits for the last token.
        last_token_logits = logits[0][-1]

        # If temperature is very low, pick the token with the highest logit directly.
        if temperature < 1e-4:
            token = int(torch.argmax(last_token_logits))
        # Otherwise, use temperature sampling.
        else:
            probs = torch.softmax(last_token_logits / temperature, dim=-1)
            token = int(torch.multinomial(probs, num_samples=1))

        # Append the generated token to the output IDs.
        output_ids.append(token)

        # Check if the generated token is the end-of-sequence token.
        if token == tokenizer.eos_token_id:
            stopped = True
        else:
            stopped = False

        # Decode the output IDs to get the generated text.
        output = tokenizer.decode(output_ids, skip_special_tokens=True)
        print(f"Iteration {i}:\n{output}")

        # If the generation should stop, break out of the loop.
        if stopped:
            break

Iteration 0:
What is Ghent University?

Iteration 1:
What is Ghent University?
 nobody
Iteration 2:
What is Ghent University?
 nobody knows
Iteration 3:
What is Ghent University?
 nobody knows


### Query use model's generate method
understand the parameters: https://huggingface.co/blog/how-to-generate

In [7]:
with torch.no_grad():
    inputs = tokenizer(query, padding=False, add_special_tokens=False, return_tensors="pt").to(CUDA_DEVICE)    
    output_sequences = model.generate(input_ids=inputs["input_ids"], max_length=MAX_LENGTH,  do_sample=True, temperature=0.9, top_p=0.6)
    text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    print(text)

What is Ghent University?
Ghent University is a public research university located in Ghent, Belgium. It was founded in 1817 and is one of the oldest and most prestigious universities in the country. The university has a strong focus on research and has made significant contributions to a wide range of fields, including medicine, engineering, social sciences, and humanities.

Ghent University has three campuses located in Ghent, and it offers a wide range of undergraduate and graduate programs in various fields of study. The university has a diverse student body, with students from over 120 different countries.

The university is known for its high-quality education and research, and it has a strong reputation in the academic community. Ghent University is also home to several research centers and institutes, including the Ghent Institute for Biotechnology, the Ghent Center for Conflict and Security Studies, and the Ghent Institute for International Studies.

Overall, Ghent University 

### Query use pipeline

In [8]:

chat_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=MAX_LENGTH, device=CUDA_DEVICE)
result = chat_pipeline(query, do_sample=True, temperature=0.9, top_p=0.6)
text = result[0]["generated_text"]
print(text)

What is Ghent University?
 Ghent University is a public research university located in Ghent, Belgium. It was founded in 1817 and is one of the oldest and most prestigious universities in Belgium. The university has 11 faculties and offers a wide range of undergraduate and graduate programs in various fields such as science, engineering, medicine, law, economics, and social sciences. Ghent University is known for its high-quality education, innovative research, and strong international collaboration. It has a diverse student body and a faculty of over 6,000 employees. The university is also home to several research institutes and centers, including the Ghent Institute for Biotechnology and the Ghent Center for Conflict and Development Studies.


# 2. Langchain

In [9]:
from typing import Any, List, Mapping, Optional

from langchain.llms.base import LLM
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

### Wrap the LLM into a LangChain compatible model

In [10]:
# https://python.langchain.com/docs/modules/model_io/models/llms/custom_llm

class CustomLLM(LLM):
    model_name = MODEL_REPO    
    
    def __init__(self, *args, **kwargs):        
        super().__init__(*args, **kwargs)
        object.__setattr__(self, 'model_pipeline', chat_pipeline)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
    ) -> str:        
        result = self.model_pipeline(prompt, do_sample=True, temperature=0.9, top_p=0.6)
        text = result[0]["generated_text"]
        return text

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"model_name": self.model_name}
    
    @property
    def _llm_type(self) -> str:
        return "custom"    
llm = CustomLLM()    

### Example query

In [11]:
query = [
    "Please list three facts about Ghent university.", 
    "Please repeat the second fact."
]

### query without memory

In [12]:
with warnings.catch_warnings():
    # avoid the sequential using pipelines warning
    warnings.simplefilter('ignore')
    for qid in range(2):
        print(f"Query {qid}: {query[qid]}")
        response = llm(query[qid])
        print(f"Response: {response}")
        print("\n")

Query 0: Please list three facts about Ghent university.
Response: Please list three facts about Ghent university. (1) Ghent University is a public research university located in Ghent, Belgium. (2) It was founded in 1905 and is one of the oldest universities in Belgium. (3) The university has a strong focus on research and has a number of research centers and institutes, including the Ghent Institute for Biotechnology and the Ghent Center for Conflict and Security Studies.


Query 1: Please repeat the second fact.
Response: Please repeat the second fact.1. The sun is the star at the center of the solar system.2. The sun is the star at the center of the solar system.




### query with memory

In [13]:
# Notice that "chat_history" is present in the prompt template
template = """You are a nice chatbot having a conversation with a human.

Previous conversation:
{chat_history}

New human query: {question}
Response:"""
prompt = PromptTemplate.from_template(template)

In [14]:
# Notice that we need to align the `memory_key`
memory = ConversationBufferMemory(memory_key="chat_history")
conversation = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=False,
    memory=memory
)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for qid in range(2):
        print(f"Query {qid}: {query[qid]}")
        result = conversation({"question": query[qid]})["text"]
        print(f"Response: {result.split('Response:')[-1].strip()}")
        print("\n")

Query 0: Please list three facts about Ghent university.
Response: 1. Ghent University is located in Ghent, Belgium.
2. It was founded in 1817 and is one of the oldest universities in Belgium.
3. Ghent University has a strong focus on research and innovation, with over 150 research groups and a wide range of collaborations with industry and other institutions.


Query 1: Please repeat the second fact.
Response: 2. Ghent University was founded in 1817 and is one of the oldest universities in Belgium.




# 3. LlamaIndex

In [15]:
from pathlib import Path
from typing import Any

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext, LangchainEmbedding, VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms import CustomLLM, CompletionResponse, CompletionResponseGen, LLMMetadata
from llama_index.llms.base import llm_completion_callback

In [16]:
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"
CONTEXT_WINDOW = 4096
NUM_OUTPUT = 2048
DEFAULT_BATCH_SIZE = 512

data_folder = Path("./tmp/examples/llama_index/data")
Path(data_folder).mkdir(parents=True, exist_ok=True)

# download data if not yet downloaded
# ! wget https://raw.githubusercontent.com/run-llama/llama_index/main/examples/paul_graham_essay/data/paul_graham_essay.txt -P ./tmp/examples/llama_index/data

### Wrap the Langchain LLM into a llama_index LLM

In [17]:
class LlamaIndexLLM(CustomLLM):
    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=CONTEXT_WINDOW,
            num_output=NUM_OUTPUT,
            model_name=MODEL_REPO
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        prompt_length = len(prompt)             
        response = chat_pipeline(prompt, do_sample=True, temperature=0.9, top_p=0.6, max_length=CONTEXT_WINDOW)[0]["generated_text"]
        # only return newly generated tokens
        text = response[prompt_length:]
        return CompletionResponse(text=text)
    
    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        raise NotImplementedError()
    
llm = LlamaIndexLLM()    

### Setup the index for querying

In [18]:
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': CUDA_DEVICE},        
    ),
    embed_batch_size=DEFAULT_BATCH_SIZE
)

service_context = ServiceContext.from_defaults(
    llm=llm, 
    embed_model=embed_model,
    context_window=CONTEXT_WINDOW, 
    num_output=NUM_OUTPUT,    
)
documents = SimpleDirectoryReader(data_folder).load_data()
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [21]:
import logging
import sys
logging.basicConfig(level=logging.DEBUG)

In [22]:
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
# print(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.indices.utils:> Top 2 nodes:
> [Node a5acc068-1893-4bc4-9307-7745545afeb6] [Similarity score:             0.798015] What I Worked On

February 2021

Before college the two main things I worked on, outside of schoo...
> [Node f4a2c494-1dd4-4666-a8d8-53ddd8638d6c] [Similarity score:             0.796443] Now all I had to do was learn Italian.

Only stranieri (foreigners) had to take this entrance exa...
DEBUG:llama_index.llm_predictor.base:
The author grew up writing short stories and trying to learn computer programming. He didn't start studying philosophy in college as he initially planned, but instead switched to AI. He also had an interest in painting and attended the Accademia di Belle Arti in Florence, Italy, where he painted still lives at night in his bedroom.
