In [1]:
from torch import cuda, bfloat16
import transformers

# Local checkpoint directory of the Falcon-40B-Instruct weights.
model_name = '/home/ubuntu/model/falcon-40b-instruct'

# Name of the device used for tensors created by hand in later cells:
# the current CUDA device when CUDA is available, otherwise the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Quantization settings so the large model fits in less GPU memory
# (needs the `bitsandbytes` library): 4-bit NF4 weights with double
# quantization, computing in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

# Load the quantized checkpoint; weight placement is left to
# `device_map='auto'` rather than to the `device` string above.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,
)
model.eval()  # inference mode
print(f"Model loaded on {device}")

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ubuntu/dev/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
ERROR: /home/ubuntu/dev/bin/python: undefined symbol: cudaRuntimeGetVersion
CUDA SETUP: libcudart.so path is None
CUDA SETUP: Is seems that your cuda installation is not in your path. See https://github.com/TimDettmers/bitsandbytes/issues/85 for more information.
CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 00
CUDA SETUP: Loading binary /home/ubuntu/dev/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Loading checkpoint shards: 100%|██████████| 9/9 [00:41<00:00,  4.61s/it]

Model loaded on cuda:0





In [2]:
import transformers
from transformers import StoppingCriteria, StoppingCriteriaList
import torch


# Tokenizer that matches the checkpoint loaded above.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Stop sequences: one LongTensor of token ids per chat-turn marker
# ('Human' + ':' and 'AI' + ':'), moved onto `device`.
stop_token_ids = [
    torch.LongTensor(tokenizer.convert_tokens_to_ids(turn_marker)).to(device)
    for turn_marker in (['Human', ':'], ['AI', ':'])
]

In [3]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the text ends with a stop sequence.

    The stop sequences are read from the module-level ``stop_token_ids`` list,
    which holds one 1-D LongTensor of token ids per sequence.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the first sequence in ``input_ids`` ends with a stop sequence.

        Args:
            input_ids: token ids generated so far; only row 0 is inspected.
            scores: prediction scores supplied by the generation loop (unused).
            **kwargs: extra arguments supplied by the generation loop (unused).

        Returns:
            True when the tail of row 0 equals any entry of ``stop_token_ids``,
            False otherwise.
        """
        sequence = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # Skip empty stop sequences (``[-0:]`` would select the whole
            # sequence) and sequences longer than what exists so far (the
            # comparison would otherwise broadcast or raise).
            if stop_len == 0 or sequence.shape[-1] < stop_len:
                continue
            # Compare on the device the generated ids live on; `.to` is a
            # no-op when both tensors already share a device.
            if torch.eq(sequence[-stop_len:], stop_ids.to(sequence.device)).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [4]:
# Text-generation pipeline around the quantized model; the generation
# keyword arguments given here are applied on every call.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    # Deterministic (greedy) decoding. `temperature` is only used when sampling
    # and must be strictly positive there, so the former `temperature=0.0` is
    # replaced by the explicit flag.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

The model 'RWForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'R

In [5]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be passed as the `llm` to the toolkit and agent built below.
llm = HuggingFacePipeline(pipeline=generate_text)

In [6]:
# Import os to set API key
import os
# Import OpenAI as main LLM service
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
# Bring in streamlit for UI/app interface
# Import PDF document loaders...there's other ones as well!
from langchain.document_loaders import PyPDFLoader
# Import chroma as the vector store 
from langchain.vectorstores import Chroma

# Import vector store stuff
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo
)

# Set API key for the OpenAI service
# Can sub this out for other LLM providers
# Credentials must not be hardcoded in the notebook: reuse the key from the
# environment when it is already set, otherwise prompt for it without echo.
# NOTE: a key that was ever stored in this cell in plain text should be
# treated as leaked and revoked.
import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

In [7]:
# Embedding client built with default settings.
# Presumably authenticates through the OPENAI_API_KEY environment variable — confirm.
embeddings = OpenAIEmbeddings()

In [16]:
# Sanity check: embed a short string and print the length of the returned vector.
text = "hello word"
embed = embeddings.embed_query(text)
print(len(embed))

1536

In [8]:
# PDF loader for the paper
loader = PyPDFLoader('paper.pdf')
# Load and split the document, keeping only the first two chunks
pages = loader.load_and_split()[:2]


In [12]:
# Load documents into vector database aka ChromaDB
# NOTE(review): `embeddings` was already created in an earlier cell; this
# second instantiation looks redundant — confirm before removing.
embeddings = OpenAIEmbeddings()
# Build the Chroma store from the selected chunks using the embedding client.
db5 = Chroma.from_documents(pages, embeddings)

In [13]:
# Name and description that identify the Chroma store to the toolkit.
vectorstore_info = VectorStoreInfo(
    vectorstore=db5,
    name="paper",
    description="a scietfic paper as a pdf",
)


In [14]:
# Bundle the described vector store and the LLM into a LangChain toolkit
toolkit = VectorStoreToolkit(
    llm=llm,
    vectorstore_info=vectorstore_info,
)

In [15]:
# Agent executor over the toolkit; verbose=True prints the intermediate chain trace.
agent_executor = create_vectorstore_agent(toolkit=toolkit, llm=llm, verbose=True)

In [16]:
# Ask the agent a question about the indexed paper; as the last expression of
# the cell, the returned answer string is displayed.
agent_executor.run("what does this paper propose?")

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I am going to read the paper
Action: paper
Action Input: "https://arxiv.org/abs/2001.09846"[0m

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



Observation: [36;1m[1;3m
The authors propose a new method for solar irradiance forecasting called Solar Irradiance Anticipative Transformer (SIAT). SIAT utilizes a self-attention based backbone network that creates feature representations for each frame in a sequence of all-sky images and then uses a Generative Pre-trained Transformer (GPT) to generate future irradiance values. The authors compare their model to other existing methods on three different datasets and demonstrate its effectiveness.[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: The paper proposes a new method for solar irradiance forecasting called Solar Irradiance Anticipative Transformer (SIAT).[0m

[1m> Finished chain.[0m


'The paper proposes a new method for solar irradiance forecasting called Solar Irradiance Anticipative Transformer (SIAT).'

In [None]:
import tiktoken 
# NOTE(review): tiktoken maps OpenAI model names to encodings; 'falcon' is
# presumably not a known model name, so this call likely raises KeyError —
# confirm. To count Falcon tokens, the Hugging Face `tokenizer` loaded earlier
# is probably the right tool.
tiktoken.encoding_for_model('falcon')