### References
1. [Run Falcon on Smaller Machines](https://vilsonrodrigues.medium.com/run-your-private-llm-falcon-7b-instruct-with-less-than-6gb-of-gpu-using-4-bit-quantization-ff1d4ffbabcc)


### Prompt Generation

In [1]:
# Role part of the prompt: the persona the model is asked to adopt.
system_role_prompt = (
    "Act as a Lawyer drafting European Legislative documents "
    "to be published on Eur-Lex website."
)

In [2]:
# Instruction part of the prompt. `{term}` and `{sentences}` are placeholders
# that are filled in when the prompt template is rendered.
# The backslash is escaped so the prompt names the newline character as the two
# characters "\n" rather than embedding a real line break in mid-sentence.
system_instruction_prompt = "Define the term: {term}, based on the sentences provided between the triple dashes where different sentences are splitted by new line character \\n. ---{sentences}---"

In [3]:
# Context part of the prompt: constrains the length and focus of the definition.
system_context_prompt = (
    "Provide a clear and concise definition strictly within 35 to 40 words "
    "that accurately conveys its meaning within the context of the sentences "
    "provided between the triple dashes."
)

In [4]:
# Output part of the prompt: asks for a JSON object with `term` and `definition`.
system_output_prompt = (
    "Give your output in JSON format with following keys: [term, definition] "
    "and definition must be strictly in the format \"'term' means\". "
    "Just return the JSON, do not add ANYTHING, NO INTERPRETATION!"
)

In [5]:
# Generate term definition
# Assemble the full prompt text from the four prompt parts defined above.
# The f-string substitutes the four variables immediately; the `{term}` and
# `{sentences}` markers that arrive via system_instruction_prompt are not
# re-evaluated, so they stay in `template` as placeholders for later filling.
# Each `\n` escape is followed by a real line break, and the indentation inside
# the triple quotes is part of the resulting string.
template = f"""
          {system_role_prompt}\n
          {system_instruction_prompt}\n
          {system_context_prompt}\n
          {system_output_prompt}
          """

In [6]:
# Example term whose definition is requested from the model.
term = "energy infrastructure bottleneck"
# Passage used as the `sentences` input of the prompt; it contains the term in
# its plural form ("energy infrastructure bottlenecks").
sentences = """The following specific criteria shall apply to projects of common interest falling within specific energy infrastructure categories: (a) for electricity transmission, distribution and storage projects falling under the energy infrastructure categories set out in point (1)(a), (b), (c), (d) and (f) of Annex II, the project contributes significantly to sustainability through the integration of renewable energy into the grid, the transmission or distribution of renewable generation to major consumption centres and storage sites, and to reducing energy curtailment, where applicable, and contributes to at least one of the following specific criteria:(i)market integration, including through lifting the energy isolation of at least one Member State and reducing energy infrastructure bottlenecks, competition, interoperability and system flexibility;(ii)security of supply, including through interoperability, system flexibility, cybersecurity, appropriate connections and secure and reliable system operation;"""

In [7]:
# Common required libraries
!pip install -q transformers einops accelerate langchain bitsandbytes peft safetensors

In [8]:
import torch
from transformers import BitsAndBytesConfig

# bitsandbytes settings passed to the model loader below.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit loading
    bnb_4bit_quant_type="nf4",             # "nf4" quantization type
    bnb_4bit_use_double_quant=True,        # double quantization enabled
    bnb_4bit_compute_dtype=torch.float16,  # compute dtype is float16
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint to load. Alternative kept for reference: "tiiuae/falcon-7b-instruct".
model_id = "mrm8488/falcoder-7b"

# Load the causal-LM weights using the quantization settings defined earlier.
# NOTE(review): trust_remote_code=True lets transformers execute code shipped
# in the model repository; only use it with checkpoints you trust.
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Text-generation pipeline built on the quantized model and its tokenizer.
# NOTE(review): the variable name is spelled `flacon_pipeline` (sic).
flacon_pipeline = pipeline(
    task="text-generation",
    # Model, tokenizer and placement.
    model=model_4bit,
    tokenizer=tokenizer,
    device_map="auto",
    # Generation settings.
    use_cache=True,
    do_sample=True,
    top_k=10,
    max_length=1000,
    num_return_sequences=1,
    # The end-of-sequence token id is also used as the padding token id.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

### Falcon

In [None]:
# from langchain import HuggingFacePipeline # Use the pipeline
# from transformers import AutoTokenizer, pipeline
# import torch

# from transformers import AutoModelForCausalLM

# model_id="tiiuae/falcon-7b-instruct" #tiiuae/falcon-40b-instruct
# tokenizer=AutoTokenizer.from_pretrained(model_id)
# model=AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# # model = "mrm8488/falcoder-7b"

# falcon_pipeline = pipeline(
#     "text-generation", #task
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch.bfloat16,
#     trust_remote_code=True, # since the Falcon model is not a part of transformers
#     device_map="auto",
#     max_length=1000,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     eos_token_id=tokenizer.eos_token_id
# )

In [None]:
# The only other import of this class in the notebook is commented out, so
# import it here to keep this cell runnable on its own.
from langchain import HuggingFacePipeline

# Wrap the pipeline *object* built earlier (`flacon_pipeline`, sic) for use
# with LangChain. The bare name `pipeline` is the transformers factory
# function, not a constructed pipeline, so it must not be passed here.
llm = HuggingFacePipeline(pipeline=flacon_pipeline, model_kwargs={'temperature': 0})

In [None]:
from langchain import LLMChain, PromptTemplate

# Prompt with two fill-in variables, `term` and `sentences`, taken from `template`.
prompt = PromptTemplate(
    input_variables=["term", "sentences"],
    template=template,
)

# Chain that renders the prompt and sends it to the wrapped LLM.
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
# Run the chain for the example term and passage and keep the generated text.
falcon_response = llm_chain.run(term=term, sentences=sentences)

In [None]:
# Collects one record per term, keyed by the term string.
response_output = {}

# Placeholder definitions for models that were not run in this session.
# `globals().get` keeps a response that an earlier cell already produced
# (for example `falcon_response` from the chain run) instead of blanking it,
# and gives `llama_response` a value so building the record cannot raise
# NameError.
llama_response = globals().get("llama_response", " ")
falcon_response = globals().get("falcon_response", " ")
chatgpt_response = globals().get("chatgpt_response", " ")

In [None]:
# Store the inputs together with each model's definition under the term.
# Requires `llama_response`, `falcon_response` and `chatgpt_response` to be
# defined before this cell runs.
response_output[term] = dict(
    term=term,
    sentences=sentences,
    llama_generated_definition=llama_response,
    falcon_generated_definition=falcon_response,
    openai_generated_definition=chatgpt_response,
)