In [None]:
!nvidia-smi

In [1]:
import asyncio

In [2]:
import torch
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, TextStreamer, pipeline , GenerationConfig
from deep_translator import GoogleTranslator
from auto_gptq import AutoGPTQForCausalLM
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(DEVICE)

cuda:0


# Data

In [4]:
loader = PyPDFDirectoryLoader("pdfs")
docs = loader.load()
len(docs)

118

In [5]:
embeddings = HuggingFaceInstructEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        model_kwargs={"device": DEVICE},
    )

load INSTRUCTOR_Transformer
max_seq_length  512


In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
len(texts)

143

In [7]:
%%time
db = Chroma.from_documents(texts, embeddings, persist_directory="db")

CPU times: total: 8.8 s
Wall time: 10.4 s


In [8]:
model_name_or_path = "TheBloke/Llama-2-7B-chat-GPTQ"
model_basename = "model"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [None]:
# model_id = "TheBloke/Llama-2-7B-chat-GPTQ"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# quantization_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer , use_cuda_fp16 = True)

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)

In [9]:
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    # revision="gptq-4bit-128g-actorder_True",
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    inject_fused_attention=False,
    device=DEVICE,
    quantize_config=None,
)

Skipping module injection for FusedLlamaMLPForQuantizedModel as currently not supported with use_triton=False.


In [74]:
# DEFAULT_SYSTEM_PROMPT = """
# You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should be based on supported books in your knowledge base. Please ensure that your responses are socially unbiased and positive in nature.

# If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
# """.strip()


# def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
#     return f"""
# [INST] <>
# {system_prompt}
# <>

# {prompt} [/INST]
# """.strip()

In [10]:
DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
[INST] <>
{system_prompt}
<>

{prompt} [/INST]
""".strip()

In [11]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [12]:
# Avoid warning 
generation_config = GenerationConfig.from_pretrained(model_name_or_path)

In [13]:
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    temperature=0.7,
    # top_p=0.95,
    repetition_penalty=1.15,
    do_sample=True,
    streamer=streamer,
    generation_config=generation_config,
)

The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

In [14]:
llm = HuggingFacePipeline(pipeline=text_pipeline)#, model_kwargs={"temperature": 0}

In [22]:
# SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."
SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, and try to make up an answer."
template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

In [23]:
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [24]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler  # for streaming response
from langchain.callbacks.manager import CallbackManager

In [25]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [26]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
    callbacks=callback_manager,
)

In [27]:
while True:
    query = input("\nEnter a query: ")
    if query == "exit":
        break
    # Get the answer from the chain
    query = GoogleTranslator(source = 'auto' , target = 'en').translate(query)
    res = qa_chain(query)
    answer, docs = res["result"], res["source_documents"]

    # Print the result
    print("\n\n> Question:")
    print(query)
    print("\n> Answer:")
    print(GoogleTranslator(source = 'auto' , target = 'ar').translate(answer))


Enter a query:  the cup we can drink in it . rate my answer according to context 'Stanford-Binet Intelligence Scale' from 1 to 10


 Based on the provided context, I cannot accurately estimate the level of support for the question "the cup we can drink in it" as it is not relevant to the given information. The context provides information about a child's communication skills, interactive social skills, and problem-solving abilities, but does not provide any information about cups or drinking. Therefore, I cannot provide a rating for this question based on the Stanford-Binet Intelligence Scale.


> Question:
the cup we can drink in it . rate my answer according to context 'Stanford-Binet Intelligence Scale' from 1 to 10

> Answer:
بناء على السياق المقدم، لا أستطيع تقدير مستوى التأييد لسؤال "الكوب الذي نشرب فيه" بشكل دقيق لأنه لا علاقة له بالمعلومات المقدمة. يوفر السياق معلومات حول مهارات التواصل لدى الطفل، والمهارات الاجتماعية التفاعلية، وقدراته على حل المشكلات، لكنه لا يقدم أي معلومات حول الأكواب أو الشرب. لذلك، لا يمكنني تقديم تقييم لهذا السؤال بناءً على مقياس ستانفورد بينيه للذكاء.


KeyboardInterrupt: Interrupted by user

In [29]:
output = ''
target_lang = 'en'

In [34]:
question = "من هو مؤلف كتاب مقياس التأخر النمائى"

In [35]:
input_text = GoogleTranslator(source = 'auto' , target = 'en').translate(question)

In [37]:
result = qa_chain(input_text)

 Based on the provided context, the author of the book "The Developmental Delay Scale" is Dr. Abdel Mawjoud Abdel Samie.


In [38]:
print(GoogleTranslator(source = 'auto' , target = 'ar').translate(result['result']))

وبناء على السياق المذكور فإن مؤلف كتاب "مقياس التأخر النمائي" هو الدكتور عبد الموجود عبد السميع.


In [None]:
len(result['result'])

In [None]:
print(result["source_documents"][0].page_content)

In [None]:
result = qa_chain("What is the per share revenue for Tesla during 2023?")

In [None]:
print(result["source_documents"][0].page_content)