# Load Data

In [None]:
! pip install langchain llamaapi langchain_experimental openai faiss-cpu tiktoken awadb -q

In [15]:
from llamaapi import LlamaAPI
from langchain_experimental.llms import ChatLlamaAPI
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import AlephAlphaAsymmetricSemanticEmbedding,AlephAlphaSymmetricSemanticEmbedding,LlamaCppEmbeddings
from langchain.embeddings.awa import AwaEmbeddings



In [16]:
# Location of the source PDF inside the Kaggle input dataset.
PATH = '/kaggle/input/books-ds/blueprints-for-text-analytics-using-python-machine-learning-based-solutions-for-common-real-world-nlp-applications-149207408x-9781492074083_compress.pdf'

# Chunks of 500 characters, with 100 characters shared between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
loader = PyPDFLoader(PATH)

# Read the PDF and cut it into chunks in a single step.
data = loader.load_and_split(splitter)

# Show the first chunk as a sanity check.
data[0]

Document(page_content='Jens Albrecht,  \nSidharth Ramachandran  \n& Christian Winkler\nBlueprints  \n for Text Analytics \nUsing Python\nMachine Learning-Based Solutions for  \nCommon Real World (NLP) Applications', metadata={'source': '/kaggle/input/books-ds/blueprints-for-text-analytics-using-python-machine-learning-based-solutions-for-common-real-world-nlp-applications-149207408x-9781492074083_compress.pdf', 'page': 0})

Now that we have loaded the data and split it into chunks, we need to embed the chunks so they can be stored in a vector database.

In [None]:
! pip install openai==0.28.1

In [17]:
import os
from getpass import getpass

# Never hardcode the API key in the notebook. Reuse a key that is already in
# the environment and only prompt for one (without echoing it) when missing,
# so a configured key is not overwritten by a placeholder.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

query = "What is the content of the document?"
embeddings = OpenAIEmbeddings(show_progress_bar=True)
# Embed a sample query; `v1` is inspected in the next cell.
v1 = embeddings.embed_query(query)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Number of components in the query embedding computed above.
len(v1) 

In [18]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like
        One-dimensional vectors of equal length. Plain Python sequences are
        accepted and converted to NumPy arrays.

    Returns
    -------
    float
        The dot product of the vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the division is
        0/0 and the result is ``nan``.
    """
    # `np.asarray` is a no-op for arrays and lets callers pass lists directly
    # (e.g. the raw output of `embed_query`), which `@` does not support.
    a = np.asarray(vector1)
    b = np.asarray(vector2)
    return (a @ b) / (norm(a) * norm(b))
# Embed two related phrases (in this order) and print how similar their vectors are.
vector1, vector2 = (
    np.array(embeddings.embed_query(phrase)) for phrase in ('AI', 'Deep learning')
)
print(cosine_similarity(vector1, vector2))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0.8197981424155507


In [None]:
! pip install cohere

In [19]:
import os
from getpass import getpass

from langchain.embeddings import CohereEmbeddings

# Never hardcode the API key in the notebook. Reuse a key that is already in
# the environment and only prompt for one (without echoing it) when missing,
# so a configured key is not replaced by an empty string.
if not os.environ.get('COHERE_API_KEY'):
    os.environ['COHERE_API_KEY'] = getpass('Cohere API key: ')

# From here on `embeddings` refers to the Cohere model instead of the OpenAI one.
embeddings = CohereEmbeddings(model="embed-english-light-v3.0")

In [20]:
# Number of components in a vector produced by the current `embeddings` object.
len(embeddings.embed_query('AI'))

384

In [21]:
# Create the index in the vector database from the document chunks in `data`,
# embedding them with `embeddings`. FAISS is a local database, similar to SQLite.
index = FAISS.from_documents(data,embeddings)

In [22]:
# Retrieve the chunks closest to the query together with their relevance scores.
index.similarity_search_with_relevance_scores("NLP Pipeline")

[(Document(page_content="The function takes a spaCy Doc object (named doc) as a parameter and returns a Doc.\nTherefore, we can use it as a another pipeline component and simply add it to the\nexisting pipeline:\nnlp.add_pipe (norm_entities )\nNow we can repeat the process on the example sentences and check the result:\ndoc = nlp(text)\nprint(*[([t.text for t in e], e.label_) for e in doc.ents], sep='\\n')\nOut:\n(['Baker', 'International'], 'ORG')\n(['New', 'York', 'Stock', 'Exchange'], 'ORG')\nMerging Entity Tokens", metadata={'source': '/kaggle/input/books-ds/blueprints-for-text-analytics-using-python-machine-learning-based-solutions-for-common-real-world-nlp-applications-149207408x-9781492074083_compress.pdf', 'page': 353}),
  0.3947095102504933),
 (Document(page_content='if ref._.ref_n != \'\':\n                        token._.ref_n = ref._.ref_n\n                        token._.ref_t = ref._.ref_t\n                        break\n    return doc\nAgain, we add this resolver to our 

In [23]:
# Maximal-marginal-relevance search for 6 chunks; newlines are stripped for display.
[
    chunk.page_content.replace('\n','')
    for chunk in index.max_marginal_relevance_search("NLP Pipeline", k=6)
]

["The function takes a spaCy Doc object (named doc) as a parameter and returns a Doc.Therefore, we can use it as a another pipeline component and simply add it to theexisting pipeline:nlp.add_pipe (norm_entities )Now we can repeat the process on the example sentences and check the result:doc = nlp(text)print(*[([t.text for t in e], e.label_) for e in doc.ents], sep='\\n')Out:(['Baker', 'International'], 'ORG')(['New', 'York', 'Stock', 'Exchange'], 'ORG')Merging Entity Tokens",
 'if ref._.ref_n != \'\':                        token._.ref_n = ref._.ref_n                        token._.ref_t = ref._.ref_t                        break    return docAgain, we add this resolver to our pipeline and check the result:nlp.add_pipe (anaphor_coref )doc = nlp(text)display_ner (doc).query("ref_n != \'\'" ) \\  [[\'text\', \'ent_type\' , \'main_coref\' , \'ref_n\', \'ref_t\']]Out:text ent_type main_coref ref_n ref_t0 Hughes Tool Co ORG Hughes Tool Co Hughes Tool Co ORG',
 "call nlp.pipe .batch_size  =

# Load LLM

In [24]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

# Prints chain activity to stdout when passed as a callback.
handler = StdOutCallbackHandler()

# Ask the retriever for maximal-marginal-relevance search explicitly: take the
# 20 nearest chunks (`fetch_k`) and return the 10 most diverse of them (`k`).
# Setting a 'max_marginal_relevance' entry in `search_kwargs` does not switch
# the search type, so the retriever would keep doing plain similarity search.
retriever = index.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 10, "fetch_k": 20},
)

llm = ChatOpenAI(temperature=0.8)
# Question-answering chain that answers from the retrieved chunks.
chain = RetrievalQA.from_chain_type(llm=llm,retriever=retriever,verbose=True)


In [32]:
# Run the retrieval QA chain on a sample query, echoing its steps through `handler`.
chain.run('NLP Pipeline', callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
The function takes a spaCy Doc object (named doc) as a parameter and returns a Doc.
Therefore, we can use it as a another pipeline component and simply add it to the
existing pipeline:
nlp.add_pipe (norm_entities )
Now we can repeat the process on the example sentences and check the result:
doc = nlp(text)
print(*[([t.text for t in e], e.label_) for e in doc.ents], sep='\n')
Out:
(['Baker', 'International'], 'ORG')
(['New', 'York', 'Stock', 'Exchange'], 'ORG')
Merging Entity Tokens

if ref._.ref_n != '':
                        token._.ref_n = ref._.ref_n
                        token._.ref_t = ref._.ref_t
                    

'The NLP pipeline is a series of steps or components that are applied to text in order to process and analyze it. These components can include tasks such as tokenization, part-of-speech tagging, dependency parsing, named entity recognition, and coreference resolution. The pipeline is typically defined using a NLP library, such as spaCy, and can be customized by adding or removing components based on the specific needs of the task or application.'

As we can see, the previous answer came from the model's parametric knowledge rather than from the source knowledge (the retrieved document chunks).

# Mistral LLM


In [None]:
!pip install -q torch datasets
!pip install -q accelerate==0.21.0 \
                peft==0.4.0 \
                bitsandbytes==0.40.2 \
                transformers==4.34.0 \
                trl==0.4.7

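- You have to restart the kernel after the above installation. Restarting clears every variable and import, so re-run the earlier cells that build `index`, `retriever` and `handler` (and import `RetrievalQA`) before running the retrieval cells at the end of this section.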

In [1]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = 'cpu'

# Directory of the model checkpoint inside the Kaggle input dataset.
model_name = '/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1'

In [None]:
!ls $model_name

In [2]:
# Load the tokenizer from the checkpoint directory `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Read the model configuration from `model_name`; the bare expression below displays it.
# NOTE(review): trust_remote_code=True allows custom code shipped with a checkpoint to run —
# presumably unnecessary for this local checkpoint; confirm before removing.
model_config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model_config

MistralConfig {
  "_name_or_path": "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.34.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [6]:

# Quantization settings passed to `from_pretrained` when the model is loaded:
# 4-bit loading with the 'nf4' quantization type, double quantization enabled,
# and bfloat16 as the compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [8]:
from timeit import default_timer

# Start the wall-clock timer right before loading the weights.
t0 = default_timer()

# Load the checkpoint with the quantization settings from `bnb_config` and let
# `device_map="auto"` decide where the weights are placed.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    config=model_config,
)

# Report how long the load took.
print(default_timer() - t0, 'sec.')



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

154.99799609000047 sec.


In [29]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
# Wrap the loaded model and tokenizer in a Hugging Face text-generation pipeline.
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # Sampling has to be switched on for `temperature` to have any effect;
    # without it generation is greedy and the temperature is ignored.
    do_sample=True,
    temperature=0.8,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=300,
    # Use the end-of-sequence token for padding.
    pad_token_id = tokenizer.eos_token_id
)

# Expose the pipeline through LangChain's LLM interface so it can be used in chains.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [30]:
# Same retrieval QA chain as before, now answering with the local Mistral model.
# `RetrievalQA` and `retriever` come from the cells in the "Load LLM" section.
qa = RetrievalQA.from_chain_type(llm=mistral_llm, retriever=retriever, verbose=True)

In [31]:
# Ask the Mistral-backed chain a question, echoing its steps through `handler`.
qa.run(
    'What is text summarization algorthms',
    callbacks=[handler],
)



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

vate others. Text summarization  is defined as the method used for generating a con‐
cise summary of longer text while still conveying useful information and without
losing the overall context. This is a method that we are quite familiar with: when
243

right approach for any application.
Text Summarization
It is likely that you have undertaken a summarization task knowingly or unknowingly
at some point in life. Examples are telling a friend about a movie you watched last
night and trying to explain your work to your family. We all like to provide a brief
summary of our experiences to the rest of the world to share our feelings and moti‐
vate othe




[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


' Text summarization algorithms are computational methods used to automatically generate a concise summary of longer text while still conveying useful information and without losing the overall context.'