# Splitting and Embedding Text Using LangChain

In [1]:
from dotenv import load_dotenv, find_dotenv
from langchain.embeddings import OpenAIEmbeddings

# Locate each credentials file by name, then load it into the process environment.
# override=True is passed so values from the files replace variables that are already set.
pinecone_env_path = find_dotenv(filename="pinecone.env")
load_dotenv(pinecone_env_path, override=True)

openai_env_path = find_dotenv(filename="openai.env")
# Kept as the last expression so the cell still displays the result of the final load.
load_dotenv(openai_env_path, override=True)

True

## Importiamo il "TextSplitter"

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into a single string.
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm
# the file's encoding if it contains non-ASCII characters.
with open('../Resources/churchill_speech.txt') as speech_file:
    churchill_speech = speech_file.read()

# Splitter configured for a chunk size of 100 with an overlap of 20 between chunks;
# sizes are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, length_function=len)

## Creiamo i chunks

In [3]:
# create_documents is given a list of texts, so the single speech string is wrapped in a list.
chunks = text_splitter.create_documents([churchill_speech])
# Uncomment to inspect individual chunks (each one exposes its text as .page_content):
# print(chunks[2])
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)} chunks')

Now you have 300 chunks


## Calcoliamo il costo degli embeddings (creati con OpenAI text-embedding-ada-002)
È sempre meglio sapere in anticipo quanto andremo a spendere.

In [4]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the total token count and the estimated embedding cost for a set of documents.

    Args:
        texts: Iterable of objects exposing their text as a ``page_content`` attribute
            (e.g. the chunks produced by the text splitter).
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick the tokenizer.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is the value that
            was previously hard-coded. NOTE(review): pricing changes over time — verify it
            against the provider's current price list before relying on the estimate.

    Returns:
        None. The two result lines are printed to stdout.
    """
    # Local import, as in the original cell: tiktoken is only needed by this helper.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a throwaway list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')


# Estimate the embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.001928


## Importiamo e istanziamo OpenAI embeddings

In [5]:
# The API key is an environment variable, so we can avoid passing it as an argument.
embeddings = OpenAIEmbeddings()

## Creiamo gli embeddings dal primo chunk

In [6]:
# Embed the text of the first chunk; the result is displayed below as a list of floats.
vector = embeddings.embed_query(chunks[0].page_content)
# NOTE(review): this displays the entire vector; consider showing only a slice such as vector[:5].
vector

[-0.04455330253056585,
 -0.03779507081507403,
 -0.0028754349527285493,
 -0.008014238493844727,
 0.01577344920006286,
 0.02260818534028782,
 -0.02843556386142967,
 -0.009742049566406832,
 0.001093430335597406,
 0.00718539911538088,
 0.00781659228439702,
 0.03279653213805732,
 0.007389421159217638,
 -0.011769518335996314,
 0.006324681496294353,
 -0.0053906431787176,
 0.013210423380309146,
 -0.0025279598947671547,
 0.013516455980402993,
 -0.011049065813839897,
 -0.008090746643868189,
 -0.0268033875107356,
 0.029583187974426762,
 -0.003796721601527198,
 -0.014447306380785221,
 -0.018476741513730556,
 0.010876922010625819,
 -0.01860425447622127,
 0.0030252635684115924,
 -0.01437079823076176,
 0.00711526679974647,
 -0.0086135529565769,
 -0.01650027755660833,
 0.005173869931764028,
 -0.018310973544905528,
 -0.02389607408455373,
 -0.02242966756532985,
 -0.008747442684779248,
 0.022582683865376773,
 -0.012770501052012951,
 0.013669473211772499,
 0.004676566490950235,
 0.008766569256623823,
 0.0

## Inseriamo gli embeddings in un indice Pinecone

In [7]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Initialise the Pinecone client with credentials read from environment variables.
# NOTE(review): os.environ.get returns None when a variable is unset — presumably both
# values come from pinecone.env; confirm they are defined there.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

  from tqdm.autonotebook import tqdm


In [8]:
from tqdm.autonotebook import tqdm

In [9]:
# Delete the existing indexes (the free plan allows only one).
# Every index returned by list_indexes() is deleted; the progress message is printed once per index.
indexes = pinecone.list_indexes()
for existing_index in indexes:
    print('Deleting all indexes ... ', end='')
    pinecone.delete_index(existing_index)
    print('Done')

Deleting all indexes ... Done


In [10]:
# Create a new index, but only if one with this name does not exist yet.
index_name = 'churchill-speech'
index_missing = index_name not in pinecone.list_indexes()
if index_missing:
    print(f'Creating index {index_name} ...')
    # NOTE(review): dimension presumably has to match the embedding vector length — confirm for the model in use.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating index churchill-speech ...
Done!


In [11]:
# Build the vector store from the chunks, the embeddings object and the index name;
# in this case it creates 300 vectors.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

## Fare domande (Questions) secondo la "Similarity Search"

In [12]:
# Run the similarity search for the query against the vector store.
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
# Prints the raw list of Document objects (page_content plus metadata).
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and', metadata={}), Document(page_content='front, now on that, fighting', metadata={}), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing', metadata={}), Document(page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940', metadata={})]


In [13]:
# Print only the text of each matching document, followed by a separator line.
separator = '-' * 50
for doc in result:
    print(doc.page_content)
    print(separator)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
--------------------------------------------------


## Rispondere in Linguaggio Naturale mediante l'impiego di un LLM

In [14]:
# Feed the LLM with the most relevant chunks
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used to generate the answer.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Retriever over the vector store: similarity search with k set to 3.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Question-answering chain that combines the model with the retriever.
chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever
)  # "stuff" uses all the text in the prompt


In [15]:
# Ask the chain a question and print the answer it returns.
query = 'Where should we fight?'
answer = chain.run(query)
print(answer)

We should fight on the beaches, on the landing grounds, in the fields, in France, on the seas and oceans.


In [16]:
# Ask a second question through the same chain.
query = 'Who was the king of Belgium at that time?'
# Alternative question to try:
# query = 'What about the French Armies??'
answer = chain.run(query)
print(answer)

The king of Belgium at that time was King Leopold.
