In [9]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter,TokenTextSplitter
from langchain.document_loaders import PyPDFLoader


def create_splits_langchain(sample_data_path):
    loader = PyPDFLoader(sample_data_path)
    documents = loader.load()
    doc_splitter_sample = TokenTextSplitter(
        chunk_size = 50,
        chunk_overlap  = 5
    )
    texts_sample=doc_splitter_sample.split_documents(documents)
    return texts_sample
texts_sample = create_splits_langchain("McDonalds.pdf")

In [11]:
texts_sample

[Document(page_content=' \n Purpose & Impact \nGlobal Progress Summary \n2021 – 2022 \nMcDonald’s Corporation ', metadata={'source': 'McDonalds.pdf', 'page': 0}),
 Document(page_content=' \n \n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ', metadata={'source': 'McDonalds.pdf', 'page': 1}),
 Document(page_content=' \n \n \n \n \n   \n \n \n \n  \n \n \n \n  \n Purpose & Impact Progress Summary 2021–2022 Introduction  Food Quality & Sourcing  Our', metadata={'source': 'McDonalds.pdf', 'page': 1}),
 Document(page_content=' & Sourcing  Our Planet  Jobs, Inclusion & Empowerment   Community Connection \nA Message \nFrom Our CEO \nThis past year showed us what \nmakes McDonald’s unique, once \nagain.', metadata={'source': 'McDonalds.pdf', 'page': 1}),
 Document(page_content=' once \nagain. Guided by our core values, \nwe’ve experienced first-hand \nhow our focused actions – both \nbig and small – can translate \ninto meaningful experiences \nfor our customers, bringing'

In [25]:
import plotly.express as px

len_doc=[]
for i in texts_sample:
    len_doc.append(len(i.page_content))
    
px.scatter(x=range(1,len(len_doc)+1),y=len_doc,title=f'Size of each chunk in each docuement with total documents = {len(len_doc)}')

In [21]:
from langchain.docstore.document import Document
new_doc=[]
for t in texts_sample:
    if len(t.page_content.strip()) > 100: ##Define this number as per your application , for small sentences this filter can be ignored 
        new_doc.append(Document(page_content=t.page_content.strip(),metadata=t.metadata))

In [211]:
import plotly.express as px

len_doc_new=[]
for i in new_doc:
    len_doc_new.append(len(i.page_content))
    
px.scatter(x=range(1,len(len_doc_new)+1),y=len_doc_new,title=f'Size of each chunk in each docuement after L1 filter with total documents = {len(len_doc_new)}').update_layout(
    xaxis_title="Chunk number", yaxis_title="Size of chunk in number of Characters"
)

In [40]:
new_doc

[Document(page_content='& Sourcing  Our Planet  Jobs, Inclusion & Empowerment   Community Connection \nA Message \nFrom Our CEO \nThis past year showed us what \nmakes McDonald’s unique, once \nagain.', metadata={'source': 'McDonalds.pdf', 'page': 1}),
 Document(page_content='once \nagain. Guided by our core values, \nwe’ve experienced first-hand \nhow our focused actions – both \nbig and small – can translate \ninto meaningful experiences \nfor our customers, bringing', metadata={'source': 'McDonalds.pdf', 'page': 1}),
 Document(page_content='for our customers, bringing \nour purpose to feed and foster community to life each day. \nWith the strength of our full System, we’ve \nworked together to build a more diverse, \nequitable and inclusive business', metadata={'source': 'McDonalds.pdf', 'page': 1}),
 Document(page_content='equitable and inclusive business, source \nmore food responsibly, adopt more \nsustainable practices, and implement \ninnovative and credible solutions in our \n

In [35]:
#please mind that these documents are in chronological order and carry huge amount of importance for this chronology , lets now see how we can visualize our chunks and 

# we will stay away from creating embedding vector store from langchain , the langchain is pretty usefull when it comes to the optimizing the code for generation of the embeddings , here we will use direct implementation of the OpenAI APIs
import openai
from openai import OpenAI
import os

embeddings_text_3_small = []
embeddings_text_3_large = []
embeddings_text_ada = []

openai.api_key = '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'
os.environ["OPENAI_API_KEY"]='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'

client = OpenAI()

for t in new_doc:
    embeddings_text_3_small.append(client.embeddings.create(input=f"{t.page_content}",model="text-embedding-3-small"))
    embeddings_text_3_large.append(client.embeddings.create(input=f"{t.page_content}",model="text-embedding-3-large"))
    embeddings_text_ada.append(client.embeddings.create(input=f"{t.page_content}",model="text-embedding-ada-002"))

In [38]:
#Lets save the embeddings locally
import pickle 
with open('C:\\Users\\Rideema Malji\\OneDrive\\Desktop\\Others\\Medium-Post\\embeddings\\embeddings_text_3_small.pkl', 'wb') as f:
    pickle.dump(embeddings_text_3_small, f)
with open('C:\\Users\\Rideema Malji\\OneDrive\\Desktop\\Others\\Medium-Post\\embeddings\\embeddings_text_3_large.pkl', 'wb') as f:
    pickle.dump(embeddings_text_3_large, f)
with open('C:\\Users\\Rideema Malji\\OneDrive\\Desktop\\Others\\Medium-Post\\embeddings\\embeddings_text_ada.pkl', 'wb') as f:
    pickle.dump(embeddings_text_ada, f)

In [94]:
#These saved data are of different sizes like 1572*1 , 3072*1 , we can employ different techniques like PCA , UMAP or TSNE for feature reduction , however below are some validated drawbacks of these feature reduction technique 
#1.PCA is bascially a Linear transformational technique , the basic component of PCA is to showcase the reduced components of given input matrix , provided that the variance of explanation of these input matrix can be easily seperable linearly
#i.e if the data is non linear in nature then we might not be able to seperate or reduce the inout componenets into Eigen vectors and thus PCA fails , In the case of NLU the embeddings are Machine learning models which tries to capture the essence of the input text data into vectors and this data is 99% of times would be linearly inseperable 
#The best way to visualize your text data is to calculate the distance between the embeddings and analyze the clusters 


#Lets calculate the distance of chunk1 vs chunk2,chunk3,chunk4....chunk267
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
extracted_embeddings_raw = [emb.data[0].embedding for emb in embeddings_text_ada]
df_chunk_emb = pd.DataFrame(extracted_embeddings_raw)
df_chunk_emb_l1=df_chunk_emb.T
df_chunk_emb_l1.columns = ['v'+str(i) for i in range(1,len(extracted_embeddings_raw)+1)]

dist_cal = []
calculated_vect = []
for col in df_chunk_emb_l1.columns:
        dist_cal.append(cosine_similarity(df_chunk_emb_l1['v1'].values.reshape(-1, 1), df_chunk_emb_l1[col].values.reshape(-1, 1)))
#similarity = cosine_similarity(word1_embedding, word2_embedding)

In [104]:
import numpy as np
from numpy.linalg import norm
cosine = np.dot(df_chunk_emb_l1['v1'].values,df_chunk_emb_l1['v3'].values)/(norm(df_chunk_emb_l1['v1'].values)*norm(df_chunk_emb_l1['v2'].values))
print("Cosine Similarity:", cosine)

Cosine Similarity: 0.8429257180615184


In [111]:
import plotly.express as px
dist_cos = pd.DataFrame(cosine_similarity(df_chunk_emb_l1.T))
df_cos.columns = ['v'+str(i) for i in range(1,len(extracted_embeddings_raw)+1)]
df_cos.index = ['v'+str(i) for i in range(1,len(extracted_embeddings_raw)+1)]

In [114]:
%matplotlib

Using matplotlib backend: QtAgg


In [218]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.heatmap(df_cos)
plt.title('The heat map of the similarity between each chunk in given document')

Text(0.5, 1.0, 'The heat map of the similarity between each chunk in given document')

In [219]:
px.scatter(x=['v'+str(i) for i in range(1,len(extracted_embeddings_raw[:10])+1)][:10],y=cosine_similarity(df_chunk_emb_l1.T)[0][:10],title='cos Similarity of V1 till V10').update_layout(
    xaxis_title="Chunk Index", yaxis_title="Similarity of chunk from chunk1"
)

In [124]:
px.scatter(x=['v'+str(i) for i in range(1,len(extracted_embeddings_raw[:10])+1)][:10],y=cosine_similarity(df_chunk_emb_l1.T)[8][:10],title='cos Similarity of V8 around 10 chunks')

In [132]:
px.scatter(x=['v'+str(i) for i in range(75,75+len(extracted_embeddings_raw[0:10]))],y=cosine_similarity(df_chunk_emb_l1.T)[80][75:85],title='cos Similarity of V80 around 10 chunks')

In [None]:
# Lets see the chunk 80

In [133]:
new_doc[80]

Document(page_content='Goals, our internal, cross-functional network shares nutrition best practices across the Company. Externally, we participate in various pledges to drive industry-wide practices and are proud \n of  \nour long history as an  industry', metadata={'source': 'McDonalds.pdf', 'page': 5})

In [134]:
#And what about the surround chunk of 80??
total_para = ''
total_para.join([i.page_content for i in new_doc[75:85]])

'major markets  \noffer\xa0 balanced  meals  We’re focused on evolving the Happy  \nM\neal and making balanced meals  \nmore accessible. McDonald’s will continue to approach this responsibly, offeringapproach this responsibly, offering balanced options and promoting menu items that contribute to recommended food groups, such as fruits, vegetables and low-fat\n  dairy. 12 major markets   \nhave met our goal   \non simplifyingon simplifying  \ningredients \n100% of ads  \nshown\xa0to children in  \n2019 featured water,  \nmilk or juice as the  \nHappy Meal beverage  \nand fruitsand fruits, vegetables  \nor dairy as\xa0Happy \xa0 \nMeal side Our Focus \non\xa0Nutrition\xa0and  Responsible Marketing \nMcDonald’ s aims to help \nconsumers makeconsumers make informed nutrition choices. We deliver nutrition labeling, calories on menu boards, mobile apps, kiosks, online nutrition calculators and defined marketing \n goals. • \nWorking toward our Happy Meal Nutrition Goals, our internal,Goals, 

In [144]:
##Lets see how putting the sequencial chunks can help us improve the quality of our data , the traditional methodologies to get the RAG answers are 
#1.Build Chroma db
#2.Build FAISS
#3.
#4.
#n. Get the similar vectors from any stored vectordb
#lets create 2 seperate types of the vectordb

from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

def get_store_chroma_goals(texts_sample,action):
    persist_directory_chroma='chromadb'
    if action == 'get_db_instance':
        vectordb = Chroma.from_documents(texts_sample, embeddings)
        return(vectordb)
    
    if action=='store':
        vectordb = Chroma.from_documents(documents=texts_sample, 
                                 embedding=embeddings,
                                 persist_directory=persist_directory_chroma)
        vectordb.persist()
    
def get_store_faiss_goals(text_samples,action):
    persist_directory_faiss = "faiss"
    if action == 'get_db_instance':
        db_faiss = FAISS.from_documents(text_samples, embeddings)
        return(db_faiss)
    if action == 'store':
        db_faiss = FAISS.from_documents(text_samples, embeddings)
        db_faiss.save_local(persist_directory_faiss)


In [145]:
chromadb_store = get_store_chroma_goals(new_doc,'get_db_instance')
faiss_store = get_store_faiss_goals(new_doc,'get_db_instance')

In [216]:
answer = chromadb_store.similarity_search('What is company doing to make sure that user is taking nutritious choice?')
print([new_doc.index(i) for i in answer])

[79, 76, 69, 78]


In [180]:
#To get this asnwer chromadb has fetched 4 different chunks from different page number , lets see the content of this contex
'--------/n'.join(i.page_content for i in answer)

'consumers make informed nutrition choices. We deliver nutrition labeling, calories on menu boards, mobile apps, kiosks, online nutrition calculators and defined marketing \n goals. • \nWorking toward our Happy Meal Nutrition Goals, our internal,--------/napproach this responsibly, offering balanced options and promoting menu items that contribute to recommended food groups, such as fruits, vegetables and low-fat\n  dairy. 12 major markets   \nhave met our goal   \non simplifying--------/nCommitment to Serving \nSafe, Quality Food \nHow our food is produced and where it comes from \nmatters to our customers, our communities and our shared environment. We provide a variety of choices and bring families together to share--------/nand fruits, vegetables  \nor dairy as\xa0Happy \xa0 \nMeal side Our Focus \non\xa0Nutrition\xa0and  Responsible Marketing \nMcDonald’ s aims to help \nconsumers make'

In [192]:
#Lets compare this with the sequecial chunks we visualized earlier
'--------/n'.join([i.page_content for i in new_doc[78:82]])

'and fruits, vegetables  \nor dairy as\xa0Happy \xa0 \nMeal side Our Focus \non\xa0Nutrition\xa0and  Responsible Marketing \nMcDonald’ s aims to help \nconsumers make--------/nconsumers make informed nutrition choices. We deliver nutrition labeling, calories on menu boards, mobile apps, kiosks, online nutrition calculators and defined marketing \n goals. • \nWorking toward our Happy Meal Nutrition Goals, our internal,--------/nGoals, our internal, cross-functional network shares nutrition best practices across the Company. Externally, we participate in various pledges to drive industry-wide practices and are proud \n of  \nour long history as an  industry--------/nindustry  \nleader in responsible  marketing  \nto children, guided by  our  \nGlobal Marketing to \nChildren  Guidelines. 46,241 restaurant food safety audits conducted. \n12 major markets  \nhave'

In [205]:
#lets see what kind of answer do we get from both of these different contexts:
#{"role": "system", "content":'you are a professional user story testing engineer , you need to create a list of possible title for given user story,please give answer in JSON format , please give only test titles as your asnwer , please dont give any other text'}
def call_openai_new(context,query,model='gpt-4-1106-preview'):
        completion = openai.chat.completions.create(
            model="gpt-4",
            messages=[{"role":"system","content":f"""please read the following context  :{context} ,
                                ###
                                Based on the context above please answer the question
                                please dont generate any answers from other sources ,
                                the answer should be always generaetd from the given context"""},
                      {"role":"user","content":f"The Question is : {query}"}
                     ],
            frequency_penalty = 1.5,
            temperature = 0.4,
            max_tokens=300,
            )
        return completion.choices[0].message.content


In [201]:
chroma_based_fetched_chunks_context = '--------/n'.join(i.page_content for i in answer)
seq_based_fetched_chunks_context =  '--------/n'.join([i.page_content for i in new_doc[78:82]])

In [207]:
chroma_based_fetched_chunks_context

'consumers make informed nutrition choices. We deliver nutrition labeling, calories on menu boards, mobile apps, kiosks, online nutrition calculators and defined marketing \n goals. • \nWorking toward our Happy Meal Nutrition Goals, our internal,--------/napproach this responsibly, offering balanced options and promoting menu items that contribute to recommended food groups, such as fruits, vegetables and low-fat\n  dairy. 12 major markets   \nhave met our goal   \non simplifying--------/nCommitment to Serving \nSafe, Quality Food \nHow our food is produced and where it comes from \nmatters to our customers, our communities and our shared environment. We provide a variety of choices and bring families together to share--------/nand fruits, vegetables  \nor dairy as\xa0Happy \xa0 \nMeal side Our Focus \non\xa0Nutrition\xa0and  Responsible Marketing \nMcDonald’ s aims to help \nconsumers make'

In [206]:
call_openai_new(chroma_based_fetched_chunks_context,'What is company doing to make sure that user is taking nutritious choice?')

'The company is taking several steps to ensure consumers make nutritious choices. They provide nutrition labeling and display calories on menu boards. They also offer mobile apps, kiosks, and online nutrition calculators for easy access to nutritional information. The company promotes balanced options and menu items that contribute to recommended food groups such as fruits, vegetables, and low-fat dairy. Furthermore, they are working towards their Happy Meal Nutrition Goals internally by providing a variety of choices including fruits, vegetables or dairy as Happy Meal sides.\n'

In [209]:
call_openai_new(seq_based_fetched_chunks_context,'What is company doing to make sure that user is taking nutritious choice?')

'The company is helping consumers make informed nutrition choices by providing nutrition labeling, displaying calories on menu boards, offering mobile apps and kiosks, online nutrition calculators and setting defined marketing goals. They also share nutrition best practices across the Company through their internal network. Additionally, they participate in various pledges to drive industry-wide practices.'

In [None]:
#we know the factually best answer :)