## Create embeddings

In [39]:
from configs import API_KEY, DEFAULT_MODEL
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import os
import openai
import numpy as np

In [12]:
# Load the course transcript; only the first returned document is used below.
loader_docx = Docx2txtLoader("data/Introduction_to_Data_and_Data_Science_1.docx")
pages_docx = loader_docx.load()

# Split on markdown headers so every section carries its course and lecture
# title in its metadata.
markdown_header_text_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ('#', 'Course Title'),
        ('##', 'Lecture Title'),
    ]
)

pages_markdown_split = markdown_header_text_splitter.split_text(pages_docx[0].page_content)

# Collapse every run of whitespace (newlines included) into a single space.
for section in pages_markdown_split:
    section.page_content = ' '.join(section.page_content.split())

In [13]:
# Show the whitespace-normalized text of the first header section.
pages_markdown_split[0].page_content

"Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individually and examine how they relate to other parts. And that’s analysis in a nutshell. One important thing to remember, however, is that you perform analyses on things that have already happened in the past. Such as using an analysis to explain how a story ended the way i

In [16]:
# Re-split each header section using chunk_size=500 and chunk_overlap=50.
# With an empty separator the cut points are not tied to word boundaries:
# the chunks displayed below begin and end mid-word.
character_splitter = CharacterTextSplitter(
    separator='',
    chunk_size=500,
    chunk_overlap=50,
)
pages_md_char_split = character_splitter.split_documents(pages_markdown_split)

In [17]:
# Display every chunk together with the header metadata it inherited.
pages_md_char_split

[Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis. Consi'),
 Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='hall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it 

In [26]:
# Publish the key from the local configs module through the environment, then
# mirror the same value on the `openai` module.
# NOTE(review): presumably the embeddings client created below picks the key up
# from OPENAI_API_KEY — confirm against the langchain_openai documentation.
os.environ["OPENAI_API_KEY"] = API_KEY
openai.api_key = os.environ["OPENAI_API_KEY"]

In [27]:
# Embedding client used by the embed_query calls below; the vectors it returns
# in this notebook have 1536 components (see the len(...) cell).
embedding = OpenAIEmbeddings(model='text-embedding-ada-002')

In [28]:
# First sample chunk: taken from the "Analysis vs Analytics" lecture.
pages_md_char_split[3]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='tics is essentially the application of logical and computational reasoning to the component parts obtained in an analysis. And in doing this you are looking for patterns and exploring what you could do with them in the future. Here, analytics branches off into two areas: qualitative analytics – this is using your intuition and experience in conjunction with the analysis to plan your next business move. And quantitative analytics – this is applying formulas and algorithms to numbers you have gath')

In [29]:
# Second sample chunk: also from the "Analysis vs Analytics" lecture.
pages_md_char_split[5]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='thing to start selling. This would be qualitative analytics. But you might not know when to introduce the new collection. In that case, relying on past sales data and user experience data, you could predict in which month it would be best to do that. This is an example of using quantitative analytics. Fantastic! To backtrack a little, you can combine these areas with analyses also – you could perform qualitative analysis – to explain how or why a story ended the way it did. And you can perform q')

In [31]:
# Third sample chunk: from a different lecture (programming languages and
# software tools), used as the contrasting example.
pages_md_char_split[18]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='retical preparation is strong enough, you will find yourself restricted by software. Knowing a programming language such as R and Python, gives you the freedom to create specific, ad-hoc tools for each project you are working on. Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!')

In [32]:
# Embed the text of chunk 3 (first "Analysis vs Analytics" sample).
vector1 = embedding.embed_query(pages_md_char_split[3].page_content)

In [33]:
# Embed the text of chunk 5 (second "Analysis vs Analytics" sample).
vector2 = embedding.embed_query(pages_md_char_split[5].page_content)

In [34]:
# Embed the text of chunk 18 (sample from a different lecture).
vector3 = embedding.embed_query(pages_md_char_split[18].page_content)

In [38]:
# Number of components in each of the three embeddings.
tuple(len(vec) for vec in (vector1, vector2, vector3))

(1536, 1536, 1536)

In [43]:
# Pairwise dot products, in the order (1·2, 1·3, 2·3).
tuple(
    np.dot(left, right)
    for left, right in ((vector1, vector2), (vector1, vector3), (vector2, vector3))
)

(0.8524481588092789, 0.7879765158574175, 0.7791194664242602)

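#### The dot products above show that vectors 1 and 2 (both from the "Analysis vs Analytics" lecture) are closer to each other (≈0.85) than either is to vector 3 (≈0.79 and ≈0.78), which comes from a different lecture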

In [46]:
# Euclidean length of each embedding, in the order (vector1, vector2, vector3).
tuple(np.linalg.norm(vec) for vec in (vector1, vector2, vector3))

(0.9999999585905728, 1.0000000900713542, 0.9999999536394362)