In [None]:
from helper_utils import load_chroma, word_wrap
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Embedding function constructed with no arguments; the same object is handed
# to the collection loader below and reused later to embed query strings.
embedding_function = SentenceTransformerEmbeddingFunction()

# Build/load the collection through the project helper.
# NOTE(review): what load_chroma does with the PDF (chunking, persistence) is
# not visible from this notebook - see helper_utils.
chroma_collection = load_chroma(
    filename='MachineLearning-Lecture01.pdf',
    collection_name='MachineLearning-Lecture',
    embedding_function=embedding_function,
)

# Last expression of the cell: echo the collection's count() as cell output.
chroma_collection.count()

In [None]:
import umap.umap_ as umap
import numpy as np
from tqdm import tqdm

In [None]:
# Ask the collection for its stored embeddings only and keep that field.
embeddings = chroma_collection.get(
    include=['embeddings'],
)['embeddings']

# Fit UMAP on those embeddings. Both seeds are pinned to 0
# (presumably so repeated runs give the same projection - confirm).
umap_transform = umap.UMAP(
    random_state=0,
    transform_seed=0,
).fit(embeddings)

In [None]:
def project_embeddings(embeddings, umap_transform):
    """Project embedding vectors through a fitted reducer, one vector at a time.

    Args:
        embeddings: Sized iterable of embedding vectors (must support
            ``len()`` and iteration).
        umap_transform: Fitted reducer exposing ``transform(list_of_vectors)``.
            If it carries an ``n_components`` attribute, that value is used as
            the output width; otherwise the width defaults to 2.

    Returns:
        ``numpy.ndarray`` of shape ``(len(embeddings), n_components)`` whose
        row ``i`` is the projection of the ``i``-th input vector. An empty
        input yields an array with zero rows.
    """
    # The width used to be hard-coded to 2, which made the row assignment fail
    # for any reducer fitted with a different number of output components.
    n_components = getattr(umap_transform, 'n_components', 2)
    umap_embeddings = np.empty((len(embeddings), n_components))

    # NOTE(review): vectors are transformed individually rather than in one
    # batched call; presumably deliberate (per-point projection independent of
    # the rest of the batch) - confirm before replacing with a single transform.
    for i, embedding in enumerate(tqdm(embeddings)):
        projected = np.asarray(umap_transform.transform([embedding]))
        # transform() is given a one-element batch; flatten its result to a
        # single row explicitly instead of relying on implicit broadcasting.
        umap_embeddings[i] = projected.reshape(-1)
    return umap_embeddings

In [None]:
# Project every stored embedding through the fitted UMAP model; the result is
# the grey background cloud drawn by the plotting cells below.
projected_dataset_embeddings = project_embeddings(
    embeddings=embeddings,
    umap_transform=umap_transform,
)

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the whole projected dataset, drawn through an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(
    projected_dataset_embeddings[:, 0],
    projected_dataset_embeddings[:, 1],
    s=10,
)
ax.set_aspect('equal', 'datalim')
ax.set_title('Projected Embeddings')
ax.axis('off')

![image.png](attachment:image.png)

## Relevancy and Distraction

In [4]:
query = "What is Machine Learning"

# Ask for the 5 nearest results and request both their text and embeddings,
# so the plotting cells can reuse the embeddings without a second query.
results = chroma_collection.query(
    query_texts=query,
    n_results=5,
    include=['documents', 'embeddings'],
)

# Index 0 selects the result list belonging to the first query text.
retrieved_documents = results['documents'][0]

# Print each retrieved chunk, followed by a blank separator line.
for document in retrieved_documents:
    print(word_wrap(document), end='\n\n')

industry in silicon valley or as i work with various businesses in cs
and outside cs, i find that there ' s often a huge difference between
how well someone who really understands this stuff can apply a learning
algorithm versus someone who sort of gets it but sort of doesn ' t. the
analogy i like to think of is imagine you were going to a carpentry
school instead of a machine learning class, right? if you go to a
carpentry school, they can give you the tools of carpentry. they ' ll
give you a hammer, a bunch of nails, a screwdriver or whatever. but a
master carpenter will be able to use those tools far better than most
of us in this room. i know a carpenter can do things with a hammer and
nail that i couldn ' t possibly. and it ' s actually a little bit like
that in machine learning, too. one thing that ' s sadly not taught in
many courses on machine learning is how to take the tools of machine
learning and really, really apply them well.

so in this class, we ' ve tried to convey to 

In [5]:
# Embed the query string with the notebook's embedding function; it is passed
# a one-element list, so [0] picks out the vector for that single query.
query_embedding = embedding_function([query])[0]
# Embeddings of the retrieved documents for the first query text, as returned
# by the earlier query() call that requested 'embeddings'.
retrieved_embeddings = results['embeddings'][0]

In [None]:
# The query vector is wrapped in a list so it is projected as a one-row batch.
projected_query_embedding = project_embeddings(
    embeddings=[query_embedding],
    umap_transform=umap_transform,
)
# Retrieved-document vectors go through the same fitted model.
projected_retrieved_embeddings = project_embeddings(
    embeddings=retrieved_embeddings,
    umap_transform=umap_transform,
)

In [None]:
# Overlay the query and its retrieved documents on the projected dataset.
fig, ax = plt.subplots()

# Whole dataset as a grey background cloud.
ax.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')
# Query position: large red X.
ax.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')
# Retrieved documents: hollow green rings so the underlying points stay visible.
ax.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')

ax.set_aspect('equal', 'datalim')
ax.set_title(f'{query}')
ax.axis('off')

![image.png](attachment:image.png)

In [7]:
query = "What is the strategy around Marketing ?"

# Top-5 lookup returning both document text and embeddings.
results = chroma_collection.query(
    query_texts=query,
    n_results=5,
    include=['documents', 'embeddings'],
)

# Results are grouped per query text; only one was sent, so take group 0.
retrieved_documents = results['documents'][0]

# Show each retrieved chunk with an empty line after it.
for document in retrieved_documents:
    print(word_wrap(document), end='\n\n')

network analysis, market segmentation, so if you ' re a marketer and
you want to divide your market into different segments or different
groups of people to market to them separately ; even for astronomical
data analysis and understanding how galaxies are formed. these are just
a sort of small sample of the applications of unsupervised learning
algorithms and clustering algorithms that we ' ll talk about later in
this class. just one particularly cool example of an unsupervised
learning algorithm that i want to tell you about. and to motivate that,
i ' m gonna tell you about what ' s called the cocktail party problem,
which is imagine that you ' re at some cocktail party and there are
lots of people standing all over. and you know how it is, right, if you
' re at a large party, everyone ' s talking, it can be sometimes very
hard to hear even the person in front of you. so imagine a large
cocktail party with lots of people. so the problem is, is that all of
these

is let ' s say plot th

In [None]:
# Query side: embed the current query string, then project that single vector.
query_embedding = embedding_function([query])[0]
projected_query_embedding = project_embeddings([query_embedding], umap_transform)

# Retrieval side: take the embeddings returned with the results and project them.
retrieved_embeddings = results['embeddings'][0]
projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)

In [None]:
# Draw the query and the retrieved documents against the full projected dataset.
fig, ax = plt.subplots()

# Layer 1: every dataset point in grey.
ax.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')
# Layer 2: the query as a red X marker.
ax.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')
# Layer 3: retrieved documents as unfilled green circles.
ax.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')

ax.set_aspect('equal', 'datalim')
ax.set_title(f'{query}')
ax.axis('off')

![image.png](attachment:image.png)

In [8]:
query = "What has been the investment in research and development?"

# Retrieve the 5 closest chunks along with their embeddings.
results = chroma_collection.query(
    query_texts=query,
    n_results=5,
    include=['documents', 'embeddings'],
)

# One query text was submitted, so its documents sit at index 0.
retrieved_documents = results['documents'][0]

# Print the wrapped text of every hit, each followed by a blank line.
for document in retrieved_documents:
    print(word_wrap(document), end='\n\n')

and let ' s see. oh, and the goal of the project should really be for
you to do a publishable piece of research in machine learning, okay?
and if you go to the course website, you ' ll actually find a list of
the projects that students had done last year. and so i ' m holding the
list in my hand. you can go home later and take a look at it online.
but reading down this list, i see that last year, there were students
that applied learning algorithms to control a snake robot. there was a
few projects on improving learning algorithms. there ' s a project on
flying autonomous aircraft. there was a project actually done by our ta
paul on improving computer vision algorithms using machine learning.
there are a couple of projects on netflix rankings using learning
algorithms ; a few medical robots ; ones on segmenting [ inaudible ] to
segmenting pieces of the body using learning algorithms ; one on
musical instrument detection ; another on irony sequence

this class, i ' ll ask you to execute

In [None]:
# NOTE(review): this cell re-runs the same query as the cell directly above it
# and rebinds `query`, `results` and `retrieved_documents` to equivalent
# values - it looks like a leftover duplicate; consider deleting one of them.
query = "What has been the investment in research and development?"
# Top-5 lookup; documents and embeddings are both requested.
results = chroma_collection.query(query_texts=query, n_results=5, include=['documents', 'embeddings'])

# Index 0 selects the results for the single query text.
retrieved_documents = results['documents'][0]

# Print each retrieved chunk followed by an empty line.
for document in results['documents'][0]:
    print(word_wrap(document))
    print('')

In [None]:
# Embed the active query and project it (passed as a one-element list).
query_embedding = embedding_function([query])[0]
projected_query_embedding = project_embeddings([query_embedding], umap_transform)

# Project the embeddings that came back with the retrieved documents.
retrieved_embeddings = results['embeddings'][0]
projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)

In [None]:
# Show where the query and its retrieved documents land in the projected space.
fig, ax = plt.subplots()

# Grey: all dataset points.
ax.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')
# Red X: the query.
ax.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')
# Green rings: the retrieved documents.
ax.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')

ax.set_aspect('equal', 'datalim')
ax.set_title(f'{query}')
ax.axis('off')

![image.png](attachment:image.png)

In [9]:
query = "What has Michael Jordan done for us lately?"

# Fetch the 5 nearest chunks, keeping their embeddings for the plot below.
results = chroma_collection.query(
    query_texts=query,
    n_results=5,
    include=['documents', 'embeddings'],
)

# Documents for the first (and only) query text.
retrieved_documents = results['documents'][0]

# Emit each chunk's wrapped text and then a blank line.
for document in retrieved_documents:
    print(word_wrap(document), end='\n\n')

with wins and what are the board positions that tend to be associated
with losses. and way back around 1959, the amazing thing about this was
that his program actually learned to play checkers much better than
arthur samuel himself could.

works in machine learning and computer vision. catie chang is actually
a neuroscientist who applies machine learning algorithms to try to
understand the human brain. tom do is another phd student, works in
computational biology and in sort of the basic fundamentals of human
learning. zico kolter is the head ta — he ' s head ta two years in a
row now — works in machine learning a nd applies them to a bunch of
robots. and daniel ramage is — i guess he ' s not here — daniel applies
l earning algorithms to problems in natural language processing. so you
' ll get to know the tas and me much better throughout this quarter,
but just from the sorts of things the ta ' s do, i hope you can already
tell that machine learning is a highly interdisciplinary topic 

In [None]:
# Query vector: embed the current query string and project it as a single row.
query_embedding = embedding_function([query])[0]
projected_query_embedding = project_embeddings([query_embedding], umap_transform)

# Result vectors: reuse the embeddings returned by the query and project them.
retrieved_embeddings = results['embeddings'][0]
projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)

In [None]:
# Plot the query and the retrieved documents over the projected dataset.
fig, ax = plt.subplots()

# Background: the full dataset in grey.
ax.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')
# Foreground: red X at the query's projected position.
ax.scatter(projected_query_embedding[:, 0], projected_query_embedding[:, 1], s=150, marker='X', color='r')
# Foreground: green outlines around the retrieved documents.
ax.scatter(projected_retrieved_embeddings[:, 0], projected_retrieved_embeddings[:, 1], s=100, facecolors='none', edgecolors='g')

ax.set_aspect('equal', 'datalim')
ax.set_title(f'{query}')
ax.axis('off')

![image.png](attachment:image.png)