# Semantic Search with Enriched Embeddings

## General Setup

In [None]:
from dotenv import load_dotenv
from openai import OpenAI
from scipy.spatial import distance

# Load settings from a local .env file into the process environment.
load_dotenv()

# Shared API client used by create_embeddings below.
# NOTE(review): presumably picks up its API key from the environment
# populated by load_dotenv() above — confirm.
client = OpenAI()

def create_embeddings(texts):
    """Request embeddings for ``texts`` from the "text-embedding-3-small" model.

    Args:
        texts: Value forwarded unchanged as the request's ``input``. This
            file calls it with both a list of strings and a single string.

    Returns:
        A list holding the ``"embedding"`` value of every entry in the
        response's ``"data"`` list, in the order the response provides them.
    """
    # Dump the response object to plain dicts so entries can be read by key.
    payload = client.embeddings.create(
        input=texts,
        model="text-embedding-3-small",
    ).model_dump()

    return [entry["embedding"] for entry in payload["data"]]

## Setup Articles

In [None]:
# Sample articles: each entry has a headline, a single topic label and a
# list of keyword tags.
articles = [
    {
        "headline": "Ford has an AI assistant and new hands-free BlueCruise tech on the way",
        "topic": "Technology",
        "keywords": ["ai", "ford", "transportation", "ces2026"],
    },
    {
        "headline": "Shopify competitor Swap raises $100M six months after raising $40M",
        "topic": "Business",
        "keywords": ["ecommerce", "startups", "venture"],
    },
    {
        "headline": "US insurance giant Aflac says hackers stole personal and health data of 22.6 million people",
        "topic": "Security",
        "keywords": ["cybersecurity", "security", "data breach"],
    },
    {
        "headline": "OpenAI unveils ChatGPT Health, says 230 million users ask about health each week",
        "topic": "Technology",
        "keywords": ["ai", "biotech", "openai", "chatgpt"],
    },
]

## Combine Article Texts

In [None]:
def create_article_text(article):
    """Combine an article's headline, topic and keywords into one text.

    Args:
        article: Mapping with "headline", "topic" and "keywords" entries;
            "keywords" is an iterable of strings joined with ", ".

    Returns:
        A multi-line string that starts with a newline, has each labelled
        field on its own line indented by two spaces, and ends with a
        newline followed by two spaces.
    """
    headline = article["headline"]
    topic = article["topic"]
    keyword_list = ", ".join(article["keywords"])

    # The empty first entry and the two-space last entry reproduce the
    # leading newline and trailing indent of the text layout.
    lines = [
        "",
        f"  Headline: {headline}",
        f"  Topic: {topic}",
        f"  Keywords: {keyword_list}",
        "  ",
    ]
    return "\n".join(lines)

# Preview the combined text for the last article in the list.
print(create_article_text(articles[-1]))

## Create Enriched Embeddings

In [None]:
# Build one combined text per article, in the same order as `articles`.
article_texts = list(map(create_article_text, articles))

# Embed all article texts with a single create_embeddings call.
article_embeddings = create_embeddings(article_texts)

print(article_embeddings)

## Compute Distances

In [None]:
def find_n_closest(query_vector, embeddings, n=3):
    """Return the indices of the ``n`` embeddings closest to ``query_vector``.

    Closeness is measured with cosine distance (smaller means closer).

    Args:
        query_vector: The vector to compare against.
        embeddings: Sequence of vectors to rank.
        n: Maximum number of indices to return. ``None`` returns every
            index; zero or a negative value returns an empty list.

    Returns:
        A list of positions into ``embeddings``, ordered from smallest to
        largest distance. Ties keep their original relative order. The list
        is shorter than ``n`` when fewer embeddings are available.
    """
    # A negative n would otherwise slice "all but the last |n|" results,
    # which is never what a top-n lookup means.
    if n is not None and n <= 0:
        return []

    distances = [distance.cosine(query_vector, emb) for emb in embeddings]
    ranked_indices = sorted(range(len(distances)), key=distances.__getitem__)
    return ranked_indices[:n]

## Return Search Results

In [None]:
# Embed the search query; create_embeddings returns a list of vectors, so
# take the first one for this single query string.
query = "Security"
query_vector = create_embeddings(query)[0]

# Indices of the closest articles (find_n_closest uses its default n=3).
hits = find_n_closest(query_vector, article_embeddings)

# Print the headline of each hit, nearest first.
for hit in hits:
  article = articles[hit]
  print(article['headline'])

## References

Articles are provided by [TechCrunch](https://techcrunch.com/).

> Ford has an AI assistant and new hands-free BlueCruise tech on the way
>
> Sean O'Kane
>
> https://techcrunch.com/2026/01/07/ford-has-an-ai-assistant-and-new-hands-free-bluecruise-tech-on-the-way/

> Shopify competitor Swap raises $100M six months after raising $40M
>
> Julie Bort
>  
> https://techcrunch.com/2026/01/07/shopify-competitor-swap-raises-100m-six-months-after-raising-40m/

> US insurance giant Aflac says hackers stole personal and health data of 22.6 million people
>
> Lorenzo Franceschi-Bicchierai
>
> https://techcrunch.com/2025/12/23/us-insurance-giant-aflac-says-hackers-stole-personal-and-health-data-of-22-6-million-people/

> OpenAI unveils ChatGPT Health, says 230 million users ask about health each week
>
> Amanda Silberling
> 
> https://techcrunch.com/2026/01/07/openai-unveils-chatgpt-health-says-230-million-users-ask-about-health-each-week/