# Vector embeddings with OpenAI

## Set up OpenAI API

In [1]:
import os

import azure.identity
import dotenv
from openai import OpenAI

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Fail fast on missing configuration. Without this check the f-string below
# would build a base URL containing the literal text "None", and the problem
# would only show up later when a request is made.
missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_SERVICE", AZURE_OPENAI_SERVICE),
        ("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", AZURE_OPENAI_EMBEDDING_DEPLOYMENT),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Authenticate with a token provider built from the Azure Developer CLI
# credential instead of a static API key. AZURE_TENANT_ID is passed through
# as-is and may be unset.
azure_credential = azure.identity.AzureDeveloperCliCredential(tenant_id=os.getenv("AZURE_TENANT_ID"))
token_provider = azure.identity.get_bearer_token_provider(azure_credential,
    "https://cognitiveservices.azure.com/.default")
openai_client = OpenAI(
    base_url=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com/openai/v1",
    api_key=token_provider)

## Vector representations

In [2]:
# Embed a single sentence and keep its vector for inspection in the next cells
sentence = "A dog just walked past my house and yipped yipped like a Martian"

response = openai_client.embeddings.create(
    input=sentence,
    model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
)

# The embedding is read from the first entry of the response data
vector = response.data[0].embedding

In [3]:
# Show the raw embedding values
vector

[-0.034256935119628906,
 0.01204992737621069,
 -0.007885666564106941,
 -0.005639251787215471,
 0.0007455312297679484,
 0.02021416462957859,
 0.027156978845596313,
 0.003426765091717243,
 0.019271312281489372,
 0.041485466063022614,
 -0.03159980848431587,
 0.007321384269744158,
 -0.008042808622121811,
 -0.04651400446891785,
 -0.013849916867911816,
 -0.007349955849349499,
 -0.03374265506863594,
 0.01718560978770256,
 0.010664221830666065,
 0.015171336941421032,
 -0.025999844074249268,
 -0.012021356262266636,
 -0.025814130902290344,
 0.0012214211747050285,
 0.01948559656739235,
 -0.0008334771264344454,
 -0.02235700748860836,
 0.010471365414559841,
 -0.01213564071804285,
 -0.0027089123614132404,
 -0.029142681509256363,
 0.02315700240433216,
 -0.01018565334379673,
 0.03874262422323227,
 0.023771286010742188,
 -0.017785606905817986,
 0.04797114059329033,
 -0.03571407124400139,
 0.019299883395433426,
 -0.014621340669691563,
 0.005117826163768768,
 0.02132844366133213,
 -0.030599815770983696,


In [4]:
# Count how many values the embedding contains
len(vector)

3072

### Document similarity modeled as cosine similarity

In [5]:
import numpy as np
import pandas as pd


def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    The score is the dot product of ``a`` and ``b`` divided by the product of
    their norms as computed by ``np.linalg.norm``.

    Args:
        a: First vector (a sequence of numbers or a NumPy array).
        b: Second vector, the same length as ``a``.

    Returns:
        The similarity score.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Left-hand side of every comparison: the same reference sentence, three times
sentences1 = ['The new movie is awesome'] * 3

# Right-hand side: an identical sentence, a paraphrase, and a string of random characters
sentences2 = [
    'The new movie is awesome',
    'This recent movie is so good',
    'djkshsjdkhfsjdfkhsd',
]

def get_embeddings(sentences):
    """Embed a batch of sentences with one request to the embeddings deployment.

    Args:
        sentences: The texts to embed, passed as the request's ``input``.

    Returns:
        A list holding the ``embedding`` of each item in the response's ``data``.
    """
    result = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    )
    vectors = []
    for item in result.data:
        vectors.append(item.embedding)
    return vectors

# Embed both lists, then score each pair of sentences that shares a position
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

for idx, first_sentence in enumerate(sentences1):
    second_sentence = sentences2[idx]
    score = cosine_similarity(embeddings1[idx], embeddings2[idx])
    print(f"{first_sentence} \t\t {second_sentence} \t\t Score: {score:.4f}")

The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000
The new movie is awesome 		 This recent movie is so good 		 Score: 0.6300
The new movie is awesome 		 djkshsjdkhfsjdfkhsd 		 Score: 0.1257


### Vector search

In [11]:
import json

# Load in vectors for movie titles.
# The encoding is named explicitly so that any non-ASCII titles are not misread
# on platforms whose default text encoding is not UTF-8.
with open('vector_embeddings/openai_movies_embedding3large-256.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [13]:
# Compute vector for query
# NOTE(review): the spelling differs from the top hit "101 Dalmatians" in the saved
# output; presumably intentional to show that near-miss queries still match — confirm.
query = "101 Dalmations"

# dimensions=256 presumably matches the stored movie vectors (the data file name
# ends in "-256") — confirm if the data file changes.
embeddings_response = openai_client.embeddings.create(
    model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT, input=[query], dimensions=256)
# Use a dedicated name: assigning to `vector` here would overwrite the sentence
# embedding created in an earlier cell, so re-running the cells that display it
# would silently show this query embedding instead.
query_vector = embeddings_response.data[0].embedding

# Compute cosine similarity between query and each movie title
scores = [
    (movie, cosine_similarity(query_vector, movie_vector))
    for movie, movie_vector in movie_vectors.items()
]

# Display the top 10 results
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
8,101 Dalmatians,0.974949
335,102 Dalmatians,0.904177
15,The Aristocats,0.657329
6,Lady and the Tramp,0.631367
72,Oliver & Company,0.626305
488,Old Dogs,0.622776
468,Beverly Hills Chihuahua,0.620266
431,The Shaggy Dog,0.594158
140,Homeward Bound: The Incredible Journey,0.59265
354,Snow Dogs,0.578238
