In [2]:
import config
import time

api_key = config.PINECONE_API_KEY

# Import the Pinecone library
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Initialize a Pinecone client with your API key
pc = Pinecone(api_key=api_key)

In [None]:
# Define a sample dataset where each item has a unique ID and piece of text
data = [
    {"id": "run1", "text": "She decided to run a marathon after months of training."},
    {"id": "run2", "text": "The software program needs to run for at least 30 minutes to complete the test."},
    {"id": "run3", "text": "He started a small business that has been running successfully for five years."},
    {"id": "run4", "text": "The river runs through the valley, providing water to nearby towns."},
    {"id": "run5", "text": "The machine is designed to run on solar power."},
    {"id": "run6", "text": "A bad run of luck caused the team to lose several games in a row."},
    {"id": "run7", "text": "She had to run to catch the last bus of the night."},
    {"id": "run8", "text": "They decided to run a series of experiments to test the new hypothesis."},
    {"id": "run9", "text": "His term as mayor is coming to an end after a successful two-year run."},
    {"id": "run10", "text": "The company is expected to run out of supplies if they don't reorder soon."},
    {"id": "run11", "text": "The colors in the fabric began to run after the shirt was washed."},
    {"id": "run12", "text": "The actor had a long run on the popular TV show."},
    {"id": "run13", "text": "The students were asked to run a simulation of the system to observe its behavior."},
    {"id": "run14", "text": "The train runs every 30 minutes from the station."},
    {"id": "run15", "text": "She felt a run in her stocking just before the big meeting."},
    {"id": "run16", "text": "The car has a range of 300 miles on a full tank when it's running smoothly."},
    {"id": "run17", "text": "He was worried that the negotiations might run over the deadline."},
    {"id": "run18", "text": "The team made a strong run for the championship this season."},
    {"id": "run19", "text": "They had to run a wire from the power source to the device to complete the circuit."},
    {"id": "run20", "text": "The story has been running in the newspaper for the past few weeks."},
    {"id": "run21", "text": "The movie had a successful run at the box office, breaking multiple records."},
    {"id": "run22", "text": "He needs to run some errands before heading to the office."},
    {"id": "run23", "text": "The team is on a winning run, having secured five victories in a row."},
    {"id": "run24", "text": "They decided to run the advertisement during the evening news."},
    {"id": "run25", "text": "The printer started to run out of ink halfway through the job."},
    {"id": "run26", "text": "She felt her heart race as she made a mad run for the finish line."},
    {"id": "run27", "text": "The engine will run more efficiently if you clean the fuel injectors."},
    {"id": "run28", "text": "They run a charity organization that helps the homeless."},
    {"id": "run29", "text": "The stock market is currently on a bull run, with prices steadily increasing."},
    {"id": "run30", "text": "They need to run a background check before offering him the job."},
    {"id": "run31", "text": "He realized he had left the tap running all night."},
    {"id": "run32", "text": "She went for a quick run around the block to clear her mind."},
    {"id": "run33", "text": "The committee decided to run the event online instead of in person."},
    {"id": "run34", "text": "The rumor started to run through the office, causing a lot of speculation."},
    {"id": "run35", "text": "Their production run for the new product was completed last week."},
    {"id": "run36", "text": "She’s been running simulations to test the new algorithm."},
    {"id": "run37", "text": "The politician decided to run for office in the upcoming election."},
    {"id": "run38", "text": "They had to run a diagnostic to figure out what was wrong with the system."},
    {"id": "run39", "text": "He plans to run the data analysis overnight due to the large dataset."},
    {"id": "run40", "text": "The cable runs behind the wall, so you won’t see any wires."}
]

# Convert the text into numerical vectors that Pinecone can index
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[d['text'] for d in data],
    parameters={"input_type": "passage", "truncate": "END"}
)

print(embeddings)

EmbeddingsList(
  model='multilingual-e5-large',
  data=[
    {'values': [0.058074951171875, -0.01288604736328125, ..., -0.0243072509765625, 0.0236663818359375]},
    {'values': [0.01004791259765625, -0.035614013671875, ..., -0.0428466796875, -0.005855560302734375]},
    ... (16 more embeddings) ...,
    {'values': [0.025390625, 0.00762176513671875, ..., -0.0279998779296875, -0.0231475830078125]},
    {'values': [0.0300140380859375, 0.00909423828125, ..., -0.0279693603515625, -0.0224151611328125]}
  ],
  usage={'total_tokens': 380}
)


In [9]:
# Create a serverless index
index_name = "example-index"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [10]:
# Target the index where you'll store the vector embeddings
index = pc.Index("example-index")

# Prepare the records for upsert
# Each contains an 'id', the embedding 'values', and the original text as 'metadata'
records = []
for d, e in zip(data, embeddings):
    records.append({
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    })

# Upsert the records into the index
index.upsert(
    vectors=records,
    namespace="example-namespace"
)

time.sleep(10)  # Wait for the upserted vectors to be indexed

print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'example-namespace': {'vector_count': 20}},
 'total_vector_count': 20}


In [11]:
# Define your query
query = "What does run mean?"

# Convert the query into a numerical vector that Pinecone can search with
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

# Search the index for the three most similar vectors
results = index.query(
    namespace="example-namespace",
    vector=query_embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)


{'matches': [{'id': 'run16',
              'metadata': {'text': 'The car has a range of 300 miles on a full '
                                   "tank when it's running smoothly."},
              'score': 0.7420962,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'run11',
              'metadata': {'text': 'The colors in the fabric began to run '
                                   'after the shirt was washed.'},
              'score': 0.74083894,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'run4',
              'metadata': {'text': 'The river runs through the valley, '
                                   'providing water to nearby towns.'},
              'score': 0.73837006,
              'sparse_values': {'indices': [], 'values': []},
              'values': []}],
 'namespace': 'example-namespace',
 'usage': {'read_units': 6}}


In [14]:
# Define your query
query = "Give me the top sentences that use the word run as a verb meaning to move quickly on foot."

# Convert the query into a numerical vector that Pinecone can search with
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

# Search the index for the three most similar vectors
results = index.query(
    namespace="example-namespace",
    vector=query_embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': 'run11',
              'metadata': {'text': 'The colors in the fabric began to run '
                                   'after the shirt was washed.'},
              'score': 0.74563885,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'run18',
              'metadata': {'text': 'The team made a strong run for the '
                                   'championship this season.'},
              'score': 0.7443016,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'run15',
              'metadata': {'text': 'She felt a run in her stocking just before '
                                   'the big meeting.'},
              'score': 0.7442411,
              'sparse_values': {'indices': [], 'values': []},
              'values': []}],
 'namespace': 'example-namespace',
 'usage': {'read_units': 6}}


In [6]:
pc.delete_index(index_name)