In [16]:
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
from configs import API_KEY, DEFAULT_MODEL, PINECONE_KEY
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

In [2]:
pc = Pinecone(api_key = PINECONE_KEY)

In [3]:
index_name = "my-index"
dimensions = 3
metric = "cosine"

In [5]:
index = pc.Index(name = index_name)

In [6]:
index.upsert([
    ('Dog', [4., 0., 1.]),
    ('Cat', [4., 0., 1.]),
    ('Chicken', [2., 2., 1.]),
    ('Mantis', [6., 2., 3.]),
    ('Elephant', [4., 0., 1.]),
])

UpsertResponse(upserted_count=5, _response_info={'raw_headers': {'date': 'Mon, 09 Feb 2026 05:17:18 GMT', 'content-type': 'application/json', 'content-length': '19', 'connection': 'keep-alive', 'x-pinecone-request-lsn': '1', 'x-pinecone-request-logical-size': '87', 'x-pinecone-request-latency-ms': '253', 'x-envoy-upstream-service-time': '254', 'x-pinecone-response-duration-ms': '255', 'grpc-status': '0', 'server': 'envoy'}})

In [10]:
fw = load_dataset("HuggingFaceFW/fineweb", name = "sample-10BT", split = "train", streaming = True)

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

#### Datasets from - https://huggingface.co/HuggingFaceFW/datasets

In [11]:
fw

IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'],
    num_shards: 15
})

In [12]:
fw.features

{'text': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'dump': Value(dtype='string', id=None),
 'url': Value(dtype='string', id=None),
 'date': Value(dtype='string', id=None),
 'file_path': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'language_score': Value(dtype='float64', id=None),
 'token_count': Value(dtype='int64', id=None)}

In [17]:
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [26]:
[i.name for i in pc.list_indexes()]

['custom-index', 'my-index']

In [41]:
index_name = "text-index"
dimensions = model.get_sentence_embedding_dimension()
metric = "cosine"
pc.create_index(
    name = index_name,
    dimension = dimensions,
    metric = metric,
    spec = ServerlessSpec(
        cloud = "aws",
        region = "us-east-1",
    ))

{
    "name": "text-index",
    "metric": "cosine",
    "host": "text-index-j92o7jv.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-1

In [42]:
index = pc.Index(name = index_name)

In [35]:
subset_size = 10000
vectors_to_upsert = []
for i,item in enumerate(fw):
    if i >= subset_size:
        break

    text = item["text"]
    unique_id = str(item["id"])
    language = item["language"]

    embedding = model.encode(text, show_progress_bar = False).tolist()

    metadata = {"language" : language}

    vectors_to_upsert.append((unique_id, embedding, metadata))
    print(f"Completed {i}", end=("\n" if i%100 == 0 else ""))

Completed 0
Completed 1Completed 2Completed 3Completed 4Completed 5Completed 6Completed 7Completed 8Completed 9Completed 10Completed 11Completed 12Completed 13Completed 14Completed 15Completed 16Completed 17Completed 18Completed 19Completed 20Completed 21Completed 22Completed 23Completed 24Completed 25Completed 26Completed 27Completed 28Completed 29Completed 30Completed 31Completed 32Completed 33Completed 34Completed 35Completed 36Completed 37Completed 38Completed 39Completed 40Completed 41Completed 42Completed 43Completed 44Completed 45Completed 46Completed 47Completed 48Completed 49Completed 50Completed 51Completed 52Completed 53Completed 54Completed 55Completed 56Completed 57Completed 58Completed 59Completed 60Completed 61Completed 62Completed 63Completed 64Completed 65Completed 66Completed 67Completed 68Completed 69Completed 70Completed 71Completed 72Completed 73Completed 74Completed 75Completed 76Completed 77Completed 78Completed 79Completed 80Completed 81Completed 82Completed 83C

In [36]:
print(vectors_to_upsert[:10])

[('<urn:uuid:39147604-bfbe-4ed5-b19c-54105f8ae8a7>', [-0.07137057930231094, -0.012872859835624695, 0.09573531150817871, 0.014749757014214993, -0.005443941336125135, 0.0684562474489212, 0.01221048180013895, 0.01889183558523655, 0.0627390444278717, -0.0015465256292372942, 0.033482179045677185, -0.024711819365620613, -0.07785335183143616, -0.020623115822672844, -0.05137188360095024, -0.05435722693800926, -0.05358939245343208, -0.010994247160851955, -0.09039902687072754, 0.06976143270730972, -0.052547309547662735, -0.11004109680652618, 0.01854798011481762, 0.07281849533319473, 0.03385879844427109, 0.006531236693263054, 0.05077851563692093, 0.023879827931523323, -0.005901089403778315, -0.001958815148100257, -0.039048682898283005, 0.022828489542007446, 0.042945727705955505, 0.0192585326731205, 0.0023721400648355484, 0.027770381420850754, 0.03706520050764084, -0.028697524219751358, 0.055799972265958786, 0.024242304265499115, 0.00032792615820653737, -0.08616069704294205, 0.0004863736394327134,

In [43]:
batch_size = 1000
for i in range(0, len(vectors_to_upsert), batch_size):
    batch = vectors_to_upsert[i:i + batch_size]
    index.upsert(vectors = batch)
    print(f"Upserted batch {i}")
    # print(i)

print("Upsert Complete!!")

Upserted batch 0
Upserted batch 1000
Upserted batch 2000
Upserted batch 3000
Upserted batch 4000
Upserted batch 5000
Upserted batch 6000
Upserted batch 7000
Upserted batch 8000
Upserted batch 9000
Upsert Complete!!
