<a href="https://colab.research.google.com/github/bhogaai-agenticai-sathishkumar/genaicourse/blob/main/helloworld_pinecone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab's secrets store keeps credentials out of the notebook source.
from google.colab import userdata

# Load the API key from Colab secrets.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
# Never echo the key itself: cell output is saved with the notebook and is
# visible to anyone the notebook is shared with. Only confirm it was loaded.
print("PINECONE_API_KEY = " + ("[REDACTED]" if PINECONE_API_KEY else "[NOT SET]"))
# Print divider line
print("*"*50)

# The environment name is not a credential, so it is safe to display.
PINECONE_ENVIRONMENT = userdata.get('PINECONE_ENVIRONMENT')
print("PINECONE_ENVIRONMENT = "+PINECONE_ENVIRONMENT)
# Print divider line
print("*"*50)

PINECONE_API_KEY = [REDACTED]
**************************************************
PINECONE_ENVIRONMENT = gcp-starter
**************************************************


In [None]:
import os

# Show which files are available in the sample_data directory.
sample_files = os.listdir('/content/sample_data')
print(sample_files)

['anscombe.json', 'README.md', 'mnist_train_small.csv', 'mnist_test.csv', 'california_housing_test.csv', 'california_housing_train.csv']


In [None]:
!pip install pinecone



In [None]:
import os
from pinecone import Pinecone

# Build the Pinecone client from the credentials loaded from Colab secrets.
# NOTE(review): `environment` looks like a legacy setting; confirm the installed
# SDK version still reads it, since the index below is created from a ServerlessSpec.
client_settings = {
    "api_key": PINECONE_API_KEY,
    "environment": PINECONE_ENVIRONMENT,
}
pinecone = Pinecone(**client_settings)

print("Pinecone client initialized successfully!")

Pinecone client initialized successfully!


## Load and prepare data

### Subtask:
Load the data from `california_housing_test.csv` into a pandas DataFrame and prepare it for vectorization.


**Reasoning**:
Import pandas and load the CSV file into a DataFrame, then display the head to inspect the data.



In [None]:
import pandas as pd

# Read the California housing test split that ships in sample_data,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/california_housing_test.csv')
display(df.head(n=5))

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


# Task
Write code that creates an index in Pinecone and ingests the data from `/content/sample_data/california_housing_test.csv`.

## Load and prepare data

### Subtask:
Load the data from `california_housing_test.csv` into a pandas DataFrame and prepare it for vectorization.


**Reasoning**:
Check for missing values in the DataFrame to understand the data quality before vectorization.



In [None]:
# Count missing values per column (isna is the same check as isnull).
display(df.isna().sum())

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
median_house_value,0


## Vectorize data

### Subtask:
Use a suitable method (e.g., an embedding model) to vectorize the relevant columns of the DataFrame.


**Reasoning**:
Select the numerical columns, convert them to a NumPy array, and then standardize each column with StandardScaler so that every row becomes a fixed-length numeric vector.



In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that make up each row's vector.
numerical_cols = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]
numerical_data = df[numerical_cols].to_numpy()

# Fit the scaler on the full array, then transform that same array. The fitted
# `scaler` stays in scope so scaled values can be mapped back later.
scaler = StandardScaler().fit(numerical_data)
vectorized_data = scaler.transform(numerical_data)

## Create pinecone index

### Subtask:
Create a new index in Pinecone with a specified dimension and metric.


**Reasoning**:
Define the index name, get the dimension from the vectorized data, and check if the index exists before creating it.



In [None]:
from pinecone import ServerlessSpec, PineconeApiException, NotFoundException

index_name = "california-housing-index"
# One vector dimension per scaled feature column.
dimension = vectorized_data.shape[1]

try:
    # Compare against the index *names*. Testing the string directly against
    # the list_indexes() result never matched, so an existing index was not
    # deleted and create_index failed with 409 ALREADY_EXISTS.
    existing_index_names = pinecone.list_indexes().names()
    if index_name in existing_index_names:
        print(f"Index '{index_name}' already exists. Deleting it...")
        pinecone.delete_index(index_name)
        print(f"Index '{index_name}' deleted.")

    print(f"Attempting to create index '{index_name}'...")
    pinecone.create_index(
        index_name,
        dimension=dimension,
        metric="euclidean",
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    print(f"Index '{index_name}' created successfully with dimension {dimension}.")

except (PineconeApiException, NotFoundException) as e:
    # API-level failures (conflict, not found, quota, ...) are reported, not raised,
    # so the rest of the notebook can still be inspected.
    print(f"Failed to create or delete index '{index_name}'. Error: {e}")
    print("Caveat: Index creation/deletion on the free tier can be sensitive to region availability.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Attempting to create index 'california-housing-index'...
Failed to create or delete index 'california-housing-index'. Error: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-04', 'x-cloud-trace-context': '56328949bd9cfcbc0150f96e515b48f0', 'date': 'Sun, 17 Aug 2025 13:04:53 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}

Caveat: Index creation/deletion on the free tier can be sensitive to region availability.


In [None]:
# Open a handle to the index and load every scaled row into it.
index = pinecone.Index(index_name)

# Each record is an (id, values) pair; the id is the row position as a string.
vector_ids = map(str, range(len(vectorized_data)))
data_to_upsert = list(zip(vector_ids, vectorized_data.tolist()))

# Send the records in slices of batch_size.
batch_size = 100
batch_starts = range(0, len(data_to_upsert), batch_size)
for start in batch_starts:
    index.upsert(vectors=data_to_upsert[start:start + batch_size])

# Compare the vector count reported by the index with the DataFrame row count.
index_stats = index.describe_index_stats()
vector_count = index_stats.total_vector_count
row_count = len(df)
print(f"Upsert complete. Number of vectors in index: {vector_count}")
print(f"Number of rows in original DataFrame: {row_count}")

if vector_count != row_count:
    print("Warning: Number of vectors in index does not match the number of rows in the DataFrame.")
else:
    print("Number of vectors in index matches the number of rows in the DataFrame.")

print("*"*100)

Upsert complete. Number of vectors in index: 3000
Number of rows in original DataFrame: 3000
Number of vectors in index matches the number of rows in the DataFrame.
****************************************************************************************************


In [None]:
# Read a small sample of vectors back from the index and map them to the
# original feature scale. In a real application you would normally retrieve
# vectors through a query rather than by enumerating IDs.
SAMPLE_SIZE = 10  # number of vectors to read back for the demonstration

index = pinecone.Index(index_name)
index_stats = index.describe_index_stats()
total_vectors = index_stats.total_vector_count

# IDs were assigned as the stringified row position at upsert time, so the
# first SAMPLE_SIZE IDs are "0", "1", ... (fewer if the index is smaller).
sample_ids = [str(i) for i in range(min(total_vectors, SAMPLE_SIZE))]

fetched_vectors = []
if sample_ids:
    try:
        # fetch() accepts a list of IDs, so one request covers the whole
        # sample instead of one network round trip per vector.
        response = index.fetch(ids=sample_ids)
        # Walk the IDs in request order so rows keep their original ordering.
        for vector_id in sample_ids:
            if vector_id in response.vectors:
                fetched_vectors.append(response.vectors[vector_id].values)
            else:
                print(f"Vector with id {vector_id} not found.")
    except Exception as e:
        print(f"Error fetching vectors {sample_ids[0]}-{sample_ids[-1]}: {e}")

if fetched_vectors:
    # Inverse transform the fetched vectors to get the original data
    recreated_data = scaler.inverse_transform(fetched_vectors)

    # Create a new DataFrame from the recreated data
    recreated_df = pd.DataFrame(recreated_data, columns=numerical_cols)

    print(f"\nRecreated DataFrame (first {SAMPLE_SIZE} rows):")
    display(recreated_df.head(SAMPLE_SIZE))
else:
    print("No vectors fetched to recreate the DataFrame.")


Recreated DataFrame (first 10 rows):


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3884.999988,660.999997,1536.999998,605.999998,6.6085,344700.004339
1,-118.3,34.26,43.0,1510.0001,309.999995,808.999993,276.999994,3.599,176500.000151
2,-117.81,33.78,27.0,3588.999999,507.0,1483.999997,495.0,5.7934,270499.998388
3,-118.36,33.82,28.0,66.999962,14.999988,49.000045,10.999993,6.1359,329999.998454
4,-119.67,36.33,19.0,1241.000057,244.000002,849.999975,237.000001,2.9375,81699.994711
5,-119.56,36.51,37.0,1017.999987,212.999988,662.999983,204.000004,1.6635,67000.001268
6,-121.43,38.63,43.0,1008.999943,225.000005,604.000028,217.999995,1.6641,67000.001268
7,-120.65,35.48,19.0,2310.000008,470.999999,1341.0,441.000001,3.225,166899.998764
8,-122.84,38.4,15.000001,3079.999996,617.0,1446.0,599.000003,3.6696,194399.999787
9,-118.02,34.08,31.0,2401.999999,632.000003,2830.000029,602.999995,2.3333,164199.999703
