In [9]:
import os
from datetime import datetime

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from prefect import flow, task, get_run_logger
from prefect.blocks.system import Secret

In [13]:
# Pipeline configuration.
# The Pinecone API key is taken from the PINECONE_API_KEY environment variable so
# the secret does not have to be pasted into (and saved with) the notebook. When
# the variable is unset the value is the same empty string as before.
pinecone_api = os.environ.get("PINECONE_API_KEY", "")
# Name of the sentence-transformers model passed to HuggingFaceEmbeddings.
huggingface_embeddings_model = "sentence-transformers/all-MiniLM-L6-v2"
# Pinecone index that the embeddings are written to (created on first use).
pinecone_index_name = "stock-data-index-5"

In [2]:
def connect_to_pinecone(pinecone_api):
    """Build a Pinecone client from the given API key and return it.

    Args:
        pinecone_api: API key forwarded to ``Pinecone(api_key=...)``.

    Returns:
        The constructed ``Pinecone`` client.

    Raises:
        Exception: Whatever the ``Pinecone`` constructor raises propagates
            to the caller unchanged.
    """
    return Pinecone(api_key=pinecone_api)

In [3]:
def connect_to_huggingface_embeddings(huggingface_embeddings_model):
    """Instantiate the HuggingFace embedding model identified by name.

    Args:
        huggingface_embeddings_model: Model name passed as ``model_name``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured to run on CPU with
        normalized embeddings.

    Raises:
        Exception: Whatever the ``HuggingFaceEmbeddings`` constructor raises
            propagates to the caller unchanged.
    """
    # No API key is passed; only the model name and the two option dicts.
    model_options = {'device': 'cpu'}  # Use 'cuda' if GPU available
    encode_options = {'normalize_embeddings': True}

    return HuggingFaceEmbeddings(
        model_name=huggingface_embeddings_model,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

In [4]:
def create_text_chunks(df):
    """Turn each row of ``df`` into a text document and split it into chunks.

    Every row becomes one ``Document`` whose text lists ``column: value`` for
    each non-null column (one per line), and whose metadata carries the row's
    ``Ticker``, ``Company_Name``, ``Sector`` and ``Industry`` values. The
    documents are then split with a ``RecursiveCharacterTextSplitter``.

    Args:
        df: DataFrame that must contain the four metadata columns above.

    Returns:
        The list of chunks produced by ``split_documents``.

    Raises:
        Exception: Any error is reported via ``print`` and then re-raised.
    """
    try:
        column_names = df.columns.tolist()

        def _render(record):
            # One "column: value" line per non-null field, trimmed at both ends.
            lines = [
                f"{name}: {record[name]}"
                for name in column_names
                if pd.notna(record[name])
            ]
            return "\n".join(lines).strip()

        # One document per stock record; metadata is a flat dict of the
        # fields later used for filtering.
        documents = [
            Document(
                page_content=_render(record),
                metadata={
                    "Ticker": record['Ticker'],
                    "Company_Name": record['Company_Name'],
                    "Sector": record['Sector'],
                    "Industry": record['Industry'],
                },
            )
            for _, record in df.iterrows()
        ]

        # Splitter configuration: 1000-character chunks with 200 overlap,
        # preferring paragraph, then line, then word boundaries.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )
        chunks = splitter.split_documents(documents)

        print(f"Created {len(chunks)} chunks from {len(df)} stock records")
        return chunks

    except Exception as err:
        print(f"Error creating text chunks: {str(err)}")
        raise

In [5]:
def create_embeddings_with_model(chunks, embeddings_model):
    """Embed the text of every chunk with the supplied embedding model.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` attribute.
        embeddings_model: Object providing ``embed_documents(list_of_texts)``.

    Returns:
        Whatever ``embeddings_model.embed_documents`` returns for the texts.

    Raises:
        Exception: Any error is reported via ``print`` and then re-raised.
    """
    try:
        # Gather the raw text of each chunk, preserving order.
        contents = []
        for piece in chunks:
            contents.append(piece.page_content)

        # The model does the actual embedding work in a single call.
        vectors = embeddings_model.embed_documents(contents)

        print(f"Successfully created embeddings for {len(contents)} chunks")
        return vectors

    except Exception as err:
        print(f"Failed to create embeddings: {str(err)}")
        raise

In [7]:
def _drop_null_metadata(metadata):
    """Return a copy of ``metadata`` without keys whose value is None or NaN."""
    return {
        key: value
        for key, value in metadata.items()
        # ``value != value`` is only true for NaN (covers numpy floats too).
        if value is not None and not (isinstance(value, float) and value != value)
    }


def save_embeddings_to_pinecone(pc, chunks, embeddings, index_name, batch_size=100):
    """Save vector embeddings to Pinecone with metadata and page content.

    Creates the index (cosine metric, serverless on aws/us-east-1, dimension
    taken from the first embedding) when ``index_name`` is not listed by
    ``pc.list_indexes().names()``, then upserts one vector per chunk.

    Args:
        pc: Pinecone client exposing ``list_indexes``, ``create_index`` and
            ``Index``.
        chunks: Sequence of chunk objects with ``metadata`` (dict) and
            ``page_content`` attributes.
        embeddings: Sequence of embedding vectors, one per chunk and in the
            same order as ``chunks``.
        index_name: Name of the Pinecone index to write to.
        batch_size: Number of vectors sent per ``upsert`` call (default 100,
            the previously hard-coded value).

    Returns:
        The string ``"Saved <n> embeddings to Pinecone"``.

    Raises:
        ValueError: If ``batch_size`` is less than 1, or if ``chunks`` and
            ``embeddings`` differ in length (previously the extra items were
            silently dropped by ``zip``).
        Exception: Any other error is reported via ``print`` and re-raised.
    """
    try:
        chunks = list(chunks)
        embeddings = list(embeddings)

        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")

        if len(chunks) != len(embeddings):
            raise ValueError(
                f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
                "they must have the same length"
            )

        # Nothing to write: return before touching the index, because the
        # index dimension cannot be derived from an empty embedding list.
        if not embeddings:
            print(f"No embeddings to save to Pinecone index '{index_name}'")
            return "Saved 0 embeddings to Pinecone"

        # Check if index exists, create if not
        existing_indexes = pc.list_indexes().names()

        if index_name not in existing_indexes:
            print(f"Creating new Pinecone index: {index_name}")
            pc.create_index(
                name=index_name,
                dimension=len(embeddings[0]),
                metric="cosine",
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )
            print("Index created successfully")

        # Get index
        index = pc.Index(index_name)

        # Prepare vectors for upsert
        vectors = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            # ID format kept as-is so re-runs overwrite the same vectors.
            # 'Update_Date' falls back to '' when the chunk metadata lacks it.
            vector_id = f"{chunk.metadata.get('Ticker', 'unknown')}_{i}_{chunk.metadata.get('Update_Date', '')}"

            # Store minimal metadata + page content (no duplication)
            metadata = {
                # Essential metadata for filtering/searching
                'Ticker': chunk.metadata.get('Ticker'),
                'Company_Name': chunk.metadata.get('Company_Name'),
                'Sector': chunk.metadata.get('Sector'),
                'Industry': chunk.metadata.get('Industry'),

                # Full content for LLM context (this contains all the detailed info)
                'content': chunk.page_content,

                # Utility fields
                'chunk_index': i
            }

            vectors.append({
                "id": vector_id,
                "values": embedding,
                # Missing fields are omitted rather than sent as null/NaN,
                # which Pinecone metadata does not accept.
                "metadata": _drop_null_metadata(metadata)
            })

        # Upsert vectors in batches
        total_batches = (len(vectors) + batch_size - 1) // batch_size
        for start in range(0, len(vectors), batch_size):
            batch = vectors[start:start + batch_size]
            index.upsert(vectors=batch)
            print(f"Upserted batch {start//batch_size + 1}/{total_batches}")

        print(f"Successfully saved {len(vectors)} embeddings to Pinecone index '{index_name}'")
        return f"Saved {len(vectors)} embeddings to Pinecone"

    except Exception as e:
        print(f"Failed to save embeddings to Pinecone: {str(e)}")
        raise

# Test: run the pipeline end to end (load CSV, chunk, embed, upsert to Pinecone)

In [10]:
# Location of the stock CSV. The default is the original machine-specific
# absolute path; set the STOCK_DATA_CSV environment variable to point at a
# copy elsewhere without editing the notebook.
stock_data_csv = os.environ.get(
    "STOCK_DATA_CSV",
    '/Users/ani/Projects/6_stock_portfolio_recommendation/data/stock_data.csv',
)
df_sandp500 = pd.read_csv(stock_data_csv)
df_sandp500

Unnamed: 0,Ticker,Company_Name,Sector,Industry,Headquarters_Location,Founded_Year,Annualized_Return,YTD_Pct_Return,2024_Pct_Return,2023_Pct_Return,2022_Pct_Return,2021_Pct_Return,2020_Pct_Return,Market_Cap,Pct_Diff_200_MA,Annualized_Volatility,Sharpe_Ratio,Beta,Years_Since_Founded
0,MSFT,Microsoft,Information Technology,Systems Software,"Redmond, Washington",1975,21.32,10.40,14.50,58.35,-27.69,55.79,22.27,3.421644e+12,10.81,0.27,78.51,1.18,50.0
1,NVDA,Nvidia,Information Technology,Semiconductors,"Santa Clara, California",1993,72.79,-2.29,178.87,246.10,-51.44,124.48,48.40,3.295497e+12,6.61,0.53,137.30,2.07,32.0
2,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1977,20.76,-17.44,35.56,54.80,-28.20,38.06,65.49,2.999856e+12,-10.79,0.30,69.06,1.27,48.0
3,AMZN,Amazon,Consumer Discretionary,Broadline Retail,"Seattle, Washington",1994,10.66,-6.91,46.33,77.04,-50.71,4.64,31.80,2.176468e+12,1.80,0.36,29.80,1.43,31.0
4,GOOG,Alphabet Inc. (Class C),Communication Services,Interactive Media & Services,"Mountain View, California",1998,19.39,-9.22,36.95,57.11,-38.84,67.43,22.35,2.090085e+12,0.16,0.31,62.44,1.24,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,APA,APA Corporation,Energy,Oil & Gas Exploration & Production,"Houston, Texas",1954,10.16,-25.26,-33.86,-15.71,69.08,83.27,21.51,6.137973e+09,-20.98,0.55,18.30,1.39,71.0
499,CZR,Caesars Entertainment,Consumer Discretionary,Casinos & Gaming,"Reno, Nevada",1973,-5.72,-17.52,-30.05,10.93,-55.49,30.63,105.79,5.590180e+09,-22.33,0.56,-10.32,1.97,52.0
500,ENPH,Enphase Energy,Information Technology,Semiconductor Materials & Equipment,"Fremont, California",2006,-7.06,-42.00,-47.67,-47.83,43.65,6.21,194.02,5.430658e+09,-44.18,0.68,-10.33,1.48,19.0
501,BRK.B,Berkshire Hathaway,Financials,Multi-Sector Holdings,"Omaha, Nebraska",1839,,,,,,,,,,,,,


In [11]:
# Build the Pinecone client and the HuggingFace embedding model from the
# configuration values defined earlier in the notebook.
pc = connect_to_pinecone(pinecone_api)
embeddings_model = connect_to_huggingface_embeddings(huggingface_embeddings_model)
# Convert the loaded stock DataFrame into text chunks, then embed every chunk.
# `chunks` and `embeddings` are reused by the cell that saves to Pinecone.
chunks = create_text_chunks(df_sandp500)
embeddings = create_embeddings_with_model(chunks, embeddings_model)

  from .autonotebook import tqdm as notebook_tqdm


Created 503 chunks from 503 stock records
Successfully created embeddings for 503 chunks


In [12]:
# Upsert the chunks and their embeddings into the configured Pinecone index
# (the function creates the index first if it is not listed yet).
save_embeddings_to_pinecone(pc, chunks, embeddings, index_name=pinecone_index_name)

Creating new Pinecone index: stock-data-index-5
Index created successfully
Upserted batch 1/6
Upserted batch 2/6
Upserted batch 3/6
Upserted batch 4/6
Upserted batch 5/6
Upserted batch 6/6
Successfully saved 503 embeddings to Pinecone index 'stock-data-index-5'


'Saved 503 embeddings to Pinecone'