In [2]:
from prefect import flow, task, get_run_logger
from prefect.blocks.system import Secret
from google.oauth2 import service_account
from datetime import datetime
import pandas as pd

from google.cloud import bigquery
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Parameters

In [None]:
pinecone_api = open("/Users/ani/Documents/0_API_KEYS/pinecone.txt").read().strip()
huggingface_embeddings_model = "FinLang/finance-embeddings-investopedia"
pinecone_index_name = "stock-recommendation-app-index-test"

# Define Functions

In [3]:
def connect_to_pinecone(pinecone_api):
    """Initialize Pinecone connection and return client"""    
    try:        
        # Initialize Pinecone client
        pc = Pinecone(api_key=pinecone_api)
        
        return pc
        
    except Exception as e:
        raise

In [4]:
def connect_to_huggingface_embeddings(huggingface_embeddings_model):
    """Initialize HuggingFace embedding model"""
    
    try:
        # Initialize HuggingFace embeddings (no API key required)
        embeddings = HuggingFaceEmbeddings(
            model_name=huggingface_embeddings_model,
            model_kwargs={'device': 'cpu'},  # Use 'cuda' if GPU available
            encode_kwargs={'normalize_embeddings': True}
        )

        return embeddings

    except Exception as e:
        raise

In [13]:
def create_text_chunks(df):
    """Create exactly one text chunk per stock row"""
    
    try:
        chunks = []
        
        # Convert each stock record to a single chunk
        for _, row in df.iterrows():
            doc_content = ""
            for col in df.columns:
                if pd.notna(row[col]):
                    doc_content += f"{col}: {row[col]}\n"
            
            # Create metadata for each document
            metadata = {
                "Ticker": row['Ticker'],
                "Company_Name": row['Company_Name'], 
                "Sector": row['Sector'] if pd.notna(row['Sector']) else None,
                "Industry": row['Industry'] if pd.notna(row['Industry']) else None,
            }

            # Create Document object (no splitting)
            chunk = Document(page_content=doc_content.strip(), metadata=metadata)
            chunks.append(chunk)
        
        print(f"Created {len(chunks)} chunks from {len(df)} stock records (1:1 ratio)")
        return chunks

    except Exception as e:
        print(f"Error creating text chunks: {str(e)}")
        raise

In [14]:
def create_embeddings_with_model(chunks, embeddings_model):
    """Create embeddings for text chunks using the embedding model"""
    
    try:
        # Extract text content from chunks
        texts = [chunk.page_content for chunk in chunks]
        
        # Create embeddings
        embeddings = embeddings_model.embed_documents(texts)

        print(f"Successfully created embeddings for {len(texts)} chunks")
        return embeddings
        
    except Exception as e:
        print(f"Failed to create embeddings: {str(e)}")
        raise

In [15]:
def save_embeddings_to_pinecone(pc, chunks, embeddings, index_name, clear_existing):
    """Save vector embeddings to Pinecone with metadata and page content"""
    
    try:
        existing_indexes = pc.list_indexes().names()
        
        # Clear existing data if requested (recommended for stock data)
        if index_name in existing_indexes and clear_existing:
            print("Clearing existing data from index...")
            index = pc.Index(index_name)
            index.delete(delete_all=True)
            print("Index cleared successfully")

        if index_name not in existing_indexes:
            print(f"Creating new Pinecone index: {index_name}")
            pc.create_index(
                name=index_name,
                dimension=len(embeddings[0]),
                metric="cosine",
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )
            print("Index created successfully")
        
        index = pc.Index(index_name)

        vectors = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            vector_id = f"{chunk.metadata.get('Ticker', 'unknown')}_{i}_{chunk.metadata.get('Update_Date', '')}"
            
            metadata = {
                'Ticker': chunk.metadata.get('Ticker'),
                'Company_Name': chunk.metadata.get('Company_Name'),
                'Sector': chunk.metadata.get('Sector'),
                'Industry': chunk.metadata.get('Industry'),

                'content': chunk.page_content,
                
                'chunk_index': i
            }
            
            vectors.append({
                "id": vector_id,
                "values": embedding,
                "metadata": metadata
            })
        
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            index.upsert(vectors=batch)
            print(f"Upserted batch {i//batch_size + 1}/{(len(vectors) + batch_size - 1)//batch_size}")

        print(f"Successfully saved {len(vectors)} embeddings to Pinecone index '{index_name}'")
        return f"Saved {len(vectors)} embeddings to Pinecone"
        
    except Exception as e:
        print(f"Failed to save embeddings to Pinecone: {str(e)}")
        raise

# Test

In [16]:
df_enriched_stock_data = pd.read_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data//enriched_stock_data.csv')
df_enriched_stock_data

Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Sector,Industry,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change
0,MSFT,487.27,487.27,0.00,16.05,2.07,3.10,11.41,16.65,16.86,...,Technology,Software - Infrastructure,United States,Microsoft Corporation develops and supports so...,0.70,37.656105,32.593310,22828664.0,18533110.0,0.066411
1,NVDA,144.28,149.41,-3.43,12.52,0.30,1.02,22.94,57.59,4.33,...,Technology,Semiconductors,United States,"NVIDIA Corporation, a computing infrastructure...",0.03,46.392320,35.019444,251213522.0,173200140.0,0.217932
2,AAPL,201.95,258.40,-21.85,-9.56,0.47,1.59,2.39,14.62,-16.98,...,Technology,Consumer Electronics,United States,"Apple Inc. designs, manufactures, and markets ...",0.52,31.495320,24.294222,61159585.0,55367100.0,-0.034304
3,AMZN,209.62,242.06,-13.40,2.70,-0.03,-1.68,9.13,7.19,-4.81,...,Consumer Cyclical,Internet Retail,United States,"Amazon.com, Inc. engages in the retail sale of...",0.00,34.122150,34.066666,48931716.0,39057090.0,0.129978
4,GOOGL,163.90,205.89,-20.39,-4.40,-1.64,-7.58,6.37,14.49,-13.27,...,Communication Services,Internet Content & Information,United States,Alphabet Inc. offers various products and plat...,0.50,18.278538,18.298939,40432375.0,36300310.0,-0.070193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,ALB,56.83,313.32,-81.86,-29.27,0.30,-12.11,-0.42,-3.84,-32.51,...,Basic Materials,Specialty Chemicals,United States,Albemarle Corporation provides energy storage ...,2.86,0.000000,30.885870,3520533.0,3122920.0,-0.412241
497,IVZ,14.88,24.58,-39.45,-7.66,0.48,-1.26,2.26,8.32,-13.58,...,Financial Services,Asset Management,United States,Invesco Ltd. is a publicly owned investment ma...,5.67,11.904560,7.873386,5743362.0,5694980.0,-0.032658
498,MHK,102.00,229.74,-55.60,-18.31,2.48,-2.59,-2.96,-0.30,-12.01,...,Consumer Cyclical,"Furnishings, Fixtures & Appliances",United States,"Mohawk Industries, Inc. designs, manufactures,...",0.00,13.333333,9.147983,800620.0,908320.0,-0.117407
499,CZR,28.26,119.49,-76.35,-16.54,0.39,1.29,1.55,-7.11,-13.29,...,Consumer Cyclical,Resorts & Casinos,United States,"Caesars Entertainment, Inc. operates as a gami...",0.00,0.000000,21.089552,5669772.0,5448770.0,-0.278020


In [18]:
pc = connect_to_pinecone(pinecone_api)
embeddings_model = connect_to_huggingface_embeddings(huggingface_embeddings_model)
chunks = create_text_chunks(df_enriched_stock_data)
embeddings = create_embeddings_with_model(chunks, embeddings_model)

Created 501 chunks from 501 stock records (1:1 ratio)
Successfully created embeddings for 501 chunks


In [19]:
save_embeddings_to_pinecone(pc, chunks, embeddings, index_name=pinecone_index_name, clear_existing=True)

Clearing existing data from index...
Index cleared successfully
Upserted batch 1/6
Upserted batch 2/6
Upserted batch 3/6
Upserted batch 4/6
Upserted batch 5/6
Upserted batch 6/6
Successfully saved 501 embeddings to Pinecone index 'stock-recommendation-app-index-test'


'Saved 501 embeddings to Pinecone'