In [1]:
from prefect import flow, task, get_run_logger
from prefect.blocks.system import Secret
from google.oauth2 import service_account
from datetime import datetime
import pandas as pd

from google.cloud import bigquery
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

In [2]:
def connect_to_pinecone(pinecone_api):
    """Build a Pinecone client authenticated with the supplied API key.

    Args:
        pinecone_api: API key passed to ``Pinecone(api_key=...)``.

    Returns:
        The constructed ``Pinecone`` client.

    Raises:
        Exception: Whatever the ``Pinecone`` constructor raises propagates
            to the caller unchanged.
    """
    # Construction errors are not intercepted; they surface to the caller as-is.
    return Pinecone(api_key=pinecone_api)

In [3]:
def connect_to_huggingface_embeddings(huggingface_embeddings_model, device="cpu"):
    """Initialize a HuggingFace embedding model.

    Args:
        huggingface_embeddings_model: Model name passed as ``model_name``
            (e.g. ``"sentence-transformers/all-MiniLM-L6-v2"``).
        device: Device string forwarded in ``model_kwargs``. Defaults to
            ``"cpu"``; pass ``"cuda"`` when a GPU is available.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance, configured to
        normalize the embeddings it produces.

    Raises:
        Exception: Whatever the ``HuggingFaceEmbeddings`` constructor raises
            propagates to the caller unchanged.
    """
    # No API key is required for HuggingFace embeddings.
    return HuggingFaceEmbeddings(
        model_name=huggingface_embeddings_model,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )

In [4]:
def create_text_chunks(df):
    """Turn each row of a stock DataFrame into a text document and split it into chunks.

    Every row becomes one ``Document`` whose text is a ``"column: value"`` line
    per non-null column, and whose metadata carries the row's ``Ticker``,
    ``Company_Name``, ``Sector`` and ``Industry``. The documents are then split
    with a ``RecursiveCharacterTextSplitter`` (1000-character chunks, 200 overlap).

    Args:
        df: DataFrame with one row per stock; must contain the four metadata
            columns named above.

    Returns:
        The list of chunks returned by the splitter's ``split_documents``.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    try:
        column_names = df.columns.tolist()

        def _row_to_document(record):
            # One "column: value" line per populated column; nulls are skipped.
            lines = [
                f"{name}: {record[name]}"
                for name in column_names
                if pd.notna(record[name])
            ]
            # Flat metadata dict (no nested "metadata" key).
            row_metadata = {
                "Ticker": record['Ticker'],
                "Company_Name": record['Company_Name'],
                "Sector": record['Sector'],
                "Industry": record['Industry'],
            }
            return Document(page_content="\n".join(lines).strip(), metadata=row_metadata)

        stock_documents = [_row_to_document(record) for _, record in df.iterrows()]

        # Separators go from coarse to fine: paragraph, line, word, then character.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )
        split_chunks = splitter.split_documents(stock_documents)

        print(f"Created {len(split_chunks)} chunks from {len(df)} stock records")
        return split_chunks

    except Exception as err:
        print(f"Error creating text chunks: {str(err)}")
        raise

In [5]:
def create_embeddings_with_model(chunks, embeddings_model):
    """Embed the text of every chunk with the given embedding model.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` attribute.
        embeddings_model: Object exposing ``embed_documents(list_of_texts)``.

    Returns:
        Whatever ``embeddings_model.embed_documents`` returns for the chunk texts.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    try:
        # Collect the raw text of each chunk, preserving order.
        chunk_texts = []
        for piece in chunks:
            chunk_texts.append(piece.page_content)

        vectors = embeddings_model.embed_documents(chunk_texts)

        print(f"Successfully created embeddings for {len(chunk_texts)} chunks")
        return vectors

    except Exception as err:
        print(f"Failed to create embeddings: {str(err)}")
        raise

In [11]:
def save_embeddings_to_pinecone(pc, chunks, embeddings, index_name, clear_existing):
    """Upsert chunk embeddings, with their metadata and text, into a Pinecone index.

    If the index does not exist it is created as a serverless index
    (cosine metric, AWS ``us-east-1``) whose dimension is the length of the
    first embedding. If it already exists and ``clear_existing`` is truthy,
    all vectors in it are deleted before the new ones are written.

    Args:
        pc: Pinecone client exposing ``list_indexes()``, ``create_index()``
            and ``Index()``.
        chunks: Sequence of chunk objects with ``metadata`` (dict) and
            ``page_content`` (str) attributes.
        embeddings: Sequence of embedding vectors, paired with ``chunks`` by
            position. Must be non-empty when the index has to be created.
        index_name: Name of the Pinecone index to write to.
        clear_existing: When truthy, wipe an already-existing index first.

    Returns:
        A summary string of the form ``"Saved N embeddings to Pinecone"``.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    try:
        existing_indexes = pc.list_indexes().names()

        if index_name in existing_indexes:
            # Bind the index handle BEFORE clearing: deleting through an
            # unassigned `index` name would raise UnboundLocalError.
            index = pc.Index(index_name)
            if clear_existing:
                print("Clearing existing data from index...")
                index.delete(delete_all=True)
                print("Index cleared successfully")
        else:
            print(f"Creating new Pinecone index: {index_name}")
            pc.create_index(
                name=index_name,
                dimension=len(embeddings[0]),
                metric="cosine",
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )
            print("Index created successfully")
            index = pc.Index(index_name)

        vectors = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            # 'Update_Date' is read from the chunk metadata; it contributes an
            # empty suffix when the key is absent.
            vector_id = f"{chunk.metadata.get('Ticker', 'unknown')}_{i}_{chunk.metadata.get('Update_Date', '')}"

            metadata = {
                'Ticker': chunk.metadata.get('Ticker'),
                'Company_Name': chunk.metadata.get('Company_Name'),
                'Sector': chunk.metadata.get('Sector'),
                'Industry': chunk.metadata.get('Industry'),
                # Store the chunk text so it can be returned with query matches.
                'content': chunk.page_content,
                'chunk_index': i
            }
            # Pinecone does not accept null metadata values; omit the key instead.
            metadata = {key: value for key, value in metadata.items() if value is not None}

            vectors.append({
                "id": vector_id,
                "values": embedding,
                "metadata": metadata
            })

        # Upsert in fixed-size batches to keep each request small.
        batch_size = 100
        total_batches = (len(vectors) + batch_size - 1) // batch_size
        for start in range(0, len(vectors), batch_size):
            batch = vectors[start:start + batch_size]
            index.upsert(vectors=batch)
            print(f"Upserted batch {start//batch_size + 1}/{total_batches}")

        print(f"Successfully saved {len(vectors)} embeddings to Pinecone index '{index_name}'")
        return f"Saved {len(vectors)} embeddings to Pinecone"

    except Exception as e:
        print(f"Failed to save embeddings to Pinecone: {str(e)}")
        raise

# Test

In [12]:
# Load the enriched stock data CSV used to exercise the functions defined above.
# NOTE(review): this is an absolute, machine-specific path — consider reading it
# from a configurable data directory so the notebook runs elsewhere.
df_enriched_stock_data = pd.read_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data//enriched_stock_data.csv')
# Bare last expression so the notebook renders the loaded frame.
df_enriched_stock_data

Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Sector,Industry,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change
0,AAPL,196.58,258.4,-23.92,-12.05,0.48,-2.42,-0.84,15.0,-19.19,...,Technology,Consumer Electronics,United States,"Apple Inc. designs, manufactures, and markets ...",0.53,30.572319,23.655836,61130764,51288240,-0.052581
1,AMZN,212.52,242.06,-12.2,4.29,-1.07,-2.06,14.87,8.02,-3.5,...,Consumer Cyclical,Internet Retail,United States,"Amazon.com, Inc. engages in the retail sale of...",,34.61238,34.5561,48685895,36713730,0.123969
2,GOOG,173.98,207.22,-16.04,0.57,-1.83,-2.05,5.44,16.07,-8.52,...,Communication Services,Internet Content & Information,United States,Alphabet Inc. offers various products and plat...,0.48,19.41741,19.439106,26395583,22489300,-0.034839
3,GOOGL,173.32,205.89,-15.82,1.13,-1.49,-1.57,6.31,16.09,-8.29,...,Communication Services,Internet Content & Information,United States,Alphabet Inc. offers various products and plat...,0.48,19.34375,19.34375,39809440,32413440,-0.035128
4,ABBV,185.49,214.68,-13.6,-0.46,0.01,-1.94,-0.89,15.34,5.31,...,Healthcare,Drug Manufacturers - General,United States,"AbbVie Inc., a research-based biopharmaceutica...",3.54,79.26923,15.291839,7701938,4394080,0.08862
5,ABT,132.41,139.57,-5.13,8.03,0.1,-0.74,-0.49,8.1,17.88,...,Healthcare,Medical Devices,United States,"Abbott Laboratories, together with its subsidi...",1.78,17.173801,25.660854,6692629,4782250,0.252459
6,AXP,296.42,324.79,-8.74,4.63,1.23,-1.7,7.64,20.98,-0.11,...,Financial Services,Credit Services,United States,"American Express Company, together with its su...",1.11,20.699722,19.604498,3185351,2358770,0.286657
7,AMD,126.79,211.38,-40.02,2.12,-0.24,4.16,28.56,15.21,5.11,...,Technology,Semiconductors,United States,"Advanced Micro Devices, Inc. operates as a sem...",,92.54745,24.860785,43686245,51216520,-0.213608
8,T,27.66,28.42,-2.67,14.3,0.04,-0.72,-1.78,9.63,24.0,...,Communication Services,Telecom Services,United States,AT&T Inc. provides telecommunications and tech...,4.01,16.969326,12.348214,37241861,28881250,0.503261
9,ACN,306.38,396.28,-22.69,-9.65,-1.81,-3.23,0.85,8.57,-11.36,...,Technology,Information Technology Services,Ireland,Accenture plc provides strategy and consulting...,1.93,25.258038,21.77541,3365711,2795730,-0.008415


In [13]:
# Test configuration: Pinecone API key, embedding model name and target index name.
# The key is read from a local file rather than hardcoded; the `with` block
# guarantees the file handle is closed once the key has been read.
with open("/Users/ani/Documents/0_API_KEYS/pinecone.txt") as key_file:
    pinecone_api = key_file.read().strip()
huggingface_embeddings_model = "sentence-transformers/all-MiniLM-L6-v2"
pinecone_index_name = "stock-recommendation-app-index-test-7"

In [9]:
# Run the pipeline end to end on the loaded frame:
# connect to Pinecone, load the embedding model, build text chunks, then embed them.
pc = connect_to_pinecone(pinecone_api)
embeddings_model = connect_to_huggingface_embeddings(huggingface_embeddings_model)
chunks = create_text_chunks(df_enriched_stock_data)
embeddings = create_embeddings_with_model(chunks, embeddings_model)

Created 199 chunks from 50 stock records
Successfully created embeddings for 199 chunks


In [14]:
# Write the chunk embeddings to the test index; clear_existing=True requests that
# an already-existing index be emptied first. The returned summary string is displayed.
save_embeddings_to_pinecone(pc, chunks, embeddings, index_name=pinecone_index_name, clear_existing=True)

Creating new Pinecone index: stock-recommendation-app-index-test-7
Index created successfully
Upserted batch 1/2
Upserted batch 2/2
Successfully saved 199 embeddings to Pinecone index 'stock-recommendation-app-index-test-7'


'Saved 199 embeddings to Pinecone'