# Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
from sentence_transformers import SentenceTransformer
import json
import re
from typing import Dict, List, Optional, Any
import pandas_gbq



In [4]:
# Location of the text file that holds the Groq API key.
# NOTE(review): absolute, user-specific path — consider an environment variable
# or a path relative to the project so the notebook runs on other machines.
GROQ_API_KEY_PATH = "/Users/ani/Documents/0_API_KEYS/groq.txt"

# Read the key inside a context manager so the file handle is always closed.
with open(GROQ_API_KEY_PATH, encoding="utf-8") as key_file:
    groq_api = key_file.read().strip()

groq_llm_model = "llama-3.3-70b-versatile"

# Query from CSV

In [5]:
# Disabled alternative data source: read the enriched stock data from a local CSV and drop `Update_Date`.
# df_enriched_stock_data_w_sentiment = pd.read_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data/enriched_stock_data_w_sentiment.csv')
# df_enriched_stock_data_w_sentiment = df_enriched_stock_data_w_sentiment.drop('Update_Date', axis=1)
# df_enriched_stock_data_w_sentiment

# Query from BigQuery

In [7]:
def load_table_from_bigquery(dataset_id, table_id, project_id):
    """Read one whole BigQuery table into a DataFrame.

    Args:
        dataset_id: Dataset part of the table reference.
        table_id: Table part of the table reference.
        project_id: Forwarded to ``pandas_gbq.read_gbq`` as ``project_id``.

    Returns:
        Whatever ``pandas_gbq.read_gbq`` returns for a ``SELECT *`` over
        ``dataset_id.table_id``.
    """
    # Back-ticks quote the dataset-qualified table name in the query text.
    sql = "SELECT * FROM `{}.{}`".format(dataset_id, table_id)
    return pandas_gbq.read_gbq(sql, project_id=project_id)

# Load the `stock_data.stock_data` table and show it as the cell output.
df_enriched_stock_data_w_sentiment = load_table_from_bigquery(
    dataset_id='stock_data',
    table_id='stock_data',
    project_id="capable-arbor-293714",
)
df_enriched_stock_data_w_sentiment

Downloading: 100%|██████████|


Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change,Update_Date,Sentiment
0,CTVA,77.12,77.12,0.00,24.03,0.16,5.23,11.75,19.90,37.58,...,United States,"Corteva, Inc. operates in the agriculture busi...",0.91,46.457832,24.024923,3929798,5160870,0.470633,2025-07-04,neutral
1,CF,94.65,111.25,-14.92,12.07,0.85,-0.03,7.65,23.63,11.61,...,United States,"CF Industries Holdings, Inc., together with it...",2.17,12.519841,15.315535,2917270,3787930,0.349444,2025-07-04,neutral
2,MOS,37.27,72.25,-48.41,33.69,-1.43,4.87,5.53,20.85,55.21,...,United States,"The Mosaic Company, through its subsidiaries, ...",2.41,32.129310,15.150407,5676824,5383670,0.371229,2025-07-04,neutral
3,VMC,266.24,291.14,-8.55,2.87,0.20,0.67,-2.42,14.42,4.60,...,United States,Vulcan Materials Company produces and supplies...,0.75,37.445850,29.321585,1092090,1132540,0.090209,2025-07-04,neutral
4,MLM,557.89,617.09,-9.59,4.40,-0.14,1.00,-0.87,17.76,9.61,...,United States,"Martin Marietta Materials, Inc., a natural res...",0.58,31.952465,26.770155,444847,478580,0.039405,2025-07-04,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,PNW,90.42,94.52,-4.34,2.08,0.28,0.28,-2.31,6.61,9.40,...,United States,"Pinnacle West Capital Corporation, through its...",4.00,17.904950,19.320513,1289762,1499870,0.192090,2025-07-04,neutral
495,ATO,152.28,161.76,-5.86,4.36,0.51,-2.66,-3.75,9.33,11.37,...,United States,"Atmos Energy Corporation, together with its su...",2.26,21.268156,21.327732,1126109,893440,0.328564,2025-07-04,neutral
496,NI,39.56,40.94,-3.37,6.04,0.38,-2.37,0.13,12.45,10.40,...,United States,"NiSource Inc., an energy holding company, oper...",2.78,21.383783,21.268818,4582860,4733460,0.386125,2025-07-04,neutral
497,AWK,140.05,175.80,-20.34,2.91,-0.19,-1.50,-2.62,2.70,14.46,...,United States,"American Water Works Company, Inc., through it...",2.38,25.510020,24.570177,1346942,1242620,0.077225,2025-07-04,neutral


# Define Functions

In [8]:
def _parse_json_object(content: str) -> Optional[Dict]:
    """Extract and decode the JSON object embedded in an LLM reply.

    Returns the decoded object, or None when the text contains no ``{...}`` span.
    Raises ``json.JSONDecodeError`` when a span is found but cannot be decoded.
    """
    # Remove any markdown code fences wrapped around the JSON
    content = re.sub(r'```json\s*', '', content)
    content = re.sub(r'```\s*', '', content)

    # Greedy match: from the first opening brace to the last closing brace
    json_match = re.search(r'\{.*\}', content, re.DOTALL)
    if not json_match:
        return None

    json_str = json_match.group()
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # The prompt template itself contains `//` comment lines, which the model
        # may echo back. Retry once with whole-line `//` comments removed.
        without_comments = re.sub(r'^[ \t]*//.*$', '', json_str, flags=re.MULTILINE)
        return json.loads(without_comments)


def extract_preferences_with_groq(query: str,
                                  all_columns: List[str], 
                                  numerical_columns: List[str], 
                                  non_numerical_columns: List[str], 
                                  groq_client,
                                  model: str = "llama-3.3-70b-versatile") -> Dict:
    """Extract user preferences from natural language using Groq LLM.

    Args:
        query: Free-text investment request from the user.
        all_columns: Every column name, listed verbatim in the prompt.
        numerical_columns: Numerical column names, listed verbatim in the prompt.
        non_numerical_columns: Non-numerical column names; those named
            sector/industry/country (case-insensitive) become the keys of the
            ``categorical_filters`` template shown to the model.
        groq_client: Object exposing ``chat.completions.create(...)``.
        model: Chat model name sent to the API (defaults to the previously
            hard-coded value).

    Returns:
        The decoded JSON object from the model's reply, or ``{}`` when the call
        fails, the reply has no JSON object, or the JSON cannot be decoded.
        Problems are reported with ``print`` rather than raised.
    """
    
    # Build the categorical filter template from the columns that can be filtered on
    filterable_names = ('sector', 'industry', 'country')
    categorical_filters = {col: "value or null"
                           for col in non_numerical_columns
                           if col.lower() in filterable_names}
    
    # Create prompt for Groq LLM
    prompt = f"""
    Analyze this stock investment query and extract preferences: "{query}"
    
    Available stock data columns: {all_columns}
    Available numerical columns for analysis: {numerical_columns}
    Available categorical columns for filtering: {non_numerical_columns}
    
    Extract preferences and return ONLY a valid JSON object with this exact structure:
    {{
        "categorical_filters": {json.dumps(categorical_filters, indent=12)},

        "numerical_preferences": {{
            "risk_level": "low/medium/high or null",
            "return_preference": "low/medium/high or null",
            "sentiment_preference": "extreme_negative/negative/neutral/positive/extreme_positive or null",
            "market_cap_preference": "small/medium/large or null",
            "dividend_preference": "low/medium/high or null",
            "volatility_preference": "low/medium/high or null",
            "growth_preference": "low/medium/high or null",
            "valuation_preference": "undervalued/fairly_valued/overvalued or null"
        }},

        "feature_weights": {{
            // Assign weights 0.0-1.0 for numerical columns based on query importance
            // Only include columns mentioned or relevant to the query
        }},

        "investment_style": "value_investing/growth_investing/dividend_investing/momentum_investing/income_investing or null",
        
        "time_horizon": "short_term/medium_term/long_term or null"
    }}
    
    Guidelines:
    - Only extract explicitly mentioned preferences from the query
    - Use null for unmentioned criteria
    - For categorical_filters: match query terms to available categorical columns
    - For numerical_preferences: interpret investment terms (low risk, high return, etc.)
    - For feature_weights: assign higher weights (0.7-1.0) to numerical columns that are important for the query
    - Risk preferences: low risk = prefer stability, lower volatility
    - Return preferences: high return = prefer higher growth/returns
    - Be conservative - only extract clear, explicit preferences
    - Do not include any explanatory text, only return the JSON object
    """
    
    try:
        response = groq_client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system", 
                    "content": "You are a financial analyst. Extract investment preferences from queries and return only valid JSON. No explanations, just JSON."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            temperature=0.1,
            max_tokens=1000
        )
        
        # Extract JSON from response
        content = response.choices[0].message.content.strip()
        preferences = _parse_json_object(content)
        if preferences is None:
            print("No valid JSON found in Groq response")
            print(f"Response content: {content}")
            return {}
        return preferences
            
    except Exception as e:
        # Best effort by design: callers treat an empty dict as "nothing extracted".
        print(f"Error extracting preferences with Groq: {e}")
        return {}

In [None]:
user_query = "Recommend me 3 stocks that have a high yield, low risk and have an annualized return of at least 20%. Technology sector, long term"

non_numerical_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Country', 'Business_Summary', 'Sentiment']
# The loaded frame is `df_enriched_stock_data_w_sentiment`; `df_enriched_stock_data`
# is not defined anywhere in the visible notebook and raised NameError on a fresh kernel.
# `select_dtypes` keeps only numeric dtypes so leftovers such as `Update_Date`
# are not treated as numerical features.
numerical_columns = (
    df_enriched_stock_data_w_sentiment
    .drop(columns=non_numerical_columns)
    .select_dtypes(include="number")
    .columns.tolist()
)
groq_client = Groq(api_key=groq_api)

preferences = extract_preferences_with_groq(query=user_query, all_columns=non_numerical_columns+numerical_columns, non_numerical_columns=non_numerical_columns, numerical_columns=numerical_columns, groq_client=groq_client)
preferences

In [None]:

def _clean_filter_terms(value) -> List[str]:
    """Normalise one filter value (string or list of strings) to non-empty search terms."""
    candidates = value if isinstance(value, (list, tuple, set)) else [value]
    terms = []
    for candidate in candidates:
        # Non-string entries (None, numbers, nested objects) carry no usable text.
        if not isinstance(candidate, str):
            continue
        text = candidate.strip()
        if text and text.lower() != 'null':
            terms.append(text)
    return terms


def filter_stocks_by_categories(df: pd.DataFrame, preferences: Dict) -> pd.DataFrame:
    """Filter stocks based on categorical preferences.

    Args:
        df: Stock data; it is copied, never modified.
        preferences: Extracted preferences. Only ``preferences['categorical_filters']``
            is read: a mapping of column name to a search string or a list of
            search strings.

    Returns:
        The rows whose column text contains the search string (case-insensitive,
        literal substring match) for every applied filter. Filters are skipped
        when the column is absent from ``df`` or the value is empty, ``'null'``,
        or not a string. With a list of strings, a row matches if it contains
        any of them.
    """
    
    filtered_df = df.copy()
    # `.get(key, {})` still returns None when the JSON held `"categorical_filters": null`
    categorical_filters = preferences.get('categorical_filters') or {}
    if not isinstance(categorical_filters, dict):
        return filtered_df
    
    for column, value in categorical_filters.items():
        terms = _clean_filter_terms(value)
        if not terms or column not in filtered_df.columns:
            continue

        column_text = filtered_df[column]
        mask = pd.Series(False, index=filtered_df.index)
        for term in terms:
            # regex=False: the term is model-generated text, not a pattern, so
            # characters such as "(" or "+" must match literally.
            mask = mask | column_text.str.contains(term, case=False, na=False, regex=False)

        filtered_df = filtered_df[mask]
        print(f"Filtered by {column}='{value}': {len(filtered_df)} stocks remaining")
    
    return filtered_df

In [None]:
# Filter the loaded frame; `df_enriched_stock_data` is not defined anywhere in
# the visible notebook, the loaded data is `df_enriched_stock_data_w_sentiment`.
filtered_dataframe = filter_stocks_by_categories(df_enriched_stock_data_w_sentiment, preferences)
filtered_dataframe

In [None]:

def _is_stated_preference(value) -> bool:
    """True for a non-blank string other than the literal 'null' (any case)."""
    return isinstance(value, str) and bool(value.strip()) and value.strip().lower() != 'null'


def create_user_preference_vector_with_embeddings(user_query: str, 
                                                 preferences: Dict, 
                                                 numerical_columns: List[str],
                                                 embedding_model) -> np.ndarray:
    """Create user preference vector using finance embeddings and extracted preferences.

    A context sentence is built from the query plus every stated preference.
    Each numerical column is then scored by the cosine similarity between the
    embedding of that context and the embedding of
    ``"Financial metric: <column>. <context>"``. Scores are min-max normalised.

    Args:
        user_query: Original free-text query.
        preferences: Extracted preferences; ``numerical_preferences``,
            ``investment_style`` and ``time_horizon`` are read. Values that are
            missing, not strings, blank or ``'null'`` are ignored.
        numerical_columns: Feature names; the result has one entry per name, in
            the same order.
        embedding_model: Object whose ``encode(list_of_str)`` returns one
            embedding per input string.

    Returns:
        1-D array with values in [0, 1] (all zeros when every score is equal);
        an empty array when ``numerical_columns`` is empty.
    """
    
    # `.get(key, {})` would still return None for an explicit JSON null
    numerical_prefs = preferences.get('numerical_preferences') or {}
    if not isinstance(numerical_prefs, dict):
        numerical_prefs = {}
    investment_style = preferences.get('investment_style')
    time_horizon = preferences.get('time_horizon')
    
    # Build financial context strings
    financial_contexts = [f"Investment query: {user_query}"]
    
    for pref_type, pref_value in numerical_prefs.items():
        if _is_stated_preference(pref_value):
            financial_contexts.append(f"{pref_type.replace('_', ' ')}: {pref_value}")
    
    if _is_stated_preference(investment_style):
        financial_contexts.append(f"Investment style: {investment_style}")
    
    if _is_stated_preference(time_horizon):
        financial_contexts.append(f"Time horizon: {time_horizon}")
    
    combined_context = ". ".join(financial_contexts)
    
    # Nothing to score; min()/max() below would fail on an empty array
    if len(numerical_columns) == 0:
        return np.array([], dtype=float)
    
    query_embedding = embedding_model.encode([combined_context])[0]
    
    # Embed all feature descriptions in one batch instead of one call per feature
    feature_contexts = [f"Financial metric: {feature}. {combined_context}"
                        for feature in numerical_columns]
    feature_embeddings = embedding_model.encode(feature_contexts)
    
    # One similarity call: row 0 holds query-vs-feature scores in column order
    user_vector = np.asarray(cosine_similarity([query_embedding], feature_embeddings)[0])
    
    # Note: an earlier experiment blended these scores with explicit
    # low/medium/high preference values per risk/return/volatility feature; it
    # was disabled in the original and is not part of this function.
    
    # Min-max normalise; the epsilon avoids division by zero when all scores are equal
    lowest = user_vector.min()
    user_vector = (user_vector - lowest) / (user_vector.max() - lowest + 1e-8)
    
    return user_vector

In [None]:
# Instantiate the sentence-embedding model, then score every numerical column
# against the query and extracted preferences.
embedding_model = SentenceTransformer('FinLang/finance-embeddings-investopedia')

user_vector = create_user_preference_vector_with_embeddings(
    user_query=user_query,
    preferences=preferences,
    numerical_columns=numerical_columns,
    embedding_model=embedding_model,
)
user_vector

In [None]:

def calculate_similarity_and_recommend(df: pd.DataFrame, 
                                     user_vector: np.ndarray, 
                                     numerical_columns: List[str], 
                                     top_n: int) -> pd.DataFrame:
    """Calculate cosine similarity between user preferences and stocks.

    Rows with a missing value in any of ``numerical_columns`` are dropped, the
    remaining features are min-max scaled, and each stock is scored by cosine
    similarity against ``user_vector``.

    Args:
        df: Candidate stocks.
        user_vector: One preference weight per entry of ``numerical_columns``,
            expected to already be on a 0-1 scale, the same range as the scaled
            features.
        numerical_columns: Feature columns used for the comparison.
        top_n: Maximum number of rows to return.

    Returns:
        Up to ``top_n`` rows ordered by descending ``Similarity_Score``, limited
        to the identifying columns that exist plus the numerical columns, rounded
        to 4 decimals. An empty DataFrame when nothing can be scored.

    Raises:
        ValueError: If ``user_vector`` and ``numerical_columns`` differ in length.
    """
    
    user_vector = np.asarray(user_vector, dtype=float).ravel()
    if user_vector.shape[0] != len(numerical_columns):
        raise ValueError(
            f"user_vector has {user_vector.shape[0]} entries but "
            f"{len(numerical_columns)} numerical columns were given"
        )
    
    if len(numerical_columns) == 0:
        print("No numerical columns to compare")
        return pd.DataFrame()
    
    # Filter out rows with NaN values in selected features
    df_clean = df.dropna(subset=numerical_columns)
    
    if df_clean.empty:
        print("No stocks left after removing NaN values")
        return pd.DataFrame()
    
    print(f"Calculating similarity for {len(df_clean)} stocks")
    
    # Scale the features to [0, 1]
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df_clean[numerical_columns])
    
    # The user vector is compared as-is. It must NOT go through
    # `scaler.transform`: the scaler was fitted on raw feature values, so it would
    # treat a weight such as 0.8 as a raw price or volume and map it to an
    # arbitrary, often negative, number.
    similarities = cosine_similarity([user_vector], scaled_features)[0]
    
    # Add similarity scores to dataframe
    df_with_scores = df_clean.copy()
    df_with_scores['Similarity_Score'] = similarities
    
    # Sort by similarity and get top N
    recommendations = df_with_scores.nlargest(top_n, 'Similarity_Score')
    
    # Select relevant columns for output
    output_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Similarity_Score'] + numerical_columns
    available_columns = [col for col in output_columns if col in recommendations.columns]
    
    # Report the number actually returned, which can be smaller than top_n
    print(f"Top {len(recommendations)} recommendations found with similarity scores: {recommendations['Similarity_Score'].values}")
    
    return recommendations[available_columns].round(4)

In [None]:
# Rank the category-filtered stocks against the preference vector and keep the best five.
recommended_dataframe = calculate_similarity_and_recommend(
    df=filtered_dataframe,
    user_vector=user_vector,
    numerical_columns=numerical_columns,
    top_n=5,
)
recommended_dataframe

In [None]:
def generate_recommendation_justification(user_query: str, 
                                        recommendations_df: pd.DataFrame, 
                                        user_preferences: Dict,
                                        groq_client,
                                        model: str = "llama3-8b-8192") -> str:
    """
    Generate a justification for the stock recommendations using Groq LLM.
    
    Args:
        user_query: Original user query
        recommendations_df: DataFrame containing recommended stocks; the columns
            Company_Name, Ticker, Sector, Industry and Similarity_Score are used
            when present
        user_preferences: Extracted user preferences
        groq_client: Groq client instance
        model: Chat model name sent to the API (defaults to the previously
            hard-coded value)
        
    Returns:
        str: Justification for the recommendations. If the API call fails, the
        error is printed and a generic fallback sentence is returned instead.
    """
    
    # Prepare stock information for justification
    stock_info = []
    for idx, row in recommendations_df.iterrows():
        stock_name = row.get('Company_Name', row.get('Ticker', f'Stock {idx}'))
        sector = row.get('Sector', 'N/A')
        industry = row.get('Industry', 'N/A')
        # The score column is written as 'Similarity_Score' by the ranking step;
        # the lower-case key never matched, so the score never reached the prompt.
        similarity_score = row.get('Similarity_Score', 'N/A')
        
        stock_summary = (
            f"- {stock_name} (Sector: {sector}, Industry: {industry}, "
            f"Similarity score: {similarity_score})"
        )
        stock_info.append(stock_summary)
    
    stock_list = "\n".join(stock_info)
    
    # Create prompt for justification
    justification_prompt = f"""
    Based on the user's investment query: "{user_query}"
    
    The following stocks have been recommended (in bullet point list form):
    {stock_list}
    
    User preferences extracted: {user_preferences}
    
    Please provide a clear, concise justification (2-3 sentences) explaining why these stocks are good matches for the user's requirements. Focus on:
    1. How the stocks align with their specified criteria
    2. Key strengths of the selected stocks
    3. Why they form a good portfolio for their needs
    
    Keep the response professional and investment-focused.
    """
    
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional financial advisor providing clear, concise investment justifications."
                },
                {
                    "role": "user",
                    "content": justification_prompt
                }
            ],
            model=model,
            temperature=0.3,
            max_tokens=200
        )
        
        return chat_completion.choices[0].message.content.strip()
        
    except Exception as e:
        # Best effort by design: the caller always receives some justification text.
        print(f"Error generating justification: {e}")
        return f"These stocks were selected based on their strong alignment with your criteria: {', '.join(user_preferences.keys()) if user_preferences else 'your investment preferences'}. They represent a diversified selection that matches your risk profile and investment objectives."

In [None]:
# Ask the LLM to explain the ranked picks; the returned text is the cell output.
generate_recommendation_justification(
    user_query=user_query,
    recommendations_df=recommended_dataframe,
    user_preferences=preferences,
    groq_client=groq_client,
)

In [None]:

def recommend_stocks_from_query(df: pd.DataFrame, 
                               user_query: str, 
                               numerical_columns: List[str], 
                               non_numerical_columns: List[str],
                               top_n: int = 5,
                               groq_api_key: Optional[str] = None,
                               embedding_model: Optional[Any] = None) -> Dict[str, Any]:
    """
    Recommend stocks based on natural language query using Groq LLM and Hugging Face finance embeddings.
    
    Pipeline: extract preferences -> filter by category -> build preference
    vector -> rank by similarity -> generate justification. Each early exit
    returns an empty recommendation list with an explanatory message.
    
    Args:
        df: DataFrame containing stock data
        user_query: Natural language query (e.g., "I want 5 stocks that are low risk, high return, in retail")
        numerical_columns: List of numerical feature columns for similarity calculation
        non_numerical_columns: List of non-numerical columns for categorical filtering
        top_n: Number of stocks to recommend
        groq_api_key: Groq API key
        embedding_model: Optional already-loaded embedding model to reuse across
            calls. When omitted, 'FinLang/finance-embeddings-investopedia' is
            loaded, and only once the pipeline actually needs it.
        
    Returns:
        Dict[str, Any]: Dictionary containing recommended stock names and justification

    Raises:
        ValueError: If ``groq_api_key`` is missing or empty.
    """
    
    # Initialize Groq client
    if not groq_api_key:
        raise ValueError("groq_api_key is required")
    
    groq_client = Groq(api_key=groq_api_key)
    
    # Get all column names
    all_columns = numerical_columns + non_numerical_columns
    
    # Step 1: Extract preferences using Groq LLM
    user_preferences = extract_preferences_with_groq(user_query, all_columns, 
                                                    numerical_columns, non_numerical_columns, groq_client)
    
    if not user_preferences:
        print("Could not extract valid preferences from query")
        return {
            "recommended_stocks": [],
            "justification": "Could not extract valid preferences from your query. Please try rephrasing your request."
        }
    
    # Step 2: Filter stocks based on categorical preferences
    filtered_df = filter_stocks_by_categories(df, user_preferences)
    
    if filtered_df.empty:
        print("No stocks match the categorical criteria")
        return {
            "recommended_stocks": [],
            "justification": "No stocks match your specified criteria. Please try adjusting your requirements."
        }
    
    # Load the embedding model only now: the early exits above do not need it,
    # and loading it is the expensive part of this function.
    if embedding_model is None:
        print("Loading finance embeddings model...")
        embedding_model = SentenceTransformer('FinLang/finance-embeddings-investopedia')
    
    # Step 3: Create user preference vector using embeddings
    user_vector = create_user_preference_vector_with_embeddings(
        user_query, user_preferences, numerical_columns, embedding_model
    )
    
    # Step 4: Calculate similarity and recommend
    recommendations_df = calculate_similarity_and_recommend(filtered_df, user_vector, numerical_columns, top_n)
    
    # Step 5: Extract stock names and generate justification
    if recommendations_df.empty:
        return {
            "recommended_stocks": [],
            "justification": "No suitable stock recommendations could be generated based on your criteria."
        }
    
    # Prefer company names, then tickers, then the frame index as identifiers
    if 'Company_Name' in recommendations_df.columns:
        recommended_stocks = recommendations_df['Company_Name'].tolist()
    elif 'Ticker' in recommendations_df.columns:
        recommended_stocks = recommendations_df['Ticker'].tolist()
    else:
        recommended_stocks = recommendations_df.index.tolist()
    
    # Step 6: Generate justification using Groq LLM
    justification = generate_recommendation_justification(
        user_query, recommendations_df, user_preferences, groq_client
    )
    
    return {
        "recommended_stocks": recommended_stocks,
        "justification": justification
    }

In [None]:
user_query = "Recommend me 3 stocks that have a high yield, low risk and have an annualized return of at least 20%"

non_numerical_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Country', 'Business_Summary', 'Sentiment']
# Keep only numeric dtypes: dropping the known text columns alone would leave
# non-numeric leftovers such as `Update_Date` in the feature list that is
# later passed to the scaler.
numerical_columns = (
    df_enriched_stock_data_w_sentiment
    .drop(columns=non_numerical_columns)
    .select_dtypes(include="number")
    .columns.tolist()
)

# Call the function
recommendations = recommend_stocks_from_query(
    df=df_enriched_stock_data_w_sentiment,
    user_query=user_query,
    numerical_columns=numerical_columns,
    non_numerical_columns=non_numerical_columns,
    top_n=5,
    groq_api_key=groq_api
)

recommendations