# Import Libraries

In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
from sentence_transformers import SentenceTransformer
import json
import re
from typing import Dict, List, Optional, Any

In [3]:
groq_api = open("/Users/ani/Documents/0_API_KEYS/groq.txt").read().strip()
groq_llm_model = "llama-3.3-70b-versatile"

# Query from CSV

In [None]:
df_enriched_stock_data_w_sentiment = pd.read_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data/enriched_stock_data_w_sentiment.csv')
df_enriched_stock_data_w_sentiment = df_enriched_stock_data_w_sentiment.drop('Update_Date', axis=1)
df_enriched_stock_data_w_sentiment

Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Industry,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change,Sentiment
0,NVDA,157.75,157.75,0.00,22.11,1.76,9.46,16.57,60.67,14.07,...,Semiconductors,United States,"NVIDIA Corporation, a computing infrastructure...",0.03,50.887100,38.288837,249322685,187835800,0.254816,example_sentiment
1,MSFT,495.94,497.45,-0.30,17.61,-0.30,3.74,9.69,17.33,18.94,...,Software - Infrastructure,United States,Microsoft Corporation develops and supports so...,0.67,38.355762,33.173244,23145869,20812190,0.112988,example_sentiment
2,AAPL,201.08,258.40,-22.18,-9.79,0.04,2.78,-5.30,14.78,-17.34,...,Consumer Electronics,United States,"Apple Inc. designs, manufactures, and markets ...",0.52,31.369736,24.197351,61975983,51950890,-0.045675,example_sentiment
3,AMZN,223.30,242.06,-7.75,8.96,2.85,3.95,6.21,8.88,1.40,...,Internet Retail,United States,"Amazon.com, Inc. engages in the retail sale of...",0.00,36.368080,36.308945,49333604,39989330,0.123519,example_sentiment
4,GOOGL,178.53,205.89,-13.29,3.89,2.88,1.47,8.09,17.04,-5.53,...,Internet Content & Information,United States,Alphabet Inc. offers various products and plat...,0.48,19.947487,19.925222,41357416,37073840,-0.047269,example_sentiment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,IVZ,15.70,24.58,-36.12,-2.56,0.77,8.13,1.42,11.01,-8.82,...,Asset Management,United States,Invesco Ltd. is a publicly owned investment ma...,5.39,12.559999,8.306878,5714119,5047610,0.041444,example_sentiment
496,APA,18.56,46.75,-60.30,-10.65,0.60,-10.68,2.15,7.67,-18.45,...,Oil & Gas E&P,United States,"APA Corporation, an independent energy company...",5.42,6.652329,6.749091,9028964,10513290,-0.373302,example_sentiment
497,MHK,104.90,229.74,-54.34,-15.38,1.50,6.57,-2.20,0.83,-9.51,...,"Furnishings, Fixtures & Appliances",United States,"Mohawk Industries, Inc. designs, manufactures,...",0.00,13.712419,9.408072,803162,920930,-0.090149,example_sentiment
498,CZR,28.86,119.49,-75.85,-14.37,1.23,9.11,-5.96,-5.13,-11.45,...,Resorts & Casinos,United States,"Caesars Entertainment, Inc. operates as a gami...",0.00,0.000000,21.537313,5763859,5734720,-0.282587,example_sentiment


# Define Functions

In [52]:
def extract_preferences_with_groq(query: str,
                                  all_columns: List[str], 
                                  numerical_columns: List[str], 
                                  non_numerical_columns: List[str], 
                                  groq_client) -> Dict:
    """Extract user preferences from natural language using Groq LLM"""
    
    # Identify categorical columns from non-numerical columns
    categorical_columns = [col for col in non_numerical_columns 
                          if col.lower() in ['sector', 'industry', 'country', 'company_name', 'ticker']]

    # Create dynamic categorical filters based on available columns
    categorical_filters = {}
    for col in categorical_columns:
        if col.lower() in ['sector', 'industry', 'country']:
            categorical_filters[col] = "value or null"
    
    # Create prompt for Groq LLM
    prompt = f"""
    Analyze this stock investment query and extract preferences: "{query}"
    
    Available stock data columns: {all_columns}
    Available numerical columns for analysis: {numerical_columns}
    Available categorical columns for filtering: {non_numerical_columns}
    
    Extract preferences and return ONLY a valid JSON object with this exact structure:
    {{
        "categorical_filters": {json.dumps(categorical_filters, indent=12)},

        "numerical_preferences": {{
            "risk_level": "low/medium/high or null",
            "return_preference": "low/medium/high or null",
            "sentiment_preference": "extreme_negative/negative/neutral/positve/extreme_positive or null",
            "market_cap_preference": "small/medium/large or null",
            "dividend_preference": "low/medium/high or null",
            "volatility_preference": "low/medium/high or null",
            "growth_preference": "low/medium/high or null",
            "valuation_preference": "undervalued/fairly_valued/overvalued or null"
        }},

        "feature_weights": {{
            // Assign weights 0.0-1.0 for numerical columns based on query importance
            // Only include columns mentioned or relevant to the query
        }},

        "investment_style": "value_investing/growth_investing/dividend_investing/momentum_investing/income_investing or null",
        
        "time_horizon": "short_term/medium_term/long_term or null"
    }}
    
    Guidelines:
    - Only extract explicitly mentioned preferences from the query
    - Use null for unmentioned criteria
    - For categorical_filters: match query terms to available categorical columns
    - For numerical_preferences: interpret investment terms (low risk, high return, etc.)
    - For feature_weights: assign higher weights (0.7-1.0) to numerical columns that are important for the query
    - Risk preferences: low risk = prefer stability, lower volatility
    - Return preferences: high return = prefer higher growth/returns
    - Be conservative - only extract clear, explicit preferences
    - Do not include any explanatory text, only return the JSON object
    """
    
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system", 
                    "content": "You are a financial analyst. Extract investment preferences from queries and return only valid JSON. No explanations, just JSON."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            temperature=0.1,
            max_tokens=1000
        )
        
        # Extract JSON from response
        content = response.choices[0].message.content.strip()
        
        # Clean up the response to extract JSON
        # Remove any markdown code blocks
        content = re.sub(r'```json\s*', '', content)
        content = re.sub(r'```\s*', '', content)
        
        # Try to find JSON in the response
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            json_str = json_match.group()
            preferences = json.loads(json_str)
            return preferences
        else:
            print("No valid JSON found in Groq response")
            print(f"Response content: {content}")
            return {}
            
    except Exception as e:
        print(f"Error extracting preferences with Groq: {e}")
        return {}

In [53]:
user_query = "Recommend me 3 stocks that have a high yield, low risk and have an annualized return of at least 20%. Technology sector, long term"

non_numerical_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Country', 'Business_Summary', 'Sentiment']
numerical_columns = df_enriched_stock_data.drop(non_numerical_columns, axis=1).columns.tolist()
groq_client = Groq(api_key=groq_api)

preferences = extract_preferences_with_groq(query=user_query, all_columns=non_numerical_columns+numerical_columns, non_numerical_columns=non_numerical_columns, numerical_columns=numerical_columns, groq_client=groq_client)
preferences

{'categorical_filters': {'Sector': 'Technology',
  'Industry': None,
  'Country': None},
 'numerical_preferences': {'risk_level': 'low',
  'return_preference': 'high',
  'sentiment_preference': None,
  'market_cap_preference': None,
  'dividend_preference': 'high',
  'volatility_preference': 'low',
  'growth_preference': 'high',
  'valuation_preference': None},
 'feature_weights': {'Annualized_Return': 0.9,
  'Annualized_Volatility': 0.8,
  'Dividend_Yield': 0.7},
 'investment_style': 'growth_investing',
 'time_horizon': 'long_term'}

In [54]:

def filter_stocks_by_categories(df: pd.DataFrame, preferences: Dict) -> pd.DataFrame:
    """Filter stocks based on categorical preferences"""
    
    filtered_df = df.copy()
    categorical_filters = preferences.get('categorical_filters', {})
    
    for column, value in categorical_filters.items():
        if value and value.lower() != 'null' and column in df.columns:
            # Case-insensitive partial matching
            mask = filtered_df[column].str.contains(value, case=False, na=False)
            filtered_df = filtered_df[mask]
            print(f"Filtered by {column}='{value}': {len(filtered_df)} stocks remaining")
    
    return filtered_df

In [55]:
filtered_dataframe = filter_stocks_by_categories(df_enriched_stock_data, preferences)
filtered_dataframe

Filtered by Sector='Technology': 82 stocks remaining


Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Industry,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change,Sentiment
0,NVDA,157.75,157.75,0.00,22.11,1.76,9.46,16.57,60.67,14.07,...,Semiconductors,United States,"NVIDIA Corporation, a computing infrastructure...",0.03,50.887100,38.288837,249322685,187835800,0.254816,example_sentiment
1,MSFT,495.94,497.45,-0.30,17.61,-0.30,3.74,9.69,17.33,18.94,...,Software - Infrastructure,United States,Microsoft Corporation develops and supports so...,0.67,38.355762,33.173244,23145869,20812190,0.112988,example_sentiment
2,AAPL,201.08,258.40,-22.18,-9.79,0.04,2.78,-5.30,14.78,-17.34,...,Consumer Electronics,United States,"Apple Inc. designs, manufactures, and markets ...",0.52,31.369736,24.197351,61975983,51950890,-0.045675,example_sentiment
7,AVGO,269.35,270.17,-0.30,34.58,-0.30,8.27,16.31,46.29,16.73,...,Semiconductors,United States,"Broadcom Inc. designs, develops, and supplies ...",0.87,98.663000,43.654780,26899625,24933890,0.682747,example_sentiment
13,ORCL,210.24,215.27,-2.34,26.35,-1.21,0.99,29.02,26.69,27.39,...,Software - Infrastructure,United States,Oracle Corporation offers products and service...,0.94,48.442394,29.363130,11922400,23950820,0.507224,example_sentiment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,AKAM,79.60,128.32,-37.97,-10.98,0.00,1.17,2.70,-4.51,-16.58,...,Software - Infrastructure,United States,"Akamai Technologies, Inc. engages in the provi...",0.00,26.711409,11.916168,2506609,2982760,-0.116341,example_sentiment
456,SWKS,74.80,179.58,-58.35,-4.34,0.20,4.57,4.60,-6.54,-13.67,...,Semiconductors,United States,"Skyworks Solutions, Inc., together with its su...",3.75,29.218752,11.687500,3996051,3042730,-0.299587,example_sentiment
471,EPAM,174.93,717.49,-75.62,-13.96,0.53,3.77,-5.13,-5.53,-23.66,...,Information Technology Services,United States,"EPAM Systems, Inc. provides digital platform e...",0.00,24.465733,15.632708,831843,650370,-0.075009,example_sentiment
484,DAY,55.14,130.32,-57.69,-14.40,0.04,-3.04,-7.12,-5.67,-22.76,...,Software - Application,United States,"Dayforce Inc., together with its subsidiaries,...",0.00,344.625000,24.950226,1964017,1912050,0.111290,example_sentiment


In [56]:

def create_user_preference_vector_with_embeddings(user_query: str, 
                                                 preferences: Dict, 
                                                 numerical_columns: List[str],
                                                 embedding_model) -> np.ndarray:
    """Create user preference vector using finance embeddings and extracted preferences"""
    
    # Create financial context for embeddings
    financial_contexts = []
    numerical_prefs = preferences.get('numerical_preferences', {})
    investment_style = preferences.get('investment_style')
    time_horizon = preferences.get('time_horizon')
    
    # Build financial context strings
    base_context = f"Investment query: {user_query}"
    financial_contexts.append(base_context)
    
    # Add preference contexts
    for pref_type, pref_value in numerical_prefs.items():
        if pref_value and pref_value.lower() != 'null':
            context = f"{pref_type.replace('_', ' ')}: {pref_value}"
            financial_contexts.append(context)
    
    if investment_style:
        financial_contexts.append(f"Investment style: {investment_style}")
    
    if time_horizon:
        financial_contexts.append(f"Time horizon: {time_horizon}")
    
    # Generate embeddings for the combined financial context
    combined_context = ". ".join(financial_contexts)
    # print(f"Financial context for embeddings: {combined_context}")
    
    # Get embeddings
    query_embedding = embedding_model.encode([combined_context])[0]
    
    # Create feature-specific embeddings
    feature_embeddings = []
    for feature in numerical_columns:
        # Create financial description for each feature
        feature_context = f"Financial metric: {feature}. {combined_context}"
        feature_emb = embedding_model.encode([feature_context])[0]
        feature_embeddings.append(feature_emb)
    
    # Calculate similarity between query and each feature
    feature_similarities = []
    for feature_emb in feature_embeddings:
        similarity = cosine_similarity([query_embedding], [feature_emb])[0][0]
        feature_similarities.append(similarity)
    
    # Normalize similarities to create preference vector
    user_vector = np.array(feature_similarities)
    
    # # Apply traditional preference mapping as backup/enhancement
    # preference_mapping = {
    #     'low': 0.2,
    #     'medium': 0.5, 
    #     'high': 0.8
    # }
    
    # # Enhance vector with explicit preferences
    # for i, feature in enumerate(numerical_columns):
    #     feature_lower = feature.lower()
        
    #     # Risk-related features (lower values for low risk preference)
    #     if any(risk_word in feature_lower for risk_word in ['volatility', 'beta', 'risk']):
    #         risk_pref = numerical_prefs.get('risk_level')
    #         if risk_pref in preference_mapping:
    #             # Invert for risk (low risk = low values)
    #             explicit_pref = 1 - preference_mapping[risk_pref]
    #             user_vector[i] = 0.7 * user_vector[i] + 0.3 * explicit_pref
        
    #     # Return-related features
    #     elif any(return_word in feature_lower for return_word in ['return', 'growth', 'yield']):
    #         return_pref = numerical_prefs.get('return_preference')
    #         if return_pref in preference_mapping:
    #             explicit_pref = preference_mapping[return_pref]
    #             user_vector[i] = 0.7 * user_vector[i] + 0.3 * explicit_pref
        
    #     # Volatility-related features
    #     elif any(vol_word in feature_lower for vol_word in ['change', 'volatility']):
    #         vol_pref = numerical_prefs.get('volatility_preference')
    #         if vol_pref in preference_mapping:
    #             explicit_pref = 1 - preference_mapping[vol_pref] if vol_pref == 'low' else preference_mapping[vol_pref]
    #             user_vector[i] = 0.7 * user_vector[i] + 0.3 * explicit_pref
    
    # Normalize the final vector
    user_vector = (user_vector - user_vector.min()) / (user_vector.max() - user_vector.min() + 1e-8)
    
    # print(f"Created user preference vector: {user_vector}")
    return user_vector

In [57]:
embedding_model = SentenceTransformer('FinLang/finance-embeddings-investopedia')

user_vector = create_user_preference_vector_with_embeddings(user_query, preferences, numerical_columns, embedding_model)
user_vector

array([0.5872127 , 0.71622705, 0.38793385, 0.        , 0.19062577,
       0.2553812 , 0.18608235, 0.83707166, 0.7502495 , 0.840338  ,
       0.8366389 , 0.7892123 , 0.8734586 , 0.99999994, 0.71921295,
       0.8662729 , 0.7721423 , 0.7223301 , 0.6959013 , 0.49616125,
       0.54056877, 0.8297047 , 0.5209071 , 0.55810153], dtype=float32)

In [58]:

def calculate_similarity_and_recommend(df: pd.DataFrame, 
                                     user_vector: np.ndarray, 
                                     numerical_columns: List[str], 
                                     top_n: int) -> pd.DataFrame:
    """Calculate cosine similarity between user preferences and stocks"""
    
    # Filter out rows with NaN values in selected features
    df_clean = df.dropna(subset=numerical_columns)
    
    if df_clean.empty:
        print("No stocks left after removing NaN values")
        return pd.DataFrame()
    
    print(f"Calculating similarity for {len(df_clean)} stocks")
    
    # Scale the features
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df_clean[numerical_columns])
    
    # Scale user vector to same range - create DataFrame with proper feature names
    user_vector_df = pd.DataFrame(user_vector.reshape(1, -1), columns=numerical_columns)
    user_vector_scaled = scaler.transform(user_vector_df)[0]
    
    # Calculate cosine similarity between user vector and all stocks
    similarities = cosine_similarity([user_vector_scaled], scaled_features)[0]
    
    # Add similarity scores to dataframe
    df_with_scores = df_clean.copy()
    df_with_scores['Similarity_Score'] = similarities
    
    # Sort by similarity and get top N
    recommendations = df_with_scores.nlargest(top_n, 'Similarity_Score')
    
    # Select relevant columns for output
    output_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Similarity_Score'] + numerical_columns
    available_columns = [col for col in output_columns if col in recommendations.columns]
    
    print(f"Top {top_n} recommendations found with similarity scores: {recommendations['Similarity_Score'].values}")
    
    return recommendations[available_columns].round(4)

In [69]:
recommended_dataframe = calculate_similarity_and_recommend(filtered_dataframe, user_vector, numerical_columns, 5)
recommended_dataframe

Calculating similarity for 82 stocks
Top 5 recommendations found with similarity scores: [0.86733689 0.85867367 0.85748712 0.85384988 0.85327082]


Unnamed: 0,Ticker,Company_Name,Sector,Industry,Similarity_Score,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,...,Annualized_Volatility,Sharpe_Ratio,Beta,Market_Cap,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change
300,BR,"Broadridge Financial Solutions,",Technology,Information Technology Services,0.8673,239.45,245.35,-2.41,4.6,0.12,...,22.48,0.65,0,28126515200,1.47,36.0075,25.6096,545172,449280,0.2141
88,ADP,"Automatic Data Processing, Inc.",Technology,Software - Application,0.8587,303.45,325.19,-6.68,2.32,0.16,...,21.87,0.79,0,123177345024,2.03,30.9959,27.9163,1824070,2060060,0.2693
331,FTV,Fortive Corporation,Technology,Scientific & Technical Instruments,0.8575,71.6,85.83,-16.58,-3.87,0.53,...,24.85,0.17,0,24335550464,0.45,31.4035,17.1703,3308114,2532690,-0.0389
372,LDOS,"Leidos Holdings, Inc.",Technology,Information Technology Services,0.8538,155.68,199.72,-22.05,3.05,0.0,...,26.3,0.44,0,20038973440,1.03,15.6935,14.8976,1455864,1268920,0.0672
170,ROP,"Roper Technologies, Inc.",Technology,Software - Application,0.8533,563.51,592.96,-4.97,1.19,-0.32,...,21.02,0.36,0,60585779200,0.58,40.6866,28.1474,588858,495010,0.0029


In [74]:
def generate_recommendation_justification(user_query: str, 
                                        recommendations_df: pd.DataFrame, 
                                        user_preferences: Dict,
                                        groq_client) -> str:
    """
    Generate a justification for the stock recommendations using Groq LLM.
    
    Args:
        user_query: Original user query
        recommendations_df: DataFrame containing recommended stocks
        user_preferences: Extracted user preferences
        groq_client: Groq client instance
        
    Returns:
        str: Justification for the recommendations
    """
    
    # Prepare stock information for justification
    stock_info = []
    for idx, row in recommendations_df.iterrows():
        stock_name = row.get('Company_Name', row.get('Ticker', f'Stock {idx}'))
        sector = row.get('Sector', 'N/A')
        industry = row.get('Industry', 'N/A')
        similarity_score = row.get('similarity_score', 'N/A')
        
        stock_summary = f"- {stock_name} (Sector: {sector}, Industry: {industry})"
        stock_info.append(stock_summary)
    
    stock_list = "\n".join(stock_info)
    
    # Create prompt for justification
    justification_prompt = f"""
    Based on the user's investment query: "{user_query}"
    
    The following stocks have been recommended (in bullet point list form):
    {stock_list}
    
    User preferences extracted: {user_preferences}
    
    Please provide a clear, concise justification (2-3 sentences) explaining why these stocks are good matches for the user's requirements. Focus on:
    1. How the stocks align with their specified criteria
    2. Key strengths of the selected stocks
    3. Why they form a good portfolio for their needs
    
    Keep the response professional and investment-focused.
    """
    
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional financial advisor providing clear, concise investment justifications."
                },
                {
                    "role": "user",
                    "content": justification_prompt
                }
            ],
            model="llama3-8b-8192",
            temperature=0.3,
            max_tokens=200
        )
        
        justification = chat_completion.choices[0].message.content.strip()
        return justification
        
    except Exception as e:
        print(f"Error generating justification: {e}")
        return f"These stocks were selected based on their strong alignment with your criteria: {', '.join(user_preferences.keys()) if user_preferences else 'your investment preferences'}. They represent a diversified selection that matches your risk profile and investment objectives."

In [75]:
generate_recommendation_justification(user_query, recommended_dataframe, preferences, groq_client)

"Based on the user's investment query, I am pleased to recommend the following three stocks that meet their requirements:\n\n• Broadridge Financial Solutions, (Sector: Technology, Industry: Information Technology Services)\n• Automatic Data Processing, Inc. (Sector: Technology, Industry: Software - Application)\n• Roper Technologies, Inc. (Sector: Technology, Industry: Software - Application)\n\nThese stocks align with the user's specified criteria, including a high yield, low risk, and an annualized return of at least 20%. Specifically, Broadridge Financial Solutions and Automatic Data Processing, Inc. have a strong track record of consistent dividend payments and low volatility, making them attractive options for investors seeking low-risk investments. Roper Technologies, Inc. has a history of delivering high returns and has a strong competitive position in its industry.\n\nThe key strengths of these stocks include their stable financials, diversified revenue streams, and proven busi

In [79]:

def recommend_stocks_from_query(df: pd.DataFrame, 
                               user_query: str, 
                               numerical_columns: List[str], 
                               non_numerical_columns: List[str],
                               top_n: int = 5,
                               groq_api_key: str = None) -> Dict[str, Any]:
    """
    Recommend stocks based on natural language query using Groq LLM and Hugging Face finance embeddings.
    
    Args:
        df: DataFrame containing stock data
        user_query: Natural language query (e.g., "I want 5 stocks that are low risk, high return, in retail")
        numerical_columns: List of numerical feature columns for similarity calculation
        non_numerical_columns: List of non-numerical columns for categorical filtering
        top_n: Number of stocks to recommend
        groq_api_key: Groq API key
        
    Returns:
        Dict[str, Any]: Dictionary containing recommended stock names and justification
    """
    
    # Initialize Groq client
    if not groq_api_key:
        raise ValueError("groq_api_key is required")
    
    groq_client = Groq(api_key=groq_api_key)
    
    # Initialize finance embeddings model
    print("Loading finance embeddings model...")
    embedding_model = SentenceTransformer('FinLang/finance-embeddings-investopedia')
    
    # Get all column names
    all_columns = numerical_columns + non_numerical_columns
    
    # Step 1: Extract preferences using Groq LLM
    user_preferences = extract_preferences_with_groq(user_query, all_columns, 
                                                    numerical_columns, non_numerical_columns, groq_client)
    
    if not user_preferences:
        print("Could not extract valid preferences from query")
        return {
            "recommended_stocks": [],
            "justification": "Could not extract valid preferences from your query. Please try rephrasing your request."
        }
    
    # Step 2: Filter stocks based on categorical preferences
    filtered_df = filter_stocks_by_categories(df, user_preferences)
    
    if filtered_df.empty:
        print("No stocks match the categorical criteria")
        return {
            "recommended_stocks": [],
            "justification": "No stocks match your specified criteria. Please try adjusting your requirements."
        }
    
    # Step 3: Create user preference vector using embeddings
    user_vector = create_user_preference_vector_with_embeddings(
        user_query, user_preferences, numerical_columns, embedding_model
    )
    
    # Step 4: Calculate similarity and recommend
    recommendations_df = calculate_similarity_and_recommend(filtered_df, user_vector, numerical_columns, top_n)
    
    # Step 5: Extract stock names and generate justification
    if recommendations_df.empty:
        return {
            "recommended_stocks": [],
            "justification": "No suitable stock recommendations could be generated based on your criteria."
        }
    
    # Get recommended stock names
    recommended_stocks = []
    if 'Company_Name' in recommendations_df.columns:
        recommended_stocks = recommendations_df['Company_Name'].tolist()
    elif 'Ticker' in recommendations_df.columns:
        recommended_stocks = recommendations_df['Ticker'].tolist()
    else:
        # Fallback to index if no company name or ticker columns
        recommended_stocks = recommendations_df.index.tolist()
    
    # Step 6: Generate justification using Groq LLM
    justification = generate_recommendation_justification(
        user_query, recommendations_df, user_preferences, groq_client
    )
    
    return {
        "recommended_stocks": recommended_stocks,
        "justification": justification
    }

In [None]:
user_query = "Recommend me 3 stocks that have a high yield, low risk and have an annualized return of at least 20%"

non_numerical_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Country', 'Business_Summary', 'Sentiment']
numerical_columns = df_enriched_stock_data_w_sentiment.drop(non_numerical_columns, axis=1).columns.tolist()

# Call the function
recommendations = recommend_stocks_from_query(
    df=df_enriched_stock_data_w_sentiment,
    user_query=user_query,
    numerical_columns=numerical_columns,
    non_numerical_columns=non_numerical_columns,
    top_n=5,
    groq_api_key=groq_api
)

recommendations

Loading finance embeddings model...
Calculating similarity for 500 stocks
Top 5 recommendations found with similarity scores: [0.85263515 0.84999612 0.84946852 0.84907545 0.84890144]


{'recommended_stocks': ['Atmos Energy Corporation',
  'Veralto Corp',
  'Roper Technologies, Inc.',
  "McDonald's Corporation",
  'Coca-Cola Company (The)'],
 'justification': "Based on the user's preferences, I recommend the following three stocks:\n\n* Atmos Energy Corporation (Sector: Utilities, Industry: Utilities - Regulated Gas)\n* Roper Technologies, Inc. (Sector: Technology, Industry: Software - Application)\n* Coca-Cola Company (The) (Sector: Consumer Defensive, Industry: Beverages - Non-Alcoholic)\n\nThese stocks align with the user's criteria of high yield, low risk, and an annualized return of at least 20%. Atmos Energy Corporation offers a high dividend yield and a stable regulated business model, making it a low-risk investment. Roper Technologies, Inc. has a strong track record of consistent growth and profitability, driven by its diversified portfolio of software and technology businesses. Coca-Cola Company is a well-established brand with a global presence, offering a 