# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
from sentence_transformers import SentenceTransformer
import json
import re
from typing import Dict, List, Any
import pandas_gbq



In [2]:
groq_api = open("/Users/ani/Documents/0_API_KEYS/groq.txt").read().strip()
groq_llm_model = "llama-3.3-70b-versatile"

# Query from CSV

In [3]:
# df_enriched_stock_data_w_sentiment = pd.read_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data/enriched_stock_data_w_sentiment.csv')
# df_enriched_stock_data_w_sentiment = df_enriched_stock_data_w_sentiment.drop('Update_Date', axis=1)
# df_enriched_stock_data_w_sentiment

# Query from BigQuery

In [4]:
def load_table_from_bigquery(dataset_id, table_id, project_id):
    """Load a table from BigQuery."""

    query = f"SELECT * FROM `{dataset_id}.{table_id}`"
    df = pandas_gbq.read_gbq(query, project_id=project_id)
    return df

df_enriched_stock_data_w_sentiment = load_table_from_bigquery(dataset_id='stock_data', table_id='stock_data', project_id="capable-arbor-293714")
df_enriched_stock_data_w_sentiment

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change,Update_Date,Sentiment
0,CTVA,76.58,77.12,-0.70,22.96,-0.70,4.56,11.32,21.01,36.62,...,United States,"Corteva, Inc. operates in the agriculture busi...",0.91,46.132534,23.856699,3887726,5160870,0.491106,2025-07-07,Neutral
1,CF,95.19,111.25,-14.44,12.60,0.57,4.15,6.54,25.21,12.24,...,United States,"CF Industries Holdings, Inc., together with it...",2.17,12.591270,15.402913,2915815,3787930,0.364226,2025-07-07,Neutral
2,MOS,37.69,72.25,-47.83,34.89,1.13,6.26,6.69,22.74,56.95,...,United States,"The Mosaic Company, through its subsidiaries, ...",2.41,32.491380,15.321137,5637831,5383670,0.377310,2025-07-07,Neutral
3,VMC,267.00,291.14,-8.29,3.11,0.29,4.85,-1.13,15.20,4.89,...,United States,Vulcan Materials Company produces and supplies...,0.75,37.552742,29.405287,1064390,1132540,0.100302,2025-07-07,Neutral
4,MLM,556.29,617.09,-9.85,4.07,-0.29,4.09,-0.53,18.39,9.29,...,United States,"Martin Marietta Materials, Inc., a natural res...",0.58,31.860825,26.693377,436660,478580,0.052662,2025-07-07,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,PNW,90.10,94.52,-4.68,1.71,-0.35,1.53,-1.10,6.57,9.01,...,United States,"Pinnacle West Capital Corporation, through its...",4.00,17.841583,19.252136,1268641,1499870,0.183353,2025-07-07,Neutral
496,ATO,152.79,161.76,-5.55,4.65,0.33,0.04,-2.12,9.73,11.75,...,United States,"Atmos Energy Corporation, together with its su...",2.26,21.339384,21.399158,1110531,893440,0.334853,2025-07-07,Neutral
497,NI,39.63,40.94,-3.20,6.15,0.18,-0.25,2.01,12.41,10.60,...,United States,"NiSource Inc., an energy holding company, oper...",2.78,21.421621,21.306452,4522701,4733460,0.384669,2025-07-07,Neutral
498,AWK,139.96,175.80,-20.39,2.87,-0.06,-0.06,-2.45,2.59,14.38,...,United States,"American Water Works Company, Inc., through it...",2.38,25.493628,24.554388,1319263,1242620,0.077971,2025-07-07,Neutral


# Define Functions

In [5]:
def extract_preferences_with_groq(query: str,
                                  all_columns: List[str], 
                                  numerical_columns: List[str], 
                                  non_numerical_columns: List[str], 
                                  groq_client) -> Dict:
    """Extract user preferences from natural language using Groq LLM"""
    
    # Identify categorical columns from non-numerical columns
    categorical_columns = [col for col in non_numerical_columns 
                          if col.lower() in ['sector', 'industry', 'country', 'company_name', 'ticker']]

    # Create dynamic categorical filters based on available columns
    categorical_filters = {}
    for col in categorical_columns:
        if col.lower() in ['sector', 'industry', 'country']:
            categorical_filters[col] = "value or null"
    
    # Create prompt for Groq LLM
    prompt = f"""
    Analyze this stock investment query and extract preferences: "{query}"
    
    Available stock data columns: {all_columns}
    Available numerical columns for analysis: {numerical_columns}
    Available categorical columns for filtering: {non_numerical_columns}
    
    Extract preferences and return ONLY a valid JSON object with this exact structure:
    {{
        "categorical_filters": {json.dumps(categorical_filters, indent=12)},

        "numerical_preferences": {{
            "risk_level": "low/medium/high or null",
            "return_preference": "low/medium/high or null",
            "sentiment_preference": "extreme_negative/negative/neutral/positve/extreme_positive or null",
            "market_cap_preference": "small/medium/large or null",
            "dividend_preference": "low/medium/high or null",
            "volatility_preference": "low/medium/high or null",
            "growth_preference": "low/medium/high or null",
            "valuation_preference": "undervalued/fairly_valued/overvalued or null"
        }},

        "feature_weights": {{
            // Assign weights 0.0-1.0 for numerical columns based on query importance
            // Only include columns mentioned or relevant to the query
        }},

        "investment_style": "value_investing/growth_investing/dividend_investing/momentum_investing/income_investing or null",
        
        "time_horizon": "short_term/medium_term/long_term or null"
    }}
    
    Guidelines:
    - Only extract explicitly mentioned preferences from the query
    - Use null for unmentioned criteria
    - For categorical_filters: match query terms to available categorical columns
    - For numerical_preferences: interpret investment terms (low risk, high return, etc.)
    - For feature_weights: assign higher weights (0.7-1.0) to numerical columns that are important for the query
    - Risk preferences: low risk = prefer stability, lower volatility
    - Return preferences: high return = prefer higher growth/returns
    - Be conservative - only extract clear, explicit preferences
    - Do not include any explanatory text, only return the JSON object
    """
    
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system", 
                    "content": "You are a financial analyst. Extract investment preferences from queries and return only valid JSON. No explanations, just JSON."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            temperature=0.1,
            max_tokens=1000
        )
        
        # Extract JSON from response
        content = response.choices[0].message.content.strip()
        
        # Clean up the response to extract JSON
        # Remove any markdown code blocks
        content = re.sub(r'```json\s*', '', content)
        content = re.sub(r'```\s*', '', content)
        
        # Try to find JSON in the response
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            json_str = json_match.group()
            preferences = json.loads(json_str)
            return preferences
        else:
            print("No valid JSON found in Groq response")
            print(f"Response content: {content}")
            return {}
            
    except Exception as e:
        print(f"Error extracting preferences with Groq: {e}")
        return {}

In [6]:
user_query = "Recommend me 3 stocks that have a high yield, low risk and have an annualized return of at least 20%. Technology sector, long term"

non_numerical_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Country', 'Business_Summary', 'Sentiment', 'Update_Date']
numerical_columns = df_enriched_stock_data_w_sentiment.drop(non_numerical_columns, axis=1).columns.tolist()
groq_client = Groq(api_key=groq_api)

preferences = extract_preferences_with_groq(query=user_query, all_columns=non_numerical_columns+numerical_columns, non_numerical_columns=non_numerical_columns, numerical_columns=numerical_columns, groq_client=groq_client)
preferences

{'categorical_filters': {'Sector': 'Technology',
  'Industry': None,
  'Country': None},
 'numerical_preferences': {'risk_level': 'low',
  'return_preference': 'high',
  'sentiment_preference': None,
  'market_cap_preference': None,
  'dividend_preference': 'high',
  'volatility_preference': 'low',
  'growth_preference': 'high',
  'valuation_preference': None},
 'feature_weights': {'Annualized_Return': 0.9,
  'Annualized_Volatility': 0.8,
  'Dividend_Yield': 0.7},
 'investment_style': 'growth_investing',
 'time_horizon': 'long_term'}

In [7]:

def filter_stocks_by_categories(df: pd.DataFrame, preferences: Dict) -> pd.DataFrame:
    """Filter stocks based on categorical preferences"""
    
    filtered_df = df.copy()
    categorical_filters = preferences.get('categorical_filters', {})
    
    for column, value in categorical_filters.items():
        if value and value.lower() != 'null' and column in df.columns:
            # Case-insensitive partial matching
            mask = filtered_df[column].str.contains(value, case=False, na=False)
            filtered_df = filtered_df[mask]
            print(f"Filtered by {column}='{value}': {len(filtered_df)} stocks remaining")
    
    return filtered_df

In [8]:
filtered_dataframe = filter_stocks_by_categories(df_enriched_stock_data_w_sentiment, preferences)
filtered_dataframe

Filtered by Sector='Technology': 81 stocks remaining


Unnamed: 0,Ticker,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,7_Day_Percent_Change,30_Day_Percent_Change,Annualized_Return,YTD_Return,...,Country,Business_Summary,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change,Update_Date,Sentiment
388,CSCO,68.93,69.37,-0.63,16.83,-0.63,1.69,9.72,9.77,18.92,...,United States,"Cisco Systems, Inc. designs, manufactures, and...",2.36,28.134693,17.674358,21587471,26551810,0.506079,2025-07-07,Positive
389,MSI,426.37,500.78,-14.86,-3.95,0.36,1.76,0.25,22.86,-6.66,...,United States,"Motorola Solutions, Inc. provides public safet...",1.04,35.590153,29.024508,950200,1001960,0.087744,2025-07-07,Neutral
390,HPE,20.91,24.05,-13.05,9.88,-2.01,13.09,20.09,17.72,-1.10,...,United States,Hewlett Packard Enterprise Company provides so...,2.54,20.105770,9.957143,19475506,24281190,0.021541,2025-07-07,Neutral
391,ZBRA,317.11,614.55,-48.40,-5.40,-2.04,4.64,10.14,3.55,-17.37,...,United States,"Zebra Technologies Corporation, together with ...",0.00,29.944284,19.769949,597135,527530,0.029382,2025-07-07,Neutral
392,ANET,101.47,129.82,-21.84,4.70,-1.02,5.36,9.78,40.17,-9.23,...,United States,Arista Networks Inc engages in the development...,0.00,42.814350,10.428572,10554976,11443660,0.105128,2025-07-07,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,GEN,30.10,31.42,-4.21,8.86,-1.28,3.51,9.06,8.89,10.50,...,United States,Gen Digital Inc. engages in the provision of c...,1.70,29.223303,12.285714,4726310,5010810,0.241955,2025-07-07,Neutral
465,FFIV,299.26,310.60,-3.65,13.81,-0.29,1.65,4.54,14.31,18.87,...,United States,"F5, Inc. provides multicloud application secur...",0.00,28.419754,19.432468,491363,561810,0.733453,2025-07-07,Neutral
466,AKAM,79.06,128.32,-38.39,-11.14,-0.53,-0.59,3.45,-5.73,-17.15,...,United States,"Akamai Technologies, Inc. engages in the provi...",0.00,26.530201,11.835329,2500905,2376330,-0.144825,2025-07-07,Neutral
467,FSLR,177.06,300.71,-41.12,2.89,-4.31,15.95,8.38,20.42,-5.05,...,United States,"First Solar, Inc., a solar technology company,...",0.00,15.043329,8.488015,5221858,5306380,-0.185321,2025-07-07,Neutral


In [9]:

def create_user_preference_vector_with_embeddings(user_query: str, 
                                                 preferences: Dict, 
                                                 numerical_columns: List[str],
                                                 embedding_model) -> np.ndarray:
    """Create user preference vector using finance embeddings and extracted preferences"""
    
    # Create financial context for embeddings
    financial_contexts = []
    numerical_prefs = preferences.get('numerical_preferences', {})
    investment_style = preferences.get('investment_style')
    time_horizon = preferences.get('time_horizon')
    
    # Build financial context strings
    base_context = f"Investment query: {user_query}"
    financial_contexts.append(base_context)
    
    # Add preference contexts
    for pref_type, pref_value in numerical_prefs.items():
        if pref_value and pref_value.lower() != 'null':
            context = f"{pref_type.replace('_', ' ')}: {pref_value}"
            financial_contexts.append(context)
    
    if investment_style:
        financial_contexts.append(f"Investment style: {investment_style}")
    
    if time_horizon:
        financial_contexts.append(f"Time horizon: {time_horizon}")
    
    # Generate embeddings for the combined financial context
    combined_context = ". ".join(financial_contexts)
    # print(f"Financial context for embeddings: {combined_context}")
    
    # Get embeddings
    query_embedding = embedding_model.encode([combined_context])[0]
    
    # Create feature-specific embeddings
    feature_embeddings = []
    for feature in numerical_columns:
        # Create financial description for each feature
        feature_context = f"Financial metric: {feature}. {combined_context}"
        feature_emb = embedding_model.encode([feature_context])[0]
        feature_embeddings.append(feature_emb)
    
    # Calculate similarity between query and each feature
    feature_similarities = []
    for feature_emb in feature_embeddings:
        similarity = cosine_similarity([query_embedding], [feature_emb])[0][0]
        feature_similarities.append(similarity)
    
    # Normalize similarities to create preference vector
    user_vector = np.array(feature_similarities)
    
    # Normalize the final vector
    user_vector = (user_vector - user_vector.min()) / (user_vector.max() - user_vector.min() + 1e-8)
    
    # print(f"Created user preference vector: {user_vector}")
    return user_vector

In [10]:
embedding_model = SentenceTransformer('FinLang/finance-embeddings-investopedia')

user_vector = create_user_preference_vector_with_embeddings(user_query, preferences, numerical_columns, embedding_model)
user_vector

array([0.5872127 , 0.71622705, 0.38793385, 0.        , 0.19062577,
       0.2553812 , 0.18608235, 0.83707166, 0.7502495 , 0.840338  ,
       0.8366389 , 0.7892123 , 0.8734586 , 0.99999994, 0.71921295,
       0.8662729 , 0.7721423 , 0.7223301 , 0.6959013 , 0.49616125,
       0.54056877, 0.8297047 , 0.5209071 , 0.55810153], dtype=float32)

In [11]:

def calculate_similarity_and_recommend(df: pd.DataFrame, 
                                     user_vector: np.ndarray, 
                                     numerical_columns: List[str], 
                                     top_n: int) -> pd.DataFrame:
    """Calculate cosine similarity between user preferences and stocks"""
    
    # Filter out rows with NaN values in selected features
    df_clean = df.dropna(subset=numerical_columns)
    
    if df_clean.empty:
        print("No stocks left after removing NaN values")
        return pd.DataFrame()
    
    print(f"Calculating similarity for {len(df_clean)} stocks")
    
    # Scale the features
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df_clean[numerical_columns])
    
    # Scale user vector to same range - create DataFrame with proper feature names
    user_vector_df = pd.DataFrame(user_vector.reshape(1, -1), columns=numerical_columns)
    user_vector_scaled = scaler.transform(user_vector_df)[0]
    
    # Calculate cosine similarity between user vector and all stocks
    similarities = cosine_similarity([user_vector_scaled], scaled_features)[0]
    
    # Add similarity scores to dataframe
    df_with_scores = df_clean.copy()
    df_with_scores['Similarity_Score'] = similarities
    
    # Sort by similarity and get top N
    recommendations = df_with_scores.nlargest(top_n, 'Similarity_Score')
    
    # Select relevant columns for output
    output_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Similarity_Score'] + numerical_columns
    available_columns = [col for col in output_columns if col in recommendations.columns]
    
    print(f"Top {top_n} recommendations found with similarity scores: {recommendations['Similarity_Score'].values}")
    
    return recommendations[available_columns].round(4)

In [12]:
recommended_dataframe = calculate_similarity_and_recommend(filtered_dataframe, user_vector, numerical_columns, 5)
recommended_dataframe

Calculating similarity for 81 stocks
Top 5 recommendations found with similarity scores: [0.84461927 0.84068234 0.8372404  0.83695175 0.82606782]


Unnamed: 0,Ticker,Company_Name,Sector,Industry,Similarity_Score,Closing_Price,All_Time_High,Percent_From_All_Time_High,Percent_Difference_200_Day_Moving_Average,24_Hour_Percent_Change,...,Annualized_Volatility,Sharpe_Ratio,Beta,Market_Cap,Dividend_Yield,Trailing_PE,Forward_PE,Average_Volume,Average_Volume_10days,52_Week_Change
410,BR,"Broadridge Financial Solutions,",Technology,Information Technology Services,0.8446,240.56,245.35,-1.95,4.71,-0.65,...,22.53,0.64,0,28256899072,1.45,36.1201,25.7283,527321,522000,0.2
419,FTV,Fortive Corporation,Technology,Scientific & Technical Instruments,0.8407,53.0,64.61,-17.97,-5.44,-0.09,...,24.9,0.15,0,18013745152,0.61,23.2456,12.7098,4544378,4764369,-0.0427
442,ADP,"Automatic Data Processing, Inc.",Technology,Software - Application,0.8372,308.4,325.19,-5.16,3.68,-0.26,...,21.87,0.79,0,125186654208,2.0,31.566,28.3717,1762995,1989750,0.3245
404,IBM,International Business Machines,Technology,Information Technology Services,0.837,292.47,294.78,-0.78,22.39,0.17,...,23.51,1.09,0,271820734464,2.28,50.0805,27.5655,4211631,3860720,0.6436
412,LDOS,"Leidos Holdings, Inc.",Technology,Information Technology Services,0.8261,164.46,199.72,-17.66,8.7,-0.06,...,26.2,0.53,0,21169127424,1.01,16.5786,15.7378,1342631,1332510,0.1305


In [13]:
def generate_recommendation_justification(user_query: str, 
                                        recommendations_df: pd.DataFrame, 
                                        user_preferences: Dict,
                                        groq_client) -> str:
    """
    Generate a justification for the stock recommendations using Groq LLM.
    
    Args:
        user_query: Original user query
        recommendations_df: DataFrame containing recommended stocks
        user_preferences: Extracted user preferences
        groq_client: Groq client instance
        
    Returns:
        str: Justification for the recommendations
    """
    
    # Prepare stock information for justification
    stock_info = []
    for idx, row in recommendations_df.iterrows():
        stock_name = row.get('Company_Name', row.get('Ticker', f'Stock {idx}'))
        sector = row.get('Sector', 'N/A')
        industry = row.get('Industry', 'N/A')
        similarity_score = row.get('similarity_score', 'N/A')
        
        stock_summary = f"- {stock_name} (Sector: {sector}, Industry: {industry})"
        stock_info.append(stock_summary)
    
    stock_list = "\n".join(stock_info)
    
    # Create prompt for justification
    justification_prompt = f"""
    Based on the user's investment query: "{user_query}"
    
    The following stocks have been recommended (in bullet point list form):
    {stock_list}
    
    User preferences extracted: {user_preferences}
    
    Please provide a clear, concise justification (2-3 sentences) explaining why these stocks are good matches for the user's requirements. Focus on:
    1. How the stocks align with their specified criteria
    2. Key strengths of the selected stocks
    3. Why they form a good portfolio for their needs
    
    Keep the response professional and investment-focused.
    """
    
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional financial advisor providing clear, concise investment justifications."
                },
                {
                    "role": "user",
                    "content": justification_prompt
                }
            ],
            model="llama3-8b-8192",
            temperature=0.3,
            max_tokens=200
        )
        
        justification = chat_completion.choices[0].message.content.strip()
        return justification
        
    except Exception as e:
        print(f"Error generating justification: {e}")
        return f"These stocks were selected based on their strong alignment with your criteria: {', '.join(user_preferences.keys()) if user_preferences else 'your investment preferences'}. They represent a diversified selection that matches your risk profile and investment objectives."

In [14]:
generate_recommendation_justification(user_query, recommended_dataframe, preferences, groq_client)

"Based on the user's investment query, I am pleased to recommend the following three stocks that align with their specified criteria:\n\n• Broadridge Financial Solutions: With a high dividend yield of 2.3% and a low annualized volatility of 12.1%, Broadridge Financial Solutions meets the user's requirements for high yield and low risk. Its strong track record of consistent dividend payments and stable financials make it an attractive option for long-term investors seeking steady returns.\n\n• Fortive Corporation: Fortive Corporation's high dividend yield of 2.5% and annualized return of 24.1% make it an excellent match for the user's criteria. Its diversified portfolio of industrial and commercial businesses, combined with its strong financial position, provide a solid foundation for long-term growth and income generation.\n\n• Automatic Data Processing, Inc.: With a high dividend yield of 2.6% and annualized return of 23.4%, Automatic Data Processing, Inc. meets the user's requirement

In [17]:

def recommend_stocks_from_query(df: pd.DataFrame, 
                               user_query: str, 
                               numerical_columns: List[str], 
                               non_numerical_columns: List[str],
                               top_n: int = 5,
                               groq_api_key: str = None) -> Dict[str, Any]:
    """
    Recommend stocks based on natural language query using Groq LLM and Hugging Face finance embeddings.
    
    Args:
        df: DataFrame containing stock data
        user_query: Natural language query (e.g., "I want 5 stocks that are low risk, high return, in retail")
        numerical_columns: List of numerical feature columns for similarity calculation
        non_numerical_columns: List of non-numerical columns for categorical filtering
        top_n: Number of stocks to recommend
        groq_api_key: Groq API key
        
    Returns:
        Dict[str, Any]: Dictionary containing recommended stock names and justification
    """
    
    # Initialize Groq client
    if not groq_api_key:
        raise ValueError("groq_api_key is required")
    
    groq_client = Groq(api_key=groq_api_key)
    
    # Initialize finance embeddings model
    print("Loading finance embeddings model...")
    embedding_model = SentenceTransformer('FinLang/finance-embeddings-investopedia')
    
    # Get all column names
    all_columns = numerical_columns + non_numerical_columns
    
    # Step 1: Extract preferences using Groq LLM
    user_preferences = extract_preferences_with_groq(user_query, all_columns, 
                                                    numerical_columns, non_numerical_columns, groq_client)
    
    if not user_preferences:
        print("Could not extract valid preferences from query")
        return {
            "recommended_stocks": [],
            "justification": "Could not extract valid preferences from your query. Please try rephrasing your request."
        }
    
    # Step 2: Filter stocks based on categorical preferences
    filtered_df = filter_stocks_by_categories(df, user_preferences)
    
    if filtered_df.empty:
        print("No stocks match the categorical criteria")
        return {
            "recommended_stocks": [],
            "justification": "No stocks match your specified criteria. Please try adjusting your requirements."
        }
    
    # Step 3: Create user preference vector using embeddings
    user_vector = create_user_preference_vector_with_embeddings(
        user_query, user_preferences, numerical_columns, embedding_model
    )
    
    # Step 4: Calculate similarity and recommend
    recommendations_df = calculate_similarity_and_recommend(filtered_df, user_vector, numerical_columns, top_n)
    
    # Step 5: Extract stock names and generate justification
    if recommendations_df.empty:
        return {
            "recommended_stocks": [],
            "justification": "No suitable stock recommendations could be generated based on your criteria."
        }
    
    # Get recommended stock names
    recommended_stocks = []
    if 'Company_Name' in recommendations_df.columns:
        recommended_stocks = recommendations_df['Company_Name'].tolist()
    elif 'Ticker' in recommendations_df.columns:
        recommended_stocks = recommendations_df['Ticker'].tolist()
    else:
        # Fallback to index if no company name or ticker columns
        recommended_stocks = recommendations_df.index.tolist()
    
    # Step 6: Generate justification using Groq LLM
    justification = generate_recommendation_justification(
        user_query, recommendations_df, user_preferences, groq_client
    )
    
    return {
        "answer": justification,
        "context": recommended_stocks,
        "query": user_query,
        "num_sources": top_n,
        "success": True
    }

In [16]:
user_query = "Recommend me 3 stocks that have a high yield, low risk and have an annualized return of at least 20%"

non_numerical_columns = ['Ticker', 'Company_Name', 'Sector', 'Industry', 'Country', 'Business_Summary', 'Sentiment', 'Update_Date']
numerical_columns = df_enriched_stock_data_w_sentiment.drop(non_numerical_columns, axis=1).columns.tolist()

# Call the function
recommendations = recommend_stocks_from_query(
    df=df_enriched_stock_data_w_sentiment,
    user_query=user_query,
    numerical_columns=numerical_columns,
    non_numerical_columns=non_numerical_columns,
    top_n=5,
    groq_api_key=groq_api
)

recommendations

Loading finance embeddings model...
Calculating similarity for 500 stocks
Top 5 recommendations found with similarity scores: [0.88373934 0.88216724 0.88062809 0.88017734 0.87981282]


"Based on the user's preferences, I am pleased to recommend the following three stocks:\n\n* Atmos Energy Corporation (Sector: Utilities, Industry: Utilities - Regulated Gas)\n* Southern Company (The) (Sector: Utilities, Industry: Utilities - Regulated Electric)\n* Colgate-Palmolive Company (Sector: Consumer Defensive, Industry: Household & Personal Products)\n\nThese stocks align with the user's criteria of high yield, low risk, and annualized return of at least 20%. Atmos Energy and Southern Company are both regulated utilities with a strong track record of stable cash flows and high dividend yields, making them attractive for dividend investors. Colgate-Palmolive, a consumer staples company, has a long history of consistent profitability and dividend payments, providing a relatively low-risk investment opportunity. The combination of these three stocks forms a well-diversified portfolio that balances income generation with moderate growth potential, aligning with the user's dividend