In [1]:
# === Required Libraries ===
import pymongo
import pandas as pd
import json
import re
from sentence_transformers import SentenceTransformer
from ollama import Client as OllamaClient

# === MongoDB Setup ===
db_username = 'user_name'  # Replace with your MongoDB username
db_password = 'password_5'  # Replace with your MongoDB password
URI = f'mongodb+srv://{db_username}:{db_password}@cluster.eid6hdp.mongodb.net/'
client = pymongo.MongoClient(URI)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client.list_database_names()

['project', 'admin', 'local']

In [3]:
client.project.list_collection_names()

['news_articles', 'companies']

In [4]:
pd.DataFrame(client.project.news_articles.find().limit(2))

Unnamed: 0,_id,contentType,title,summary,canonicalUrl,pubDate,article_text,symbol,embedding
0,685e0a53fa5cf591ef626ce9,STORY,Billionaire predicts 'Fantastic 40' companies ...,Apple and Alphabet are not on the list.,{'url': 'https://www.thestreet.com/crypto/mark...,2025-06-26T23:10:00Z,Please enable JS and disable any ad blocker,NVDA,"[-0.003624762175604701, 0.03351498395204544, 0..."
1,685e0a53fa5cf591ef626ce6,VIDEO,"Tech trade check-in: Nvidia record high, Micro...",Nvidia (NVDA) stock rose in premarket trading ...,{'url': 'https://finance.yahoo.com/video/tech-...,2025-06-26T13:44:14Z,Nvidia (NVDA) stock rose in premarket trading ...,NVDA,"[0.028737712651491165, 0.05020241066813469, -0..."


In [5]:
list(client.project.news_articles.list_indexes())

[SON([('v', 2), ('key', SON([('_id', 1)])), ('name', '_id_')]),
 SON([('v', 2), ('key', SON([('symbol', 1), ('pubDate', 1), ('title', 1)])), ('name', 'symbol_1_pubDate_1_title_1'), ('unique', True)])]

In [6]:
list(client.project.news_articles.list_search_indexes())

[{'id': '685ed0b915a012707b1762aa',
  'name': 'article_vector_index',
  'type': 'vectorSearch',
  'status': 'READY',
  'queryable': True,
  'latestDefinitionVersion': {'version': 0,
   'createdAt': datetime.datetime(2025, 6, 27, 17, 11, 21, 859000)},
  'latestDefinition': {'fields': [{'type': 'vector',
     'path': 'embedding',
     'numDimensions': 768,
     'similarity': 'cosine'}]},
  'statusDetail': [{'hostname': 'atlas-15q7xs-shard-00-00',
    'status': 'READY',
    'queryable': True,
    'mainIndex': {'status': 'READY',
     'queryable': True,
     'definitionVersion': {'version': 0,
      'createdAt': datetime.datetime(2025, 6, 27, 17, 11, 21)},
     'definition': {'fields': [{'type': 'vector',
        'path': 'embedding',
        'numDimensions': 768,
        'similarity': 'cosine'}]}}},
   {'hostname': 'atlas-15q7xs-shard-00-01',
    'status': 'READY',
    'queryable': True,
    'mainIndex': {'status': 'READY',
     'queryable': True,
     'definitionVersion': {'version': 0,
 

In [7]:
! pip install einops
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)



<All keys matched successfully>


In [8]:
news_text = pd.DataFrame(client.project.news_articles.find()).article_text
news_text

0             Please enable JS and disable any ad blocker
1       Nvidia (NVDA) stock rose in premarket trading ...
2       Tech and financial stocks rose, helping send t...
3       Nvidia (NVDA) climbs higher, hitting fresh rec...
4       Nvidia (NVDA, Financials) surged to a record h...
                              ...                        
5007    Warner Bros. Discovery's (WBD) announcement th...
5008    The stocks in this article are all trading nea...
5009    Significantly high institutional ownership imp...
5010    (Reuters) -An Australian regulator has initiat...
5011    News Corp has been treading water for the past...
Name: article_text, Length: 5012, dtype: object

In [19]:
def sector_analysis(sector_name, rows=10, fields=None):
    """
    Aggregate and summarize news articles related to a given sector using vector similarity.

    Parameters:
        sector_name (str): The name of the sector to analyze (e.g., "Technology", "Healthcare").
        rows (int): Number of top articles to retrieve.
        fields (list, optional): Fields to include in the result (default includes most useful ones).

    Returns:
        pd.DataFrame: DataFrame containing top sector-related articles.
    """

    # Default fields to project if not provided
    if fields is None:
        fields = {
            "symbol": 1,
            "title": 1,
            "summary": 1,
            "article_text": 1,
            "pubDate": 1,
            "sector": 1,
            "score": {"$meta": "vectorSearchScore"},
            "_id": 0
        }

    # Step 1: Generate semantic embedding for the sector name
    sector_embedding = embedding_model.encode(sector_name).tolist()

    # Step 2: Build the vector search aggregation pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "article_vector_index",
                "path": "embedding",
                "queryVector": sector_embedding,
                "numCandidates": 2000,
                "limit": rows
            }
        },
        {
            "$project": fields
        }
    ]

    # Step 3: Execute the query
    cursor = client.project.news_articles.aggregate(pipeline)
    results = pd.DataFrame(cursor)

    # Step 4: Handle empty result set
    if results.empty:
        print(f"No results found for sector: {sector_name}")
    
    return results


In [20]:
query = "How is artificial intelligence transforming the technology sector and impacting company performance and valuations?"
results = sector_analysis(query, 300)
results

Unnamed: 0,title,summary,pubDate,article_text,symbol,score
0,Is C3.ai Stock the Next NVIDIA and a Buy?,"AI stock surges on strong FY 2025 growth, big-...",2025-06-26T19:00:00Z,NVIDIA Corporation‚Äôs NVDA data center graphics...,XOM,0.830337
1,Jabil (JBL) Taps AI Infrastructure Boom to Boo...,Manufacturing solutions provider Jabil Inc. (J...,2025-06-25T23:49:00Z,Manufacturing solutions provider Jabil Inc. (J...,JBL,0.828928
2,3 IT Services Stocks to Buy Right Now From a P...,The Zacks Computers - IT Services Industry par...,2025-04-09T15:03:00Z,The Zacks Computers ‚Äì IT Services industry par...,JKHY,0.827650
3,Zacks Industry Outlook Highlights Jack Henry &...,"Jack Henry & Associates, Science Applications ...",2025-04-10T09:00:00Z,"Chicago, IL ‚Äì April 10, 2025 ‚Äì Today, Zacks Eq...",JKHY,0.825083
4,DELL's CSG Revenues Rise: Is an Improving PC M...,Dell Technologies CSG revenues increase 5% yea...,2025-06-20T16:00:00Z,Dell Technologies DELL AI prospects remain str...,LOW,0.822168
...,...,...,...,...,...,...
295,Keysight Technologies (NYSE:KEYS) Faces DoJ An...,Keysight Technologies (NYSE:KEYS) has been nav...,2025-06-03T18:00:46Z,Keysight Technologies has been navigating sign...,KEYS,0.782743
296,ON Semiconductor Corp. (ON) Outpaces Stock Mar...,The latest trading day saw ON Semiconductor Co...,2025-06-10T21:50:13Z,ON Semiconductor Corp. (ON) closed the most re...,ON,0.782615
297,Is Innodata Stock's 4.65X PS Still Worth it? B...,INOD trades at a premium despite underperformi...,2025-06-16T15:15:00Z,"Innodata INOD shares are trading at a premium,...",BR,0.782581
298,"Scotiabank Raises PT on Kinder Morgan (KMI), M...","Kinder Morgan, Inc. (NYSE:KMI) is one of the 1...",2025-06-25T20:59:36Z,"Kinder Morgan, Inc. (NYSE:KMI) is one of the 1...",KMI,0.782483


In [21]:
# === Prompt Persona and Instructions ===
persona = (
    "You are a market intelligence strategist analyzing trends in stock market news "
    "to help investment analysts make data-driven decisions."
)

cot_instruction = """
Follow these steps for your analysis:
1. Identify the key facts and entities from the news articles
2. Explain how these facts relate to recent movements in stock prices or investor behavior
3. Summarize the overall trend and provide a brief market insight or conclusion
"""

output_format = """
Return your analysis as JSON:
{
    "topic": "...",
    "summary": "...",
    "trend": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "key_entities": [],
    "notable_quotes": []
}
"""


In [22]:
import json
import re


def extract_json_from_text(text):
    #Look for JSON objects (starting with { and ending with })
    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    matches = re.findall(json_pattern, text)


    for match in matches:
        try: 
            return json.loads(match)
        except json.JSONDecodeError:
            continue
    return None

In [None]:
# Using the official OpenAI-style client
from ollama import Client
from pprint import pprint
client = Client(host='http://localhost:11434')
# Generate a response
response = client.chat(model='gemma3', messages=[
    {'role': 'user', 'content': 'what is pi?'}
])
pprint(response['message']['content'])

("Okay, let's break down what pi (œÄ) is. It's a truly fascinating and "
 'surprisingly complex number!\n'
 '\n'
 '**Simply Put:**\n'
 '\n'
 "Pi (œÄ) is the ratio of a circle's circumference (the distance around the "
 'circle) to its diameter (the distance across the circle through its '
 'center).\n'
 '\n'
 "**Here's a more detailed explanation:**\n"
 '\n'
 '* **Circumference:** Think of drawing a line around the entire outside of a '
 'circle. That distance is the circumference.\n'
 '* **Diameter:**  The diameter is a straight line that passes through the '
 "circle's center, connecting two points on the circle's edge.\n"
 '\n'
 '**The Formula:**\n'
 '\n'
 '*  Circumference = œÄ * Diameter\n'
 '*  œÄ = Circumference / Diameter\n'
 '\n'
 '**The Value of Pi:**\n'
 '\n'
 "* **It's an Irrational Number:** This means that its decimal representation "
 "goes on forever without repeating.  You can't write it as a simple fraction "
 '(like 1/4 or 22/7).\n'
 '* **Approximate Values:**\n'
 ' 

**SUMMARY OF THE INDUSTRY**

In [36]:
import pandas as pd
import re

# === 1. Simple regex-based sentence tokenizer ===
def simple_sent_tokenize(text):
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return sentences

# === 2. Summarization function ===
def summarize(text, max_sentences=3):
    if not text:
        return ""
    try:
        sentences = simple_sent_tokenize(text)
        return " ".join(sentences[:max_sentences])
    except Exception as e:
        print(f"Summarization error: {e}")
        return text[:300]  # fallback

# === 3. Final summary generation function ===
def generate_final_summary(context_chunks):
    full_text = " ".join(context_chunks)
    sentences = simple_sent_tokenize(full_text)
    return " ".join(sentences[:5])

# === 4. Context Strategies ===
def truncate_context(articles, max_chars=2000):
    return [article[:max_chars] for article in articles]

def summarize_then_include(articles):
    return [summarize(article) for article in articles]

def hybrid_context(articles, important_n=2):
    context = articles[:important_n]
    context += [summarize(a) for a in articles[important_n:]]
    return context

# === 5. Apply context strategy ===
def apply_context_strategy(df, config):
    texts = df["article_text"].fillna("").tolist()
    
    if config["context"] == "truncate":
        return truncate_context(texts, max_chars=config.get("max_chars", 2000))
    elif config["context"] == "summarize":
        return summarize_then_include(texts)
    elif config["context"] == "hybrid":
        return hybrid_context(texts, important_n=config.get("important_n", 2))
    else:
        return texts

# === 6. Sector analysis query function ===
def sector_analysis(sector_name, rows=10, fields=None):
    if fields is None:
        fields = {
            "symbol": 1,
            "title": 1,
            "summary": 1,
            "article_text": 1,
            "pubDate": 1,
            "sector": 1,
            "score": {"$meta": "vectorSearchScore"},
            "_id": 0
        }

    # Step 1: Generate embedding
    sector_embedding = embedding_model.encode(sector_name).tolist()

    # Step 2: Build pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "article_vector_index",
                "path": "embedding",
                "queryVector": sector_embedding,
                "numCandidates": 2000,
                "limit": rows
            }
        },
        {
            "$project": fields
        }
    ]

    # Step 3: Execute
    cursor = client.project.news_articles.aggregate(pipeline)
    results = pd.DataFrame(cursor)

    if results.empty:
        print(f"No results found for sector: {sector_name}")
    return results

# === 7. Run experiments ===
def run_sector_analysis_experiments(sector_name):
    configs = [
        {"top_k": 3, "context": "truncate", "max_chars": 500},
        {"top_k": 10, "context": "summarize"},
        {"top_k": 15, "context": "hybrid", "important_n": 3}
    ]

    all_results = []

    for config in configs:
        print("\n--- Testing config ---")
        print(config)

        df = sector_analysis(sector_name, rows=config["top_k"])

        if df.empty:
            print("No results. Skipping this config.")
            continue

        context_chunks = apply_context_strategy(df, config)
        summary = generate_final_summary(context_chunks)

        print("üìù Summary:\n", summary)

        all_results.append({
            "config": config,
            "summary": summary
        })

    return all_results

# === 8. Execute ===
results = run_sector_analysis_experiments("Renewable Energy")



--- Testing config ---
{'top_k': 3, 'context': 'truncate', 'max_chars': 500}
üìù Summary:
 FirstEnergy Corporation FE announced that its subsidiaries, Mon Power and Potomac Edison, have completed their third utility-scale solar site in West Virginia to help meet the state's electricity needs.FE‚Äôs solar projects help meet the growing need for power and boost American manufacturing. The company‚Äôs solar facilities are an expanding component of its commitment to making sure that its customers have the proper mix and quantity of energy to satisfy their daily demands. The project‚Äôs more tha FirstEnergy Corporation FE announced that its subsidiaries, Mon Power and Potomac Edison, have completed their third utility-scale solar site in West Virginia to help meet the state's electricity needs.FE‚Äôs solar projects help meet the growing need for power and boost American manufacturing. The company‚Äôs solar facilities are an expanding component of its commitment to making sure that its cus

**FUTURE ANALYSIS OF THE INDUSTRY**

In [None]:
import pandas as pd
import re

# === Sentence splitter (no NLTK) ===
def simple_sent_tokenize(text):
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return sentences

# === Basic summarizer ===
def summarize(text, max_sentences=3):
    if not text:
        return ""
    try:
        sentences = simple_sent_tokenize(text)
        return " ".join(sentences[:max_sentences])
    except Exception as e:
        print(f"Summarization error: {e}")
        return text[:300]

# === Final summary from multiple chunks ===
def generate_final_summary(chunks):
    full_text = " ".join(chunks)
    sentences = simple_sent_tokenize(full_text)
    return " ".join(sentences[:5])

# === Context strategies ===
def truncate_context(articles, max_chars=2000):
    return [article[:max_chars] for article in articles]

def summarize_then_include(articles):
    return [summarize(article) for article in articles]

def hybrid_context(articles, important_n=2):
    context = articles[:important_n]
    context += [summarize(a) for a in articles[important_n:]]
    return context

def apply_context_strategy(df, config):
    texts = df["article_text"].fillna("").tolist()
    if config["context"] == "truncate":
        return truncate_context(texts, max_chars=config.get("max_chars", 2000))
    elif config["context"] == "summarize":
        return summarize_then_include(texts)
    elif config["context"] == "hybrid":
        return hybrid_context(texts, important_n=config.get("important_n", 2))
    else:
        return texts


# === Vector search function with refined industry-level future trends query ===
def future_industry_articles(industry_name, rows=10, fields=None):
    if fields is None:
        fields = {
            "symbol": 1,
            "title": 1,
            "summary": 1,
            "article_text": 1,
            "pubDate": 1,
            "sector": 1,
            "score": {"$meta": "vectorSearchScore"},
            "_id": 0
        }

    query = (
        f"Future trends and forecasts in the {industry_name} industry, "
        f"including sector-wide challenges, innovations, growth drivers, "
        f"market outlook, and industry-level analysis. "
        f"Exclude company-specific news or earnings reports."
    )

    query_embedding = embedding_model.encode(query).tolist()

    pipeline = [
        {
            "$vectorSearch": {
                "index": "article_vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": 2000,
                "limit": rows
            }
        },
        # Optional: filter to exclude company-specific articles if you have such metadata
        # {
        #     "$match": {"company_name": {"$exists": False}}
        # },
        {
            "$project": fields
        }
    ]

    cursor = client.project.news_articles.aggregate(pipeline)
    results = pd.DataFrame(cursor)
    return results

# === Main experimentation pipeline for future predictions ===
def run_future_predictions(industry_name):
    configs = [
        {"top_k": 5, "context": "truncate", "max_chars": 500},
        {"top_k": 10, "context": "summarize"},
        {"top_k": 15, "context": "hybrid", "important_n": 2}
    ]

    all_summaries = []

    for config in configs:
        print("\n--- CONFIG ---")
        print(config)

        df = future_industry_articles(industry_name, rows=config["top_k"])

        if df.empty:
            print("‚ö†Ô∏è No future-oriented articles found.")
            continue

        context_chunks = apply_context_strategy(df, config)
        summary = generate_final_summary(context_chunks)

        print("\nüß† Future Prediction Summary:\n", summary)

        all_summaries.append({
            "config": config,
            "summary": summary
        })

    return all_summaries

# === Example usage ===
results = run_future_predictions("Technology")



--- CONFIG ---
{'top_k': 5, 'context': 'truncate', 'max_chars': 500}

üß† Future Prediction Summary:
 Chicago, IL ‚Äì May 29, 2025 ‚Äì Today, Zacks Equity Research discusses AMETEK AME, Itron ITRI and Cognex CGNX. Link: https://www.zacks.com/commentary/2477738/3-electronics-testing-stocks-to-watch-from-a-challenging-industry
The Zacks Electronics ‚Äì Testing Equipment industry has been suffering from a challenging global macroeconomic environment, end-market volatility, unfavorable forex and growing geo-political tensions. The sluggish automotive sector, due to declining investments in electric vehicl Chicago, IL ‚Äì June 2, 2025 ‚Äì Today, Zacks Equity Research discusses Skyworks SWKS and Qorvo QRVO. Industry: Semiconductor - Radio Frequency
Link: https://www.zacks.com/commentary/2479644/2-radio-frequency-stocks-to-watch-in-a-prospering-industry
The Zacks Semiconductors - Radio Frequency industry participants like Skyworks and Qorvo are benefiting from the continued adoption of WiFi

**BEST OPPOTUNITIES**

In [42]:
import pandas as pd
import re

# === Sentence splitter (no NLTK) ===
def simple_sent_tokenize(text):
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return sentences

# === Basic summarizer ===
def summarize(text, max_sentences=3):
    if not text:
        return ""
    try:
        sentences = simple_sent_tokenize(text)
        return " ".join(sentences[:max_sentences])
    except Exception as e:
        print(f"Summarization error: {e}")
        return text[:300]

# === Final summary from multiple chunks ===
def generate_final_summary(chunks):
    full_text = " ".join(chunks)
    sentences = simple_sent_tokenize(full_text)
    return " ".join(sentences[:5])

# === Context strategies ===
def truncate_context(articles, max_chars=2000):
    return [article[:max_chars] for article in articles]

def summarize_then_include(articles):
    return [summarize(article) for article in articles]

def hybrid_context(articles, important_n=2):
    context = articles[:important_n]
    context += [summarize(a) for a in articles[important_n:]]
    return context

def apply_context_strategy(df, config):
    texts = df["article_text"].fillna("").tolist()
    if config["context"] == "truncate":
        return truncate_context(texts, max_chars=config.get("max_chars", 2000))
    elif config["context"] == "summarize":
        return summarize_then_include(texts)
    elif config["context"] == "hybrid":
        return hybrid_context(texts, important_n=config.get("important_n", 2))
    else:
        return texts

# === Vector search function for investment trend queries ===
def investment_trend_articles(industry_name="Technology", rows=10, fields=None):
    if fields is None:
        fields = {
            "symbol": 1,
            "title": 1,
            "summary": 1,
            "article_text": 1,
            "pubDate": 1,
            "sector": 1,
            "score": {"$meta": "vectorSearchScore"},
            "_id": 0
        }

    # Craft a focused query about investment trends in the sector
    query = f"Biggest investment trends and opportunities in the {industry_name} sector, venture capital, funding, growth areas"

    # Generate embedding for the query
    query_embedding = embedding_model.encode(query).tolist()

    pipeline = [
        {
            "$vectorSearch": {
                "index": "article_vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": 2000,
                "limit": rows
            }
        },
        {
            "$project": fields
        }
    ]

    cursor = client.project.news_articles.aggregate(pipeline)
    results = pd.DataFrame(cursor)
    return results

# === Hyperparameter tuning to find best configuration ===
def run_investment_trend_analysis(industry_name="Technology"):
    configs = [
        {"top_k": 5, "context": "truncate", "max_chars": 500},
        {"top_k": 10, "context": "summarize"},
        {"top_k": 15, "context": "hybrid", "important_n": 2}
    ]

    summaries = []

    for config in configs:
        print("\n--- CONFIGURATION ---")
        print(config)

        df = investment_trend_articles(industry_name, rows=config["top_k"])

        if df.empty:
            print("‚ö†Ô∏è No investment trend articles found for this configuration.")
            continue

        context_chunks = apply_context_strategy(df, config)
        summary = generate_final_summary(context_chunks)

        print("üß† Investment Trend Summary:\n", summary)

        summaries.append({
            "config": config,
            "summary": summary
        })

    return summaries

# === Example Run ===
# Make sure your embedding_model and client (MongoDB client) are initialized before running this
results = run_investment_trend_analysis("Technology")



--- CONFIGURATION ---
{'top_k': 5, 'context': 'truncate', 'max_chars': 500}
üß† Investment Trend Summary:
 Carlyle Group Inc. CG has announced a collaboration with Citigroup Inc. C to expand asset-backed financing opportunities within the fintech specialty lending space. Both companies have formalized a framework to exchange market intelligence and explore co-investment and financing opportunities to align strategic objectives and deepen integration. The collaboration will integrate Carlyle‚Äôs extensive investment network with the expertise of Citigroup‚Äôs Spread Products Investment in Technologies (SP Designed to provide broad exposure to the Technology - Software segment of the equity market, the Invesco AI and Next Gen Software ETF (IGPT) is a passively managed exchange traded fund launched on 06/23/2005.

--- CONFIGURATION ---
{'top_k': 10, 'context': 'summarize'}
üß† Investment Trend Summary:
 Carlyle Group Inc. CG has announced a collaboration with Citigroup Inc. C to expand