# Installing Libraries

In [None]:
!pip install --upgrade --quiet datasets pandas pymongo sentence_transformers python-dotenv

In [None]:
import warnings

# Suppress all warnings
# NOTE(review): with no category or module filter this hides every warning raised
# after this point, including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

# Data loading and preparation

In [None]:
import pandas as pd
from datasets import load_dataset

# Open the market-reports dataset in streaming mode (train split)
dataset = load_dataset(
    "MongoDB/fake_tech_companies_market_reports",
    split="train",
    streaming=True,
)

# Take at most the first 100 records and convert them to a pandas dataframe
dataset_df = pd.DataFrame(dataset.take(100))

In [None]:
# Show the loaded sample as the cell output
dataset_df

Unnamed: 0,recent_news,reports,company,ticker,key_metrics,sector
0,"[{'date': '2024-06-09', 'headline': 'CyberDefe...","[{'author': 'Taylor Smith, Technology Sector L...",CyberDefense Dynamics,CDDY,"{'52_week_range': {'high': 387.3, 'low': 41.63...",Information Technology
1,"[{'date': '2024-07-04', 'headline': 'CloudComp...","[{'author': 'Casey Jones, Chief Market Strateg...",CloudCompute Pro,CCPR,"{'52_week_range': {'high': 524.23, 'low': 171....",Information Technology
2,"[{'date': '2024-06-27', 'headline': 'VirtualRe...","[{'author': 'Sam Brown, Head of Equity Researc...",VirtualReality Systems,VRSY,"{'52_week_range': {'high': 530.59, 'low': 56.4...",Information Technology
3,"[{'date': '2024-07-06', 'headline': 'BioTech I...","[{'author': 'Riley Smith, Senior Tech Analyst'...",BioTech Innovations,BTCI,"{'52_week_range': {'high': 366.55, 'low': 124....",Information Technology
4,"[{'date': '2024-06-26', 'headline': 'QuantumCo...","[{'author': 'Riley Garcia, Senior Tech Analyst...",QuantumComputing Inc,QCMP,"{'52_week_range': {'high': 231.91, 'low': 159....",Information Technology
...,...,...,...,...,...,...
58,"[{'date': '2024-06-19', 'headline': 'CloudComp...","[{'author': 'Taylor Smith, Chief Market Strate...",CloudCompute Solutions,CCSL,"{'52_week_range': {'high': 597.38, 'low': 70.3...",Information Technology
59,"[{'date': '2024-06-24', 'headline': 'CloudSecu...","[{'author': 'Morgan Smith, Chief Market Strate...",CloudSecurity Pro,CSPR,"{'52_week_range': {'high': 502.09, 'low': 105....",Information Technology
60,"[{'date': '2024-07-22', 'headline': 'NanoTech ...","[{'author': 'Alex Davis, Senior Tech Analyst',...",NanoTech Solutions,NTSL,"{'52_week_range': {'high': 417.08, 'low': 121....",Information Technology
61,"[{'date': '2024-07-08', 'headline': 'TechInnov...","[{'author': 'Morgan Davis, Technology Sector L...",TechInnovate,TCIV,"{'52_week_range': {'high': 314.73, 'low': 125....",Information Technology


In [None]:
# Data Preparation
def combine_attributes(row):
    """Flatten one company record into a single text string.

    The result is built, in order, from the company name and sector, then the
    year, title, author and content of every entry in ``row['reports']``, then
    the headline and summary of every entry in ``row['recent_news']``. Each
    fragment is followed by one space, and leading/trailing whitespace is
    stripped from the final string.

    Args:
        row: Mapping-like record with 'company', 'sector', 'reports' and
            'recent_news' keys.

    Returns:
        str: The combined text.
    """
    fragments = [f"{row['company']} {row['sector']} "]

    # One fragment per report
    fragments.extend(
        f"{report['year']} {report['title']} {report['author']} {report['content']} "
        for report in row['reports']
    )

    # One fragment per recent news item
    fragments.extend(
        f"{news['headline']} {news['summary']} "
        for news in row['recent_news']
    )

    return "".join(fragments).strip()

In [None]:
# Add the new column 'combined_attributes'
# axis=1 makes apply() call combine_attributes once per row
dataset_df['combined_attributes'] = dataset_df.apply(combine_attributes, axis=1)

In [None]:
# Show the dataframe as the cell output to inspect the 'combined_attributes' column
dataset_df

Unnamed: 0,recent_news,reports,company,ticker,key_metrics,sector,combined_attributes
0,"[{'date': '2024-06-09', 'headline': 'CyberDefe...","[{'author': 'Taylor Smith, Technology Sector L...",CyberDefense Dynamics,CDDY,"{'52_week_range': {'high': 387.3, 'low': 41.63...",Information Technology,CyberDefense Dynamics Information Technology 2...
1,"[{'date': '2024-07-04', 'headline': 'CloudComp...","[{'author': 'Casey Jones, Chief Market Strateg...",CloudCompute Pro,CCPR,"{'52_week_range': {'high': 524.23, 'low': 171....",Information Technology,CloudCompute Pro Information Technology 2023 C...
2,"[{'date': '2024-06-27', 'headline': 'VirtualRe...","[{'author': 'Sam Brown, Head of Equity Researc...",VirtualReality Systems,VRSY,"{'52_week_range': {'high': 530.59, 'low': 56.4...",Information Technology,VirtualReality Systems Information Technology ...
3,"[{'date': '2024-07-06', 'headline': 'BioTech I...","[{'author': 'Riley Smith, Senior Tech Analyst'...",BioTech Innovations,BTCI,"{'52_week_range': {'high': 366.55, 'low': 124....",Information Technology,BioTech Innovations Information Technology 202...
4,"[{'date': '2024-06-26', 'headline': 'QuantumCo...","[{'author': 'Riley Garcia, Senior Tech Analyst...",QuantumComputing Inc,QCMP,"{'52_week_range': {'high': 231.91, 'low': 159....",Information Technology,QuantumComputing Inc Information Technology 20...
...,...,...,...,...,...,...,...
58,"[{'date': '2024-06-19', 'headline': 'CloudComp...","[{'author': 'Taylor Smith, Chief Market Strate...",CloudCompute Solutions,CCSL,"{'52_week_range': {'high': 597.38, 'low': 70.3...",Information Technology,CloudCompute Solutions Information Technology ...
59,"[{'date': '2024-06-24', 'headline': 'CloudSecu...","[{'author': 'Morgan Smith, Chief Market Strate...",CloudSecurity Pro,CSPR,"{'52_week_range': {'high': 502.09, 'low': 105....",Information Technology,CloudSecurity Pro Information Technology 2023 ...
60,"[{'date': '2024-07-22', 'headline': 'NanoTech ...","[{'author': 'Alex Davis, Senior Tech Analyst',...",NanoTech Solutions,NTSL,"{'52_week_range': {'high': 417.08, 'low': 121....",Information Technology,NanoTech Solutions Information Technology 2023...
61,"[{'date': '2024-07-08', 'headline': 'TechInnov...","[{'author': 'Morgan Davis, Technology Sector L...",TechInnovate,TCIV,"{'52_week_range': {'high': 314.73, 'low': 125....",Information Technology,TechInnovate Information Technology 2023 TechI...


# Embedding generation with GTE-Large

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load the GTE-Large embedding model used for both documents and queries.
# NOTE(review): trust_remote_code=True presumably allows Python code shipped in the
# model repository to run locally — confirm and review that code before enabling.
embedding_model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)


# Determine the maximum sequence length for the model
max_seq_length = embedding_model.max_seq_length

def chunk_text(text, tokenizer, max_length=8192, overlap=50):
    """
    Split the text into overlapping chunks based on token length.

    Args:
        text (str): Text to split.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``.
        max_length (int): Maximum number of tokens per chunk.
        overlap (int): Number of tokens shared by two consecutive chunks.

    Returns:
        list[str]: The chunk strings in order. Empty when the text produces no
        tokens.

    Raises:
        ValueError: If ``max_length`` is not positive, or ``overlap`` is not in
            the range ``0 <= overlap < max_length`` (the window could not advance).
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    tokens = tokenizer.tokenize(text)
    stride = max_length - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunk_tokens = tokens[start:start + max_length]
        chunks.append(tokenizer.convert_tokens_to_string(chunk_tokens))
        # Stop once a chunk reaches the end of the text; otherwise the next window
        # would contain only tokens already covered by the overlap, which would
        # make a text that fits in one chunk look as if it had been split.
        if start + max_length >= len(tokens):
            break
    return chunks


def get_embedding(text: str) -> list[float]:
    """
    Embed ``text`` with the module-level ``embedding_model``.

    The text is split with ``chunk_text`` using ``max_seq_length`` as the chunk
    size. If it fits in a single chunk the original string is embedded directly;
    otherwise every chunk is embedded and the chunk vectors are averaged
    element-wise.

    Args:
        text (str): Text to embed.

    Returns:
        list[float]: The embedding vector, or an empty list when ``text`` is empty
        or whitespace-only (a message is printed in that case).
    """
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    # Split text into chunks if it's too long, using the model's own tokenizer
    chunks = chunk_text(text, embedding_model.tokenizer, max_length=max_seq_length)

    if len(chunks) <= 1:
        # Text fits in one chunk (or produced no tokens): embed the string as-is
        embedding = embedding_model.encode(text)
    else:
        # encode() on a list returns a 2-D numpy array by default (one row per
        # chunk), so the array's own mean() averages the chunks. The previous
        # np.mean(...) call raised NameError because `np` is not imported in the
        # visible notebook.
        embedding = embedding_model.encode(chunks).mean(axis=0)

    return embedding.tolist()

# Apply the embedding function with a progress bar
# tqdm.pandas() must be called first: it is what makes progress_apply available
tqdm.pandas(desc="Generating embeddings")
dataset_df["embeddings"] = dataset_df['combined_attributes'].progress_apply(get_embedding)

Generating embeddings: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]


In [None]:
# Show the dataframe as the cell output to inspect the 'embeddings' column
dataset_df

Unnamed: 0,recent_news,reports,company,ticker,key_metrics,sector,combined_attributes,embeddings
0,"[{'date': '2024-06-09', 'headline': 'CyberDefe...","[{'author': 'Taylor Smith, Technology Sector L...",CyberDefense Dynamics,CDDY,"{'52_week_range': {'high': 387.3, 'low': 41.63...",Information Technology,CyberDefense Dynamics Information Technology 2...,"[0.9752911925315857, -0.5128046870231628, 0.02..."
1,"[{'date': '2024-07-04', 'headline': 'CloudComp...","[{'author': 'Casey Jones, Chief Market Strateg...",CloudCompute Pro,CCPR,"{'52_week_range': {'high': 524.23, 'low': 171....",Information Technology,CloudCompute Pro Information Technology 2023 C...,"[0.7592183351516724, -0.48120608925819397, -0...."
2,"[{'date': '2024-06-27', 'headline': 'VirtualRe...","[{'author': 'Sam Brown, Head of Equity Researc...",VirtualReality Systems,VRSY,"{'52_week_range': {'high': 530.59, 'low': 56.4...",Information Technology,VirtualReality Systems Information Technology ...,"[1.3058066368103027, -0.46789559721946716, 0.0..."
3,"[{'date': '2024-07-06', 'headline': 'BioTech I...","[{'author': 'Riley Smith, Senior Tech Analyst'...",BioTech Innovations,BTCI,"{'52_week_range': {'high': 366.55, 'low': 124....",Information Technology,BioTech Innovations Information Technology 202...,"[0.2537815570831299, -0.4768553078174591, 0.29..."
4,"[{'date': '2024-06-26', 'headline': 'QuantumCo...","[{'author': 'Riley Garcia, Senior Tech Analyst...",QuantumComputing Inc,QCMP,"{'52_week_range': {'high': 231.91, 'low': 159....",Information Technology,QuantumComputing Inc Information Technology 20...,"[0.3205525577068329, -0.8558152318000793, -0.7..."
...,...,...,...,...,...,...,...,...
58,"[{'date': '2024-06-19', 'headline': 'CloudComp...","[{'author': 'Taylor Smith, Chief Market Strate...",CloudCompute Solutions,CCSL,"{'52_week_range': {'high': 597.38, 'low': 70.3...",Information Technology,CloudCompute Solutions Information Technology ...,"[0.5261812806129456, -0.08563333749771118, -0...."
59,"[{'date': '2024-06-24', 'headline': 'CloudSecu...","[{'author': 'Morgan Smith, Chief Market Strate...",CloudSecurity Pro,CSPR,"{'52_week_range': {'high': 502.09, 'low': 105....",Information Technology,CloudSecurity Pro Information Technology 2023 ...,"[0.6943804621696472, -0.37386947870254517, -0...."
60,"[{'date': '2024-07-22', 'headline': 'NanoTech ...","[{'author': 'Alex Davis, Senior Tech Analyst',...",NanoTech Solutions,NTSL,"{'52_week_range': {'high': 417.08, 'low': 121....",Information Technology,NanoTech Solutions Information Technology 2023...,"[0.7160444259643555, -0.7745052576065063, -0.3..."
61,"[{'date': '2024-07-08', 'headline': 'TechInnov...","[{'author': 'Morgan Davis, Technology Sector L...",TechInnovate,TCIV,"{'52_week_range': {'high': 314.73, 'low': 125....",Information Technology,TechInnovate Information Technology 2023 TechI...,"[0.09791075438261032, -0.7621095180511475, 0.0..."


# MongoDB vector database and connection setup

In [None]:
import pymongo
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

def get_mongo_client(mongo_uri):
    """Establish and validate a connection to MongoDB.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        pymongo.MongoClient | None: A client that answered a ``ping`` command
        with ``ok == 1.0``, or ``None`` if the client could not be created or the
        ping failed. Failures are reported with a printed message.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri, appname="devrel.showcase.rag.cohere_mongodb.python")

        # Validate the connection
        ping_result = client.admin.command('ping')
        if ping_result.get('ok') == 1.0:
            # Connection successful
            print("Connection to MongoDB successful")
            return client
        # Previously this case returned None without any diagnostic
        print(f"Connection to MongoDB failed: unexpected ping response {ping_result}")
    except Exception as e:
        print(f"Connection to MongoDB failed: {e}")

    # Validation failed: release the client instead of leaving it open
    if client is not None:
        client.close()
    return None

# Get MONGO_URI from .env file
MONGO_URI = os.getenv("MONGO_URI")

# Raise instead of calling exit(): inside a notebook exit() acts on the kernel
# session rather than simply stopping this cell with a readable error.
if not MONGO_URI:
    raise RuntimeError("MONGO_URI not set in .env file")

DB_NAME = "asset_management_use_case"
COLLECTION_NAME = "market_reports"

mongo_client = get_mongo_client(MONGO_URI)

# Stop here on failure: previously execution continued with `db` and `collection`
# undefined, so later cells failed with an unrelated NameError.
if mongo_client is None:
    raise RuntimeError("Failed to connect to MongoDB.")

db = mongo_client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)
Connection to MongoDB successful


In [None]:
# Delete any existing records in the collection
# The empty filter {} matches every document, so the collection is emptied
collection.delete_many({})

DeleteResult({'n': 63, 'electionId': ObjectId('7fffffff0000000000000024'), 'opTime': {'ts': Timestamp(1733494125, 18), 't': 36}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1733494125, 18), 'signature': {'hash': b'\xb9\xe46;\x10\xf7\xfb0#\xf7\xa0P\x84\xe2u\xceQ\x1f\xdf~', 'keyId': 7413791028152369162}}, 'operationTime': Timestamp(1733494125, 18)}, acknowledged=True)

# Data ingestion

In [None]:
# Turn each dataframe row into a dict (one document per row) and insert them all
# with a single insert_many call
documents = dataset_df.to_dict('records')
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


# Vector Search

In [None]:
def vector_search(user_query, collection, limit=2, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
        user_query (str): The user's query string.
        collection (MongoCollection): The MongoDB collection to search.
        limit (int): Maximum number of matches to return. Defaults to 2.
        num_candidates (int): Number of candidate matches to consider.
            Defaults to 150.

    Returns:
        list: A list of matching documents, each with the ``company``,
        ``reports``, ``combined_attributes`` and ``score`` fields. Empty when no
        embedding could be generated for the query.
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding signals failure with an empty list, not None, so test for
    # emptiness. Return an empty list (not a message string) so callers can
    # always iterate over the result as documented.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",
            "queryVector": query_embedding,
            "path": "embeddings",
            "numCandidates": num_candidates,  # Number of candidate matches to consider
            "limit": limit,  # Return the top `limit` matches
        }
    }

    unset_stage = {
        "$unset": "embeddings"  # Exclude the 'embeddings' field from the results
    }

    project_stage = {
        "$project": {
            "_id": 0,  # Exclude the _id field
            "company": 1,  # Include the company field
            "reports": 1,  # Include the reports field
            "combined_attributes": 1,  # Include the combined_attributes field
            "score": {
                "$meta": "vectorSearchScore"  # Include the search score
            },
        }
    }

    pipeline = [vector_search_stage, unset_stage, project_stage]

    # Execute the search
    results = collection.aggregate(pipeline)
    return list(results)

# Handling user queries

In [None]:
def get_search_result(query, collection):
    """Run a vector search for ``query`` and format the hits as plain text.

    Args:
        query (str): The user's query string.
        collection: The MongoDB collection passed on to ``vector_search``.

    Returns:
        str: One newline-terminated line per hit, holding the hit's company and
        combined attributes ('N/A' replaces a missing field). Empty string when
        there are no hits.
    """
    hits = vector_search(query, collection)

    return "".join(
        f"Company: {hit.get('company', 'N/A')}, Combined Attributes: {hit.get('combined_attributes', 'N/A')}\n"
        for hit in hits
    )

In [None]:
# Conduct query with the retrieval of sources
query = "What companies have negative market reports or negative sentiment that might deter from investment in the long term"
source_information = get_search_result(query, collection)
# Build the text handed to the LLM: the query followed by the retrieved search results
combined_information = f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."

print(combined_information)

Query: What companies have negative market reports or negative sentiment that might deter from investment in the long term
Continue to answer the query by using the Search Results:
Company: DataStream Analytics, Combined Attributes: DataStream Analytics Information Technology 2023 DataStream Analytics (DSTA) - 2023 Market Analysis Avery Miller, Head of Equity Research # DataStream Analytics (DSTA) - Market Analysis Report 2023

## Overview:
DataStream Analytics (DSTA) is a software company specializing in data analytics and business intelligence solutions. The company offers a range of products and services that enable organizations to collect, analyze, and derive insights from their data. Heading into 2023, DSTA had established itself as a prominent player in the data analytics space, particularly in the North American market. The company's products are used across various industries, including healthcare, finance, and retail. 

## Key Highlights:

### Financial Performance:
- DSTA re

# Load Gemma 2 (2B)

In [None]:
from huggingface_hub import login

# Read the Hugging Face access token from the environment
hf_token = os.getenv("HF_TOKEN")

# Fail fast when no token is configured; otherwise authenticate with it
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")

login(token=hf_token)
print("Successfully logged in to Hugging Face!")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Successfully logged in to Hugging Face!


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Load the tokenizer and the causal-LM weights for the google/gemma-2-2b-it checkpoint.
# torch_dtype=torch.bfloat16 requests the weights in bfloat16.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", torch_dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def extract_model_response(response):
    """Return the model's turn from a decoded chat transcript.

    Takes the text after the last "<start_of_turn>model" marker, cuts it at the
    first "<end_of_turn>" marker (if any) and strips surrounding whitespace.

    Args:
        response (str): Decoded output including the turn markers.

    Returns:
        str: The model's reply, or "No model response found." when the
        transcript contains no model-turn marker.
    """
    turn_marker = "<start_of_turn>model"
    _, found, after_marker = response.rpartition(turn_marker)

    # No marker at all means the model never started its turn
    if not found:
        return "No model response found."

    # Keep only what precedes the first end-of-turn marker
    reply, _, _ = after_marker.strip().partition("<end_of_turn>")
    return reply.strip()

In [None]:
# Prepare the prompt for the model
chat = [
   { "role": "user", "content": combined_information },
]
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move model and inputs to the selected device
model = model.to(device)
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(device)

# Generate outputs using the model (sampling is enabled, so results vary between runs)
outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
)

# Decode the output tokens; the decoded text is later split on the
# "<start_of_turn>model" marker by extract_model_response
response = tokenizer.decode(outputs[0])

In [None]:
# Keep only the model's turn from the decoded text and print it
model_output = extract_model_response(response)
print(model_output)

This is a great start! You've provided a good overview of the companies and their performance. Here are some suggestions to strengthen your analysis and make it more insightful:

**1.  Depth and Specificity:**

* **Financial Performance:**  Go deeper into the financial figures.  Compare revenue growth year-over-year for each company. Break down the profitability (gross margin, operating margin) and provide a detailed explanation of how it relates to their business model. 
* **Market Share:** If possible, provide specific data on market share of each company within their respective niches. This helps understand their competitive position. 
* **Challenges:**  Elaborate on the challenges each company faces. For example, DataStream Analytics mentions talent acquisition and regulatory landscape. Provide more concrete examples of these challenges. 
* **Opportunities:** Highlight specific opportunities each company is pursuing. For example, QCSL mentions their hybrid computing platform and pa