In [195]:
import pandas as pd
import os
import cohere
from datasets import load_dataset
from dotenv import load_dotenv

In [196]:
# Loading all the environment variables
load_dotenv()
cohere_api_key = os.getenv("COHERE_API_KEY")
hf_token = os.getenv("HF_TOKEN")
mongodb_uri = os.getenv("MONGO_URI")

In [201]:
co = cohere.Client(api_key=cohere_api_key)

In [202]:
dataset = load_dataset("MongoDB/fake_tech_companies_market_reports", split="train", streaming=True)
dataset_df = dataset.take(100)

In [203]:
# Convert the dataset to a pandas dataframe
dataset_df = pd.DataFrame(dataset_df)
print(dataset_df.head(5))

                                         recent_news  \
0  [{'date': '2024-06-09', 'headline': 'CyberDefe...   
1  [{'date': '2024-07-04', 'headline': 'CloudComp...   
2  [{'date': '2024-06-27', 'headline': 'VirtualRe...   
3  [{'date': '2024-07-06', 'headline': 'BioTech I...   
4  [{'date': '2024-06-26', 'headline': 'QuantumCo...   

                                             reports                 company  \
0  [{'author': 'Taylor Smith, Technology Sector L...   CyberDefense Dynamics   
1  [{'author': 'Casey Jones, Chief Market Strateg...        CloudCompute Pro   
2  [{'author': 'Sam Brown, Head of Equity Researc...  VirtualReality Systems   
3  [{'author': 'Riley Smith, Senior Tech Analyst'...     BioTech Innovations   
4  [{'author': 'Riley Garcia, Senior Tech Analyst...    QuantumComputing Inc   

  ticker                                        key_metrics  \
0   CDDY  {'52_week_range': {'high': 387.3, 'low': 41.63...   
1   CCPR  {'52_week_range': {'high': 524.23, 'low': 171.

In [204]:
dataset_df.columns

Index(['recent_news', 'reports', 'company', 'ticker', 'key_metrics', 'sector'], dtype='object')

In [205]:
dataset_df['reports'][0]

[{'author': 'Taylor Smith, Technology Sector Lead',
  'content': '**CyberDefense Dynamics (CDDY) - 2023 Market Analysis Report**\n\n**Overview:**\nCyberDefense Dynamics (CDDY) is a leading provider of cybersecurity solutions, offering a range of products and services to protect businesses and government agencies from digital threats. In 2023, CDDY continued to strengthen its market position and delivered solid performance despite a challenging economic environment. This report provides an in-depth analysis of CDDY\'s performance in 2023, highlighting its key achievements, financial performance, challenges, and future prospects.\n\n**Key Highlights:**\n\n- **Financial Performance:** CDDY reported strong financial results for 2023. The company\'s revenue increased by 20% year-over-year, driven by robust demand for its core cybersecurity products and services. Profit margins also improved, with a 5% increase in net income compared to 2022. This growth can be attributed to CDDY\'s diverse 

In [206]:
# Data Preparation
def combine_attributes(row):
  combined = f"{row['company']} {row['sector']}"  
  # Add reports information
  for report in row['reports']:
    combined += f"{report['year']} {report['title']} {report['author']} {report['content']}"
  # Add recent news information
  for news in row['recent_news']:
    combined += f"{news['headline']} {news['summary']}"
  return combined.strip()

In [207]:
# Add the new column 'combined_attributes'
dataset_df['combined_attributes'] = dataset_df.apply(combine_attributes, axis=1)

In [208]:
# Display the first few rows of the updated dataframe
dataset_df[['company', 'ticker', 'combined_attributes']].head()

Unnamed: 0,company,ticker,combined_attributes
0,CyberDefense Dynamics,CDDY,CyberDefense Dynamics Information Technology20...
1,CloudCompute Pro,CCPR,CloudCompute Pro Information Technology2023 Cl...
2,VirtualReality Systems,VRSY,VirtualReality Systems Information Technology2...
3,BioTech Innovations,BTCI,BioTech Innovations Information Technology2023...
4,QuantumComputing Inc,QCMP,QuantumComputing Inc Information Technology202...


In [209]:
dataset_df['combined_attributes']

0     CyberDefense Dynamics Information Technology20...
1     CloudCompute Pro Information Technology2023 Cl...
2     VirtualReality Systems Information Technology2...
3     BioTech Innovations Information Technology2023...
4     QuantumComputing Inc Information Technology202...
                            ...                        
58    CloudCompute Solutions Information Technology2...
59    CloudSecurity Pro Information Technology2023 C...
60    NanoTech Solutions Information Technology2023 ...
61    TechInnovate Information Technology2023 TechIn...
62    RoboticsFuture Information Technology2023 Robo...
Name: combined_attributes, Length: 63, dtype: object

In [210]:
from tqdm import tqdm

In [211]:
def get_embedding(text: str, input_type: str="search_document") -> list[float]:
  if not text.strip():
    print("Attempted to get embedding for empty text.")
    return []

  model = "embed-english-v3.0"
  response = co.embed(
    texts=[text],
    model=model,
    input_type=input_type, # Used for embeddings of search queries run against a vector DB to find relevant documents
    embedding_types=['float']
  )

  return response.embeddings.float[0]

# Apply the embedding function with a progress bar
tqdm.pandas(desc="Generating embeddings")
dataset_df["embedding"] = dataset_df['combined_attributes'].progress_apply(get_embedding)

print(f"We just computed {len(dataset_df['embedding'])} embeddings.")

Generating embeddings: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]

We just computed 63 embeddings.





In [212]:
dataset_df.head()

Unnamed: 0,recent_news,reports,company,ticker,key_metrics,sector,combined_attributes,embedding
0,"[{'date': '2024-06-09', 'headline': 'CyberDefe...","[{'author': 'Taylor Smith, Technology Sector L...",CyberDefense Dynamics,CDDY,"{'52_week_range': {'high': 387.3, 'low': 41.63...",Information Technology,CyberDefense Dynamics Information Technology20...,"[0.013557434, -0.033691406, -0.018447876, -0.0..."
1,"[{'date': '2024-07-04', 'headline': 'CloudComp...","[{'author': 'Casey Jones, Chief Market Strateg...",CloudCompute Pro,CCPR,"{'52_week_range': {'high': 524.23, 'low': 171....",Information Technology,CloudCompute Pro Information Technology2023 Cl...,"[-0.057525635, -0.06390381, -0.038269043, -0.0..."
2,"[{'date': '2024-06-27', 'headline': 'VirtualRe...","[{'author': 'Sam Brown, Head of Equity Researc...",VirtualReality Systems,VRSY,"{'52_week_range': {'high': 530.59, 'low': 56.4...",Information Technology,VirtualReality Systems Information Technology2...,"[0.025650024, -0.020706177, -0.016921997, -0.0..."
3,"[{'date': '2024-07-06', 'headline': 'BioTech I...","[{'author': 'Riley Smith, Senior Tech Analyst'...",BioTech Innovations,BTCI,"{'52_week_range': {'high': 366.55, 'low': 124....",Information Technology,BioTech Innovations Information Technology2023...,"[0.0234375, -0.040496826, -0.0044822693, -0.05..."
4,"[{'date': '2024-06-26', 'headline': 'QuantumCo...","[{'author': 'Riley Garcia, Senior Tech Analyst...",QuantumComputing Inc,QCMP,"{'52_week_range': {'high': 231.91, 'low': 159....",Information Technology,QuantumComputing Inc Information Technology202...,"[-0.0069847107, -0.046844482, 0.039398193, 0.0..."


In [213]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Create a new client and connect to the server
mongo_client = MongoClient(mongodb_uri, appname="rag.cohere_mongodb.python", server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    mongo_client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [214]:
#mongo_client = MongoClient(mongodb_uri, appname="rag.cohere_mongodb.python")

DB_NAME = "asset_management_use_case"
COLLECTION_NAME = "market_reports"

db = mongo_client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)

In [215]:
# Delete any existing records in the collection
collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff000000000000002a'), 'opTime': {'ts': Timestamp(1729009489, 10), 't': 42}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1729009489, 10), 'signature': {'hash': b'\xb5\xf7a\xdc\x1fc\x08\xd8\xcf\x9d3\x08#%3\xb2&?\xb0\x14', 'keyId': 7364016862612422665}}, 'operationTime': Timestamp(1729009489, 10)}, acknowledged=True)

In [216]:
documents = dataset_df.to_dict('records')
collection.insert_many(documents)

print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [217]:
def vector_search(user_query, collection):

  # Generate embedding for the user query
  query_embedding = get_embedding(user_query, input_type="search_query")

  if query_embedding is None:
    return "Invalid query or embedding generation failed."

  # Define the vector search pipeline
  vector_search_stage = {
    "$vectorSearch": {
      "index": "vector_index",
      "queryVector": query_embedding,
      "path": "embedding",
      "numCandidates": 150,  # Number of candidate matches to consider
      "limit": 5  # Return top 4 matches
    }
  }

  unset_stage = {
    "$unset": "embedding"  # Exclude the 'embedding' field from the results
  }

  project_stage = {
    "$project": {
      "_id": 0,  # Exclude the _id field
      "company": 1,  # Include the plot field
      "reports": 1,  # Include the title field
      "combined_attributes": 1, # Include the genres field
      "score": {
        "$meta": "vectorSearchScore"  # Include the search score
      }
    }
  }

  pipeline = [vector_search_stage, unset_stage, project_stage]

  # Execute the search
  results = collection.aggregate(pipeline)
  return list(results)

In [218]:

def rerank_documents(query: str, documents, top_n: int = 3):
    # Perform reranking with Cohere ReRank Model
    try:
        response = co.rerank(
            model="rerank-english-v3.0",
            query=query,
            documents=documents,
            top_n=top_n,
            rank_fields=["company", "reports", "combined_attributes"]
        )

        # Extract the top reranked documents
        top_documents_after_rerank = []
        for result in response.results:
            original_doc = documents[result.index]
            top_documents_after_rerank.append({
                'company': original_doc['company'],
                'combined_attributes': original_doc['combined_attributes'],
                'reports': original_doc['reports'],
                'vector_search_score': original_doc['score'],
                'relevance_score': result.relevance_score
            })

        return top_documents_after_rerank

    except Exception as e:
        print(f"An error occurred during reranking: {e}")
        return documents[:top_n]  # Return top N documents without reranking

In [219]:
import pprint

query = "What companies have negative market reports or negative sentiment that might deter from investment in the long term"

get_knowledge = vector_search(query, collection)
pd.DataFrame(get_knowledge).head()

Unnamed: 0,reports,company,combined_attributes,score
0,"[{'author': 'Jordan Garcia, Senior Tech Analys...",GreenEnergy Corp,GreenEnergy Corp Information Technology2023 Gr...,0.660881
1,"[{'author': 'Morgan Smith, Technology Sector L...",BioTech Therapeutics,BioTech Therapeutics Information Technology202...,0.649082
2,"[{'author': 'Casey Davis, Technology Sector Le...",RenewableEnergy Innovations,RenewableEnergy Innovations Information Techno...,0.648021
3,"[{'author': 'Casey Davis, Head of Equity Resea...",BioTech Engineering,BioTech Engineering Information Technology2023...,0.646297
4,"[{'author': 'Morgan Williams, Senior Tech Anal...",BioEngineering Corp,BioEngineering Corp Information Technology2023...,0.645909


In [220]:
reranked_documents = rerank_documents(query, get_knowledge)
pd.DataFrame(reranked_documents).head()

Unnamed: 0,company,combined_attributes,reports,vector_search_score,relevance_score
0,GreenEnergy Corp,GreenEnergy Corp Information Technology2023 Gr...,"[{'author': 'Jordan Garcia, Senior Tech Analys...",0.660881,0.000147
1,BioEngineering Corp,BioEngineering Corp Information Technology2023...,"[{'author': 'Morgan Williams, Senior Tech Anal...",0.645909,6.4e-05
2,BioTech Engineering,BioTech Engineering Information Technology2023...,"[{'author': 'Casey Davis, Head of Equity Resea...",0.646297,5.4e-05


In [221]:
def format_documents_for_chat(documents):
  return [
    {
        "company": doc['company'],
        # "reports": doc['reports'],
        "combined_attributes": doc['combined_attributes']
    }
    for doc in documents
  ]

# Generating response with Cohere Command R
response = co.chat(
  message=query,
  documents=format_documents_for_chat(reranked_documents),
  model="command-r-plus",
  temperature=0.3
)

print("Final answer:")
print(response.text)


Final answer:
Here is an overview of the companies with negative market reports or sentiment that might deter long-term investment:

## GreenEnergy Corp (GRNE):
- **Challenges**: Despite impressive performance, GRNE faces challenges due to a volatile political environment and rising trade tensions, resulting in increased tariffs and supply chain hurdles.
- **Competitive Market**: The company operates in a highly competitive renewable energy sector, requiring constant innovation to adapt to evolving technologies and consumer preferences.

## BioEngineering Corp (BENC):
- **Regulatory Hurdles**: BENC faces delays in obtaining approvals for certain products due to stringent healthcare regulations, impacting their time-to-market.
- **Reimbursement and Pricing Pressures**: Rising healthcare costs and pressure from payers and governments for lower prices present challenges for BENC's pricing strategies and profitability.
- **Research and Development Expenses**: In 2025, BENC faced significan

In [222]:
for cite in response.citations:
  print(cite)

start=122 end=145 text='GreenEnergy Corp (GRNE)' document_ids=['doc_0']
start=151 end=161 text='Challenges' document_ids=['doc_0']
start=228 end=284 text='volatile political environment and rising trade tensions' document_ids=['doc_0']
start=299 end=342 text='increased tariffs and supply chain hurdles.' document_ids=['doc_0']
start=347 end=365 text='Competitive Market' document_ids=['doc_0']
start=395 end=437 text='highly competitive renewable energy sector' document_ids=['doc_0']
start=449 end=528 text='constant innovation to adapt to evolving technologies and consumer preferences.' document_ids=['doc_0']
start=533 end=559 text='BioEngineering Corp (BENC)' document_ids=['doc_1']
start=565 end=583 text='Regulatory Hurdles' document_ids=['doc_1']
start=598 end=648 text='delays in obtaining approvals for certain products' document_ids=['doc_1']
start=656 end=688 text='stringent healthcare regulations' document_ids=['doc_1']
start=706 end=721 text='time-to-market.' document_ids=['doc_1']


In [229]:
from typing import Dict, Optional, List

class CohereChat:

    def __init__(self, cohere_client, system: str = "", database: str = "cohere_chat",
                 main_collection: str = "main_collection", history_params: Optional[Dict[str, str]] = None):
      self.co = cohere_client
      self.system = system
      self.history_params = history_params or {}

      # Use the connection string from history_params
      #self.client = MongoClient(self.history_params.get('mongodb_uri', 'mongodb://localhost:27017/'))
      self.client = MongoClient(self.history_params.get('connection_string', 'mongodb://localhost:27017/'))

      # Use the database parameter
      self.db = self.client[database]

      # Use the main_collection parameter
      self.main_collection = self.db[main_collection]

      # Use the history_collection from history_params, or default to "chat_history"
      self.history_collection = self.db[self.history_params.get('history_collection', 'chat_history')]

      # Use the session_id from history_params, or default to "default_session"
      self.session_id = self.history_params.get('session_id', 'default_session')

    def add_to_history(self, message: str, prefix: str = ""):
      self.history_collection.insert_one({
        'session_id': self.session_id,
        'message': message,
        'prefix': prefix
      })

    def get_chat_history(self) -> List[Dict[str, str]]:
      history = self.history_collection.find({'session_id': self.session_id}).sort('_id', 1)
      return [{"role": "user" if item['prefix'] == "USER" else "chatbot", "message": item['message']} for item in history]

    def rerank_documents(self, query: str, documents: List[Dict], top_n: int = 3) -> List[Dict]:
      rerank_docs = [  # creating a new list of docs keeping only 'company' and 'combined attributes' field
          {
            'company': doc['company'],
            'combined_attributes': doc['combined_attributes']
          }
          for doc in documents
          if doc['combined_attributes'].strip() # filters out documents with empty 'combined_attributes'.
      ]

      if not rerank_docs: # Returns an empty list if there are no valid documents to rerank.
          print("No valid documents to rerank.")
          return []

      try:
          response = self.co.rerank(        # calling the rerank function again
              query=query,
              documents=rerank_docs,
              top_n=top_n,
              model="rerank-english-v3.0",
              rank_fields=["company", "combined_attributes"]
          )

          top_documents_after_rerank = [   # creating a new list of the top reranked documents
              {
                  'company': rerank_docs[result.index]['company'],
                  'combined_attributes': rerank_docs[result.index]['combined_attributes'],
                  'relevance_score': result.relevance_score
              }
              for result in response.results
          ]

          print(f"\nHere are the top {top_n} documents after rerank:")
          for doc in top_documents_after_rerank:
              print(f"== {doc['company']} (Relevance: {doc['relevance_score']:.4f})")

          return top_documents_after_rerank

      except Exception as e:
          print(f"An error occurred during reranking: {e}")
          return documents[:top_n]

    def format_documents_for_chat(self, documents: List[Dict]) -> List[Dict]: # pre-processing step
      return [
        {
          "company": doc['company'],
          "combined_attributes": doc['combined_attributes']
        }
        for doc in documents
      ]

    def send_message(self, message: str, vector_search_func) -> str:
      self.add_to_history(message, "USER") # Stores the user's message in the chat history.

      # Perform vector search
      search_results = vector_search_func(message, self.main_collection)

      # Rerank the search results
      reranked_documents = self.rerank_documents(message, search_results)

      # Format documents for chat
      formatted_documents = self.format_documents_for_chat(reranked_documents)

      # Generate response using Cohere chat
      response = self.co.chat(
        chat_history=self.get_chat_history(),
        message=message,
        documents=formatted_documents,
        model="command-r-plus",
        temperature=0.3
      )

      result = response.text # extracts the text from the response
      self.add_to_history(result, "CHATBOT") # adds the chatbot's response to the chat history

      print("Final answer:")
      print(result)

      print("\nCitations:")
      for cite in response.citations:
        print(cite)

      return result

    def show_history(self):
      history = self.history_collection.find({'session_id': self.session_id}).sort('_id', 1)
      for item in history:
        print(f"{item['prefix']}: {item['message']}")
        print("-------------------------")


In [230]:
# Initialize CohereChat
chat = CohereChat(
    co,
    system="You are a helpful assistant taking on the role of an Asset Manager focused on tech companies.",
    database=DB_NAME,
    main_collection=COLLECTION_NAME,
    history_params={
        'connection_string': mongodb_uri,
        'history_collection': "chat_history",
        'session_id': 2
    }
)

# Send a message
response = chat.send_message("What is the best investment to make why?", vector_search)


Here are the top 3 documents after rerank:
== EcoTech Innovations (Relevance: 0.0001)
== SmartRetail Tech (Relevance: 0.0001)
== GreenEnergy Systems (Relevance: 0.0001)
Final answer:
I am an AI assistant and cannot give financial advice. However, I can tell you about some companies that have been recommended as 'buy' investments.

## EcoTech Innovations (ETIN)
EcoTech Innovations is a leading provider of sustainable technology solutions, specialising in renewable energy and environmentally friendly products. In 2023, the company demonstrated resilience and adaptability, achieving solid results despite challenging market conditions. ETIN's diverse product portfolio, innovative capabilities, and the growing demand for sustainable solutions have contributed to its success. The company's financial performance has been impressive, with a 15% revenue growth year-over-year in 2023. 

## SmartRetail Tech (SRTY)
SmartRetail Tech is a leading provider of innovative retail technology solutions, 

In [231]:
# Show chat history
chat.show_history()

USER: What is the best investment to make why?
-------------------------
CHATBOT: I am an AI assistant and cannot give financial advice. However, I can tell you about some companies that have been recommended as 'buy' investments.

## EcoTech Innovations (ETIN)
EcoTech Innovations is a leading provider of sustainable technology solutions, specialising in renewable energy and environmentally friendly products. In 2023, the company demonstrated resilience and adaptability, achieving solid results despite challenging market conditions. ETIN's diverse product portfolio, innovative capabilities, and the growing demand for sustainable solutions have contributed to its success. The company's financial performance has been impressive, with a 15% revenue growth year-over-year in 2023. 

## SmartRetail Tech (SRTY)
SmartRetail Tech is a leading provider of innovative retail technology solutions, empowering retailers with advanced tools to enhance their operations and customer experience. SRTY has