### Imports

In [1]:
from googlesearch import search
import yfinance as yf
import os
from langchain.prompts import ChatPromptTemplate

import asyncio
import getpass
import os
from datetime import datetime
from hashlib import md5
from typing import Dict, List

import pandas as pd
import tiktoken
from langchain_community.graphs import Neo4jGraph
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from pydantic import BaseModel, Field

from datetime import datetime

### Web Scraping

Financials - Yahoo Finance

In [2]:
ticker = input("What is the Ticker of the Stock?: ")

stock_ticker = yf.Ticker(ticker)
df_balancesheet = stock_ticker.balance_sheet
balance_sheet_json = df_balancesheet.to_json()

df_incomestmt = stock_ticker.income_stmt
incomestmt_json = df_incomestmt.to_json()

df_cashflow = stock_ticker.cash_flow
cashflow_json = df_cashflow.to_json()


Latest News - GoogleSearch, Latest Filings

In [3]:
stock_name = input("Enter the Stock Name: ")
def get_top_links(query, num_results=1):
    top_10_links = []
    search_results = search(query, num_results=num_results)
    
    for idx, result in enumerate(search_results, start=1):
        top_10_links.append(result)
    
    return top_10_links

search_news = stock_name + "latest stock news"
search_latest_opinion = stock_name + "latest financial analysis"
global_financial_news = "Latest News Financial Markets International"
latest_filings = stock_name + "Latest Legal Filings / Reporting"

top_10_news_links = get_top_links(search_news)
top_10_blogs_links = get_top_links(search_latest_opinion)
top_10_global_links = get_top_links(global_financial_news)
top_10_filings_links =  get_top_links(latest_filings)
top_10_news_links.append('https://tradingeconomics.com/stream')

all_links_combined = (
    top_10_news_links + 
    top_10_blogs_links + 
    top_10_global_links + 
    top_10_filings_links
)

In [18]:
all_links_combined

['https://www.moneycontrol.com/india/stockpricequote/it-servicesconsulting/varaniumcloud/VC10',
 'https://tradingeconomics.com/stream',
 'https://www.tofler.in/cloudraft-technologies-private-limited/company/U72200MP2022PTC063851',
 'https://www.reuters.com/markets/',
 'https://support.google.com/legal/troubleshooter/1114905?hl=en']

Extracting Useful Text From Raw Scraped Text

In [4]:
from groq import Groq
client = Groq(
    api_key= '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',
)

PROMPT_TEMPLATE = """
Prompt: Extract only the useful information from the below text, 
Do not Include Links, Always include plain text full of useful maximum impact information in concise manner.
{context}
"""

def call_groq(scraped_text_chunk):
    chat_completion = client.chat.completions.create(
    messages=[
            {
                "role": "user",
                "content": scraped_text_chunk,
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content

Passing it through a Web Scraper to Extract Text

In [5]:
from firecrawl import FirecrawlApp
import requests 

app = FirecrawlApp(api_key="fc-23cb2edcab4e4ba7a0221dfdd5ef4883")
useful_text = []

def chunk_text(text, size=50000):
    return [text[i:i + size] for i in range(0, len(text), size)]

for link in all_links_combined:
    try:
        scrape_result = app.scrape_url(link, params={'formats': ['markdown', 'html']})
        large_text = scrape_result['markdown']
        chunked_text = chunk_text(large_text)

        for chunk in chunked_text:
            prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
            prompt = prompt_template.format(context=chunk)
            response_text = call_groq(prompt)

            useful_text.append(response_text)
            print(response_text)
    
    except requests.exceptions.HTTPError as e:
        print(f"Request failed for {link}: {e}. Skipping...")
        continue  
    except Exception as e:
        print(f"An unexpected error occurred for {link}: {e}. Skipping...")
        continue  

An unexpected error occurred for https://www.moneycontrol.com/india/stockpricequote/it-servicesconsulting/varaniumcloud/VC10: Error code: 400 - {'error': {'message': 'Please reduce the length of the messages or completion.', 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}. Skipping...
Here are the useful information in concise manner:

**Economy and Credit Rating**

* Moody's downgraded Belgium's credit rating outlook to negative due to concerns over its deficit trajectory and political uncertainty.
* Fitch placed France on a negative outlook on October 11th, citing fiscal policy risks and increased fiscal slippage.

**Commodities**

* Agricultural commodities:
	+ Cheese prices surged by 7.30% and 7.49% in the last 24 hours.
	+ Orange juice and cocoa prices also rose.
* Energy commodities:
	+ Natural gas prices dropped by 1.76% and 1.38% in the last 24 hours.
	+ Oil prices fell slightly.
* Metals:
	+ Copper prices rose by 1.39% and 1.34% in the

In [6]:
whole_text = ""
for text in useful_text:
    whole_text += text

Input the Useful Scraped Data Into Neo4j

In [7]:
useful_text_list = chunk_text(whole_text)

In [8]:
from langchain_community.graphs import Neo4jGraph

os.environ["NEO4J_URI"] = "neo4j+s://0de88855.databases.neo4j.io:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "VvHhsur-pTOfen4lYtsCjaAg6Kovq0a1CjJ34G543Xs"

## Bh75snEPQV6s8St1BbjMWjHX7R9VdnSobxpWnzgY350

graph = Neo4jGraph(refresh_schema=False)

graph.query("CREATE CONSTRAINT IF NOT EXISTS FOR (c:Chunk) REQUIRE c.id IS UNIQUE")
graph.query("CREATE CONSTRAINT IF NOT EXISTS FOR (c:AtomicFact) REQUIRE c.id IS UNIQUE")
graph.query("CREATE CONSTRAINT IF NOT EXISTS FOR (c:KeyElement) REQUIRE c.id IS UNIQUE")
graph.query("CREATE CONSTRAINT IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE")

[]

In [9]:
construction_system = """
You are now an intelligent assistant tasked with meticulously extracting both key elements and
atomic facts from a long text.
1. Key Elements: The essential nouns (e.g., characters, times, events, places, numbers), verbs (e.g.,
actions), and adjectives (e.g., states, feelings) that are pivotal to the text’s narrative.
2. Atomic Facts: The smallest, indivisible facts, presented as concise sentences. These include
propositions, theories, existences, concepts, and implicit elements like logic, causality, event
sequences, interpersonal relationships, timelines, etc.
Requirements:
#####
1. Ensure that all identified key elements are reflected within the corresponding atomic facts.
2. You should extract key elements and atomic facts comprehensively, especially those that are
important and potentially query-worthy and do not leave out details.
3. Whenever applicable, replace pronouns with their specific noun counterparts (e.g., change I, He,
She to actual names).
4. Ensure that the key elements and atomic facts you extract are presented in the same language as
the original text (e.g., English or Chinese).
"""

construction_human = """Use the given format to extract information from the 
following input: {input}"""

construction_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            construction_system,
        ),
        (
            "human",
            (
                "Use the given format to extract information from the "
                "following input: {input}"
            ),
        ),
    ]
)

In [10]:
class AtomicFact(BaseModel):
    key_elements: List[str] = Field(description="""The essential nouns (e.g., characters, times, events, places, numbers), verbs (e.g.,
actions), and adjectives (e.g., states, feelings) that are pivotal to the atomic fact's narrative.""")
    atomic_fact: str = Field(description="""The smallest, indivisible facts, presented as concise sentences. These include
propositions, theories, existences, concepts, and implicit elements like logic, causality, event
sequences, interpersonal relationships, timelines, etc.""")

class Extraction(BaseModel):
    atomic_facts: List[AtomicFact] = Field(description="List of atomic facts")

In [11]:
from langchain_groq import ChatGroq
os.environ["GROQ_API_KEY"] = "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"
model = ChatGroq(model="gemma2-9b-it", temperature=0.1)
structured_llm = model.with_structured_output(Extraction)

construction_chain = construction_prompt | structured_llm

In [12]:
import_query = """
MERGE (d:Document {id:$document_name})
WITH d
UNWIND $data AS row
MERGE (c:Chunk {id: row.chunk_id})
SET c.text = row.chunk_text,
    c.index = row.index,
    c.document_name = row.document_name
MERGE (d)-[:HAS_CHUNK]->(c)
WITH c, row
UNWIND row.atomic_facts AS af
MERGE (a:AtomicFact {id: af.id})
SET a.text = af.atomic_fact
MERGE (c)-[:HAS_ATOMIC_FACT]->(a)
WITH c, a, af
UNWIND af.key_elements AS ke
MERGE (k:KeyElement {id: ke})
MERGE (a)-[:HAS_KEY_ELEMENT]->(k)
"""

def encode_md5(text):
    return md5(text.encode("utf-8")).hexdigest()

In [13]:
async def process_document(text, document_name, chunk_size=1000, chunk_overlap=200):
    start = datetime.now()
    print(f"Started extraction at: {start}")
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_text(text)
    print(f"Total text chunks: {len(texts)}")
    tasks = [
        asyncio.create_task(construction_chain.ainvoke({"input":chunk_text}))
        for index, chunk_text in enumerate(texts)
    ]
    results = await asyncio.gather(*tasks)
    print(f"Finished LLM extraction after: {datetime.now() - start}")
    docs = [el.dict() for el in results]
    for index, doc in enumerate(docs):
        doc['chunk_id'] = encode_md5(texts[index])
        doc['chunk_text'] = texts[index]
        doc['index'] = index
        for af in doc["atomic_facts"]:
            af["id"] = encode_md5(af["atomic_fact"])

    graph.query(import_query, 
            params={"data": docs, "document_name": document_name})
    graph.query("""MATCH (c:Chunk)<-[:HAS_CHUNK]-(d:Document)
WHERE d.id = $document_name
WITH c ORDER BY c.index WITH collect(c) AS nodes
UNWIND range(0, size(nodes) -2) AS index
WITH nodes[index] AS start, nodes[index + 1] AS end
MERGE (start)-[:NEXT]->(end)
""",
           params={"document_name":document_name})
    print(f"Finished import at: {datetime.now() - start}")

In [14]:
await process_document(whole_text, "Apple", chunk_size=700, chunk_overlap=100)

Started extraction at: 2024-10-12 22:47:29.247946
Total text chunks: 2
Finished LLM extraction after: 0:00:04.411894
Finished import at: 0:00:05.123528


In [15]:
query = """
MATCH (af:AtomicFact)
RETURN af.text LIMIT 10
"""
results = graph.query(query)
for record in results:
    print(f"Atomic Fact: {record['af.text']}")

Atomic Fact: Moody's downgraded Belgium's credit rating outlook to negative due to concerns over its deficit trajectory and political uncertainty.
Atomic Fact: Fitch placed France on a negative outlook on October 11th, citing fiscal policy risks and increased fiscal slippage.
Atomic Fact: Cheese prices surged by 7.30% and 7.49% in the last 24 hours.
Atomic Fact: Orange juice and cocoa prices also rose.
Atomic Fact: Natural gas prices dropped by 1.76% and 1.38% in the last 24 hours.
Atomic Fact: Oil prices fell slightly.
Atomic Fact: Copper prices rose by 1.39% and 1.34% in the last 24 hours.
Atomic Fact: US stocks rallied on Friday, with the S&P 500 and Dow Jones hitting new highs.
Atomic Fact: Canadian stocks closed higher on Friday, with the S&P/TSX Composite Index adding 0.7%.
Atomic Fact: European stocks, such as the IBEX 35 and CAC 40, also rose.


In [16]:
query = """
MATCH (k:KeyElement)
RETURN k.id LIMIT 10
"""
results = graph.query(query)
for record in results:
    print(f"Key Element ID: {record['k.id']}")

Key Element ID: Moody's
Key Element ID: Belgium
Key Element ID: credit rating outlook
Key Element ID: negative
Key Element ID: deficit trajectory
Key Element ID: political uncertainty
Key Element ID: Fitch
Key Element ID: France
Key Element ID: negative outlook
Key Element ID: October 11th


In [17]:
keyword = "Random"
query = """
MATCH (k:KeyElement)<-[:HAS_KEY_ELEMENT]-(af:AtomicFact)
WHERE toLower(k.id) CONTAINS toLower($keyword)
RETURN k.id AS key_element, af.text AS atomic_fact
LIMIT 10
"""
results = graph.query(query, {"keyword": keyword})
for record in results:
    print(f"Key Element: {record['key_element']}, Atomic Fact: {record['atomic_fact']}")
