In [20]:
from analyze_news import analyze_news
from utils.analysis.analyzer import get_llm
from dotenv import load_dotenv
import json
from tqdm import tqdm
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from nltk.tokenize import sent_tokenize
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


In [3]:
# Point python-dotenv at the .env file in the notebook's working directory.
env_path = ".env"
load_dotenv(env_path)

True

In [90]:
def chunker(text: str, chunk_size: int = 3):
    """Split ``text`` into chunks of consecutive sentences.

    The text is sentence-tokenized with ``sent_tokenize`` and the sentences
    are grouped in order, ``chunk_size`` at a time. The sentences of each
    group are joined with a single space. The last chunk holds whatever
    sentences are left over and may therefore be shorter than the others.

    Args:
        text: Text to split.
        chunk_size: Number of sentences per chunk. Must be at least 1.

    Returns:
        list[str]: The chunks in their original order; an empty list when the
        tokenizer yields no sentences.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # Fail fast with a clear message: a step of 0 would make range() raise an
    # unrelated-looking error, and a negative step would silently return [].
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    sentences = sent_tokenize(text)
    return [
        ' '.join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

def create_vec_db(chunks: list):
    """Build a Chroma vector store holding one document per chunk.

    Each entry of ``chunks`` becomes the ``page_content`` of a ``Document``;
    the resulting list is handed to ``Chroma.from_documents``.

    NOTE(review): no embedding is passed to ``Chroma.from_documents`` here, so
    whatever default that call applies is used -- confirm this is intended.

    Args:
        chunks: The text chunks to store.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    wrapped = [Document(page_content=piece) for piece in chunks]
    return Chroma.from_documents(wrapped)

def do_ner(text: str):
    """Ask the LLM to mark named entities inline in ``text``.

    The prompt below is filled with ``text`` and sent to the model returned by
    ``get_llm()``. It instructs the model to repeat the input unchanged while
    appending the entity category in parentheses after every organization,
    person and location it finds.

    Args:
        text: The text to annotate.

    Returns:
        The chain's response object, unmodified. The notebook reads the
        annotated text from its ``.content`` attribute.
    """
    ner_instructions = """
        You are an expert in Named Entity Recognition (NER). Your task is to identify and mark entities in the given text without changing or omitting any part of the original text. The possible entities are: [organization, person, location]

        Instructions:
        1. Read the input text carefully.
        2. Identify all instances of organizations, persons, and locations.
        3. Mark each entity by placing its category in parentheses immediately after the entity name.
        4. Do not change, rephrase, or omit any part of the original text.
        5. If a sentence doesn't contain any of the specified entities, return it unchanged.
        6. Process the entire input text, sentence by sentence.

        Example:
        Input: "The natural biomolecules, available in liquid form, is administered to plants and is a targeted intervention during the plant's growth, aimed to prevent any kind of crop loss. (File Express photo) The rising instances of untimely or heavy rainfall, and fluctuating temperatures are among the weather vagaries that have plagued Indian farmers in recent times. Such inclement weather not only damages standing crops but also severely hampers seasonal yield, affecting their overall farm income. In order to help farmers better deal with the impact of changing weather, city-based Bioprime Agrisolutions has developed environment-friendly bio-molecules capable of making the crops more climate resilient."

        Output:
        "The natural biomolecules, available in liquid form, is administered to plants and is a targeted intervention during the plant's growth, aimed to prevent any kind of crop loss. (File Express photo) The rising instances of untimely or heavy rainfall, and fluctuating temperatures are among the weather vagaries that have plagued Indian (location) farmers in recent times. Such inclement weather not only damages standing crops but also severely hampers seasonal yield, affecting their overall farm income. In order to help farmers better deal with the impact of changing weather, city-based Bioprime Agrisolutions (organization) has developed environment-friendly bio-molecules capable of making the crops more climate resilient."

        Now, process the following text:
        {text}
    """
    ner_prompt = PromptTemplate.from_template(ner_instructions)
    model = get_llm()

    # The raw input is passed straight through into the prompt's {text} slot,
    # and the rendered prompt is then piped to the model.
    ner_chain = {"text": RunnablePassthrough()} | ner_prompt | model
    return ner_chain.invoke(text)

def do_classification(text: str):
    """Ask the LLM which actor/object relation pattern ``text`` expresses.

    The prompt below lists four relation patterns (combinations of actor,
    object, ``affects`` / ``affected_by`` and positive / negative) and asks
    the model returned by ``get_llm()`` to pick one for ``text``.

    Args:
        text: The text to classify.

    Returns:
        The ``.content`` attribute of the chain's response.
    """
    classification_instructions = """
        You are an expert in text classification. Your task is to classify the given text into one of the following categories: 
        1. actor - affects (positive / negative) - object 
        2. object - affected_by (positive / negative) - actor
        3. actor - affects (positive / negative) - actor
        4. actor - affected_by (positive / negative) - actor 

    Now, classify the following text:
    {text}
    """
    classification_prompt = PromptTemplate.from_template(classification_instructions)
    model = get_llm()

    # The raw input is passed straight through into the prompt's {text} slot,
    # and the rendered prompt is then piped to the model.
    classification_chain = {"text": RunnablePassthrough()} | classification_prompt | model
    answer = classification_chain.invoke(text)
    return answer.content

In [76]:
# Load both JSON inputs. The encoding is given explicitly so that decoding the
# article text does not depend on the platform's locale default.
with open("agriculture_news.json", "r", encoding="utf-8") as f:
    news = json.load(f)

with open("../dev/full_news_1.json", "r", encoding="utf-8") as f:
    full_news = json.load(f)

# Pick a single sample entry and look up its full article through the entry's
# 'news_idx' key.
news_ = news[5]
text = full_news[news_['news_idx']]['article']

In [84]:
# Run the NER prompt over the selected article, then split the annotated text
# (the response's .content) into chunks using chunker's default chunk size.
ner = do_ner(text)
chunks = chunker(ner.content)

In [85]:
# Display the list of annotated chunks produced above.
chunks

['Doppler radars at Kufri (location) and (right) Mukteshwar (location). (Courtesy: IMD)\nUnion Minister for Earth Sciences Dr Harsh Vardhan (person) Friday commissioned two of the ten indigenously built Doppler weather radars which will closely monitor the weather changes over the Himalayas. Services of X-band Doppler radars at Mukteshwar (location) in Uttarakhand (location) and Kufri (location) in Himachal Pradesh (location) were virtually inaugurated on the 146th foundation day of the India Meteorological Department (IMD) (organization) on Friday.',
 'Covering the central and western Himalayas (location), these dual polarised radars will gather atmospheric variations and pick signals of extreme weather events, IMD (organization) officials said. During the month ahead, the Met department has planned to install ten radars over the Himalayas (location). Both Uttarakhand (location) and Himachal Pradesh (location) are highly prone to cloud bursts, landslides, heavy rain and snowfall.',
 '

In [91]:
# Classify only the first chunk and print the model's answer as returned.
classification = do_classification(chunks[0])
print(classification)

Object - affected_by (positive) - actor

Explanation:
The text describes the Doppler radars at Kufri and Mukteshwar being commissioned and inaugurated by Union Minister for Earth Sciences Dr Harsh Vardhan. Therefore, the object (Doppler radars) is being affected positively by the actor (Union Minister for Earth Sciences Dr Harsh Vardhan).
