In [3]:
!pip install ollama -q

In [2]:
import ollama

In [4]:
# Name of the Ollama model that query_chunk() passes to ollama.chat().
model_name = "gemma2"

In [None]:
import os
import sqlite3
import logging
import time
from tqdm import tqdm

# Start timing
start_time = time.time()

# SECURITY: a Hugging Face API token used to be hard-coded on this line.
# Secrets must never live in source or notebooks: revoke that token and, if a
# key is really needed, export HUGGINGFACE_API_KEY in the environment before
# starting the kernel. Nothing in the visible code reads this variable (model
# calls go through the ollama client).
# NOTE(review): confirm no later cell depends on it being set here.

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')





def query_chunk(chunk):
    """Ask the model which physical assets, locations and ownerships a text chunk mentions.

    Builds a one-shot prompt (instructions, one worked example, then the
    chunk itself), sends it as a single user message through
    ``ollama.chat`` using the module-level ``model_name``, and filters the
    reply with ``cleanup_generated_text``.

    The prompt and the cleaned reply are printed to stdout between dashed
    separator lines.

    Args:
        chunk: The piece of document text to analyse.

    Returns:
        The cleaned model reply with leading/trailing whitespace stripped.
    """
    prompt = (
        "You are a virtual assistant with advanced expertise in a broad spectrum of topics, equipped to utilize high-level critical thinking, cognitive skills, creativity, and innovation.\n"
        "Your goal is to deliver the most straightforward and accurate answer possible for each question, ensuring high-quality and useful responses for the user.\n"
        "A physical asset is a tangible resource that a company owns and uses in the production of goods and services. Ensure that a geographical location or region is never considered as an asset.\n"
        "A financial asset or other non-physical asset should never be included as a physical asset. Examples of financial assets include equity commitments, corporate facilities, accounts receivable, and short-term investments. Never include these in the list of physical assets.\n"
        # The example uses the same "Text:\nQuery:\n<answer>" layout as the
        # real request below. The newlines after the header and after the
        # example text were missing, which fused the three parts together.
        "Here is an example:\n"
        "Text: [...] Our principal asset is the Grasberg mine, which we discovered in 1988. Grasberg contains the largest single gold reserve and one of the largest copper reserves of any mine in the world, located in Sudirman Mountain Range, Papua in Indonesia. Our principal operating subsidiary is PT Freeport Indonesia, a limited liability company organized under the laws of the Republic of Indonesia and incorporated in Delaware. [...]\n"
        "Query: Does this text mention any physical assets, locations, and ownerships?\n"
        "physical assets: [Grasberg mine]\nlocations: [Sudirman Mountain Range, Papua, Indonesia]\nownerships: [PT Freeport Indonesia, Freeport-McMoRan Copper & Gold Inc., Government of Indonesia]\n\n"
        "relationships: [asset: 'Grasberg mine', location: 'Sudirman Mountain Range, Papua, Indonesia', ownership: 'PT Freeport Indonesia']\n\n"

        "Now, let's analyze the following text:\n"
        f"Text: {chunk}\nQuery: Let's think step-by-step. Does this text mention any physical assets, locations, and ownerships?\n"
        "If yes, please specify them in the following format:\n"
        "physical assets: [ ]\nlocations: [ ]\nownerships: [ ]\n\n"
        "Additionally, identify the relationships between them, specifying the location of each physical asset and the ownership details. "
        "Format the relationships as follows:\nrelationships: [asset: '', location: '', ownership: '']"
    )

    print(100*'-')
    print(f"Prompt: {prompt}")

    response = ollama.chat(model=model_name, messages=[{'role': 'user', 'content': prompt}])
    generated_text = response['message']['content']

    cleaned_text = cleanup_generated_text(generated_text)
    print(cleaned_text)
    print(100*'-')
    return cleaned_text.strip()

def cleanup_generated_text(text):
    """Remove conversational filler lines from generated text.

    The text is split on newlines and every line containing one of the
    unwanted phrases (case-insensitive) is dropped; the remaining lines are
    re-joined with newlines in their original order.

    Phrases are matched as whole words/phrases, not as raw substrings.
    Plain substring matching made short entries such as "ha", "hug", "nice"
    or "wave" discard legitimate answer lines that merely contained them
    inside another word (e.g. "Shale", "Oklahoma", "has", "huge").

    Args:
        text: Raw model output.

    Returns:
        The text without the filtered lines.
    """
    import re  # function-scope import: keeps this block self-contained

    unwanted_phrases = (
        "assistant", "You're welcome", "ha", "okay", "nice", "goodbye", "thank you",
        "ahem", "we're", "I'll", "That's", "sounds", "agreed", "chatty", "wave",
        "handshake", "hug", "laughter", "applause", "mic drop", "explosion",
        "fireworks", "finale", "I'm done", "let's just stop", "same to you",
        "mission accomplished", "excellent", "guilty as charged", "my secret's safe with you",
        "under wraps", "fun little chat", "see you next time", "have a great day",
        "virtual smile", "smile", "wrapped this up",
    )
    # The lookarounds require the phrase not to be glued to a neighbouring
    # word character, so "ha" matches "ha ha" but not "Shale" or "that".
    unwanted = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(phrase) for phrase in unwanted_phrases) + r")(?!\w)",
        re.IGNORECASE,
    )
    return "\n".join(line for line in text.split("\n") if not unwanted.search(line))


def save_results_to_file(file_path, results):
    """Write chunk results to a UTF-8 text file, replacing any existing file.

    Each ``(chunk_id, result)`` pair becomes one record: a ``Chunk ID:``
    header line, the result text, and a blank separator line. An INFO log
    entry is emitted once the file has been written.

    Args:
        file_path: Destination path of the text file.
        results: Iterable of ``(chunk_id, result)`` pairs.
    """
    records = (f"Chunk ID: {chunk_id}\n{result}\n\n" for chunk_id, result in results)
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(records)
    logging.info(f"Results saved to {file_path}")

def list_tables(db_path):
    """Return the names of all tables in the SQLite database at ``db_path``.

    Errors are logged rather than raised (best-effort lookup).

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A list of table names, or an empty list if the lookup failed.
    """
    from contextlib import closing  # function-scope import: keeps this block self-contained

    try:
        # A sqlite3 connection used directly as a context manager only
        # commits/rolls back the transaction; it does not close the
        # connection. closing() releases the handle when the block exits.
        with closing(sqlite3.connect(db_path)) as conn:
            rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
        return [row[0] for row in rows]
    except sqlite3.Error as e:
        logging.error(f"Database error: {e}")
    except Exception as e:
        logging.error(f"Error: {e}")
    return []

def extract_document_texts(db_path):
    """Return the ``document_text`` value of every row in the ``filings`` table.

    Errors (for example a missing table) are logged rather than raised.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A list with one ``document_text`` value per row, or an empty list
        if the query failed or the table has no rows.
    """
    from contextlib import closing  # function-scope import: keeps this block self-contained

    try:
        # A sqlite3 connection used directly as a context manager only
        # commits/rolls back the transaction; it does not close the
        # connection. closing() releases the handle when the block exits.
        with closing(sqlite3.connect(db_path)) as conn:
            rows = conn.execute("SELECT document_text FROM filings").fetchall()
        return [row[0] for row in rows]
    except sqlite3.Error as e:
        logging.error(f"Database error: {e}")
    except Exception as e:
        logging.error(f"Error: {e}")
    return []

def process_document_chunks(db_path, chunk_size=2048, max_documents=1, max_chunks=30):
    """Split stored documents into fixed-size chunks and query the model on each.

    Documents are read with ``extract_document_texts``; each selected
    document is cut into consecutive slices of ``chunk_size`` characters
    and every selected slice is passed to ``query_chunk``.

    The defaults reproduce the previous hard-coded behaviour: only the
    first document and its first 30 chunks of 2048 characters are processed.

    Args:
        db_path: Path to the SQLite database file.
        chunk_size: Number of characters per chunk.
        max_documents: How many documents to process from the start of the
            list; ``None`` processes all of them.
        max_chunks: How many chunks to process per document; ``None``
            processes all of them.

    Returns:
        A list of ``(chunk_id, result)`` tuples. ``chunk_id`` is the
        0-based chunk index within its document, so ids restart at 0 for
        every document when more than one is processed.
    """
    document_texts = extract_document_texts(db_path)

    if not document_texts:
        logging.info("No documents found in the 'filings' table.")
        return []

    results = []

    for document_text in tqdm(document_texts[:max_documents], desc="Processing documents"):
        if not document_text:
            # NULL or empty document_text: there is nothing to chunk.
            continue

        # Slicing the range limits the work up front, so only the chunks
        # that will actually be queried are ever materialised.
        starts = range(0, len(document_text), chunk_size)[:max_chunks]
        chunks = (document_text[start:start + chunk_size] for start in starts)

        for chunk_id, chunk in tqdm(enumerate(chunks), total=len(starts), desc="Processing chunks"):
            result = query_chunk(chunk)
            results.append((chunk_id, result))
            logging.info(f"Processed chunk {chunk_id}")

    return results

# Specify the path to the database
db_path = r'C:\Users\avani\Desktop\Thesis\oilandgas\OXY\test3_OXY_10-K.db'

# Verify the database file exists
if not os.path.exists(db_path):
    logging.error(f"Database file not found: {db_path}")
else:
    logging.info(f"Database file found: {db_path}")

    # List all tables in the database to verify the 'filings' table exists
    tables = list_tables(db_path)
    print("Tables in the database:", tables)

    if 'filings' not in tables:
        # Report the problem instead of ending silently with no output file.
        logging.error(f"Table 'filings' not found in {db_path}; nothing to process.")
    else:
        # Process the chunks and query for 'assets'
        results = process_document_chunks(db_path)

        # Save results to the file
        output_file_path = r'C:\Users\avani\Desktop\Thesis\oilandgas\OXY\OXY_10K_test.txt'
        save_results_to_file(output_file_path, results)

        # Print the generated response
        if results:
            logging.info("Information about 'assets', 'locations', and 'ownerships' found in the following chunks:")
            for chunk_id, result in results:
                # Use the same 0-based id as the saved file ("Chunk ID: N")
                # and the per-chunk progress log so entries can be matched up.
                logging.info(f"Chunk {chunk_id}:\n{result}")

# Report the wall-clock time measured from start_time, which is set during setup.
logging.info(f"Total runtime: {time.time() - start_time:.2f} seconds")


2024-07-21 21:20:08,594 - INFO - Database file found: C:\Users\avani\Desktop\Thesis\oilandgas\OXY\test3_OXY_10-K.db


Tables in the database: ['filings', 'financial_statements', 'management_discussions']


Processing documents:   0%|                                                                          | 0/1 [00:00<?, ?it/s]
Processing chunks:   0%|                                                                            | 0/30 [00:00<?, ?it/s][A

----------------------------------------------------------------------------------------------------
Prompt: You are a virtual assistant with advanced expertise in a broad spectrum of topics, equipped to utilize high-level critical thinking, cognitive skills, creativity, and innovation.
Your goal is to deliver the most straightforward and accurate answer possible for each question, ensuring high-quality and useful responses for the user.
A physical asset is a tangible resource that a company owns and uses in the production of goods and services. Ensure that a geographical location or region is never considered as an asset.
A financial asset or other non-physical asset should never be included as a physical asset. Examples of financial assets include equity commitments, corporate facilities, accounts receivable, and short-term investments. Never include these in the list of physical assets.
Here is an example:Text: [...] Our principal asset is the Grasberg mine, which we discovered in 1