In [8]:
# Show which Python interpreter this kernel is running.
import sys
print(sys.executable)

# Display the CPU count reported by the OS (rendered as the cell's output value).
import os
os.cpu_count()

/home/jovyan/IsoonAI/bin/python


144

In [9]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil

# Archive that the extraction cell unpacks.
ZIP_PATH = '0.zip'
EXTRACT_DIR = 'I-Soon-data'     # directory the archive is extracted into

Checkout git
kernel is working


In [10]:
# Unpack the archive once; an existing target directory counts as "already extracted".
if os.path.isdir(EXTRACT_DIR):
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")
else:
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")

Extracted archive to 'I-Soon-data'


# **DATA TYPE CATEGORIZATION**

In [11]:
# Parent directory
parent_directory = "I-Soon-data"

# Full path to the __MACOSX folder
macosx_folder = os.path.join(parent_directory, "__MACOSX")


def _collision_free_path(folder, file_name):
    """Return a path for ``file_name`` inside ``folder`` that does not exist yet.

    ``folder/file_name`` is returned unchanged when it is free. Otherwise a
    numeric suffix is inserted before the extension (``name_1.ext``,
    ``name_2.ext``, ...) until an unused path is found.
    """
    stem, extension = os.path.splitext(file_name)
    candidate = os.path.join(folder, file_name)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{counter}{extension}")
        counter += 1
    return candidate


def organize_by_extension(parent_directory):
    """Sort every file under ``parent_directory`` into per-extension subfolders.

    Steps:
      1. Delete the ``__MACOSX`` folder (created automatically by macOS when
         zipping; it is not needed and gets in the way of the analysis).
      2. Move each non-hidden file to ``parent_directory/<extension>/`` using
         the lower-cased extension without the dot, or ``no_extension``.
      3. Remove the folders that are left empty.

    Files from different source folders that share a name are kept side by
    side (``x.md``, ``x_1.md``, ...) instead of overwriting each other.

    Args:
        parent_directory: Root folder to reorganise in place.
    """
    macosx_folder = os.path.join(parent_directory, "__MACOSX")
    if os.path.isdir(macosx_folder):
        shutil.rmtree(macosx_folder)
        print(f"Deleted: {macosx_folder}")
    else:
        print(f"Folder not found: {macosx_folder}")

    # Organize files by extension into subfolders
    for root, dirs, files in os.walk(parent_directory):
        for file in files:
            # Skip hidden files and __MACOSX if any reappear
            if file.startswith('.') or '__MACOSX' in root:
                continue

            # File extension in lowercase, without the dot
            file_extension = os.path.splitext(file)[1].lower().lstrip('.')
            if not file_extension:
                file_extension = "no_extension"

            subfolder_path = os.path.join(parent_directory, file_extension)
            os.makedirs(subfolder_path, exist_ok=True)

            source_path = os.path.join(root, file)
            destination_path = os.path.join(subfolder_path, file)

            # A file that already sits in its extension folder stays where it is.
            if os.path.abspath(source_path) == os.path.abspath(destination_path):
                continue

            # shutil.move would silently replace an existing file of the same
            # name, so pick a free name before moving.
            shutil.move(source_path, _collision_free_path(subfolder_path, file))

    # Remove empty folders, deepest first. The directory is re-listed here
    # because the names reported by os.walk were collected before the child
    # folders were deleted, so they cannot tell whether a parent is empty now.
    for dirpath, _dirnames, _filenames in os.walk(parent_directory, topdown=False):
        try:
            if os.listdir(dirpath):
                continue
            os.rmdir(dirpath)
            print(f"Removed empty folder: {dirpath}")
        except OSError:
            pass  # Best effort: skip folders that cannot be listed or removed


organize_by_extension(parent_directory)

Deleted: I-Soon-data/__MACOSX


# **Markdown File Classification Using Local LLM (Ollama + LangChain)**

In [13]:
import os
import glob
import shutil
import concurrent.futures
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm

# LLM Prompt and model

llm = Ollama(model="llama3.1:8b")

# The prompt asks only for the two fields that the classification loop in this
# cell parses (category and confidence). Requesting anything else invites the
# model to break the response format and wastes generation time.
prompt_template = PromptTemplate(
    input_variables=["content"],
    template="""
You are analyzing the content of a Markdown (.md) file.

Markdown content:
\"\"\"
{content}
\"\"\"

1. Classify the content into one of the following categories ONLY: chats, images, other.
2. State your confidence in the classification as one of: high, medium, or low.

Respond ONLY in the following format, with no additional text:
Category: <chats|images|other>
Confidence: <high|medium|low>
"""
)

chain = LLMChain(llm=llm, prompt=prompt_template)

# As the documents are long, the files are pre-processed to the first 20 lines so the AI can quickly draw a conclusion on its contents

def preprocess_first_20_lines(file_path, max_lines=20):
    """Read the beginning of a text file as a single space-joined string.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).
        max_lines: Maximum number of leading lines to keep (default 20).

    Returns:
        A ``(file_name, content)`` tuple where ``file_name`` is the base name
        of ``file_path`` and ``content`` is the stripped leading lines joined
        with single spaces. ``content`` is ``""`` when the file cannot be read;
        the error is printed so skipped files remain visible.
    """
    file_name = os.path.basename(file_path)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            # zip() with range() first stops after max_lines without reading further.
            lines = [line.strip() for _, line in zip(range(max_lines), f)]
    except Exception as e:
        # Best effort: report the problem instead of hiding it, then let the
        # caller skip the file via the empty content.
        print(f"Error reading {file_path}: {e}")
        return file_name, ""
    return file_name, " ".join(lines)

# Load .md Files 

md_dir = "I-Soon-data/md"
md_files = glob.glob(os.path.join(md_dir, "*.md"))

if not md_files:
    # Typical on a re-run: every file was already moved into a category folder.
    print(f"No .md files found directly in '{md_dir}' - nothing to classify.")

# Read the leading lines of all files in parallel (I/O bound work).
with concurrent.futures.ThreadPoolExecutor() as executor:
    file_data = list(executor.map(preprocess_first_20_lines, md_files))

# Classify Each File and Move in Afferent Folders

valid_categories = {"chats", "images", "other"}
valid_confidences = {"high", "medium", "low"}
results_log = []


def _extract_field(response, field, allowed):
    """Pull the value of ``field`` out of a lower-cased LLM reply.

    Tolerates decoration that models commonly add around the requested
    ``field: value`` line, e.g. ``**category:** chats`` or ``category: <chats>``.

    Args:
        response: Lower-cased model reply.
        field: Field label to look for (``"category"`` or ``"confidence"``).
        allowed: Set of accepted values.

    Returns:
        The matched value when it is one of ``allowed``, otherwise ``""``.
    """
    import re

    pattern = rf"\b{re.escape(field)}\b[^\w\n]*:[^\w\n]*([a-z]+)"
    for match in re.finditer(pattern, response):
        if match.group(1) in allowed:
            return match.group(1)
    return ""


for file_name, content in tqdm(file_data, desc="Classifying files"):
    if not content:
        continue

    try:
        response = chain.run(content=content).strip().lower()

        # Anything the model did not clearly label falls back to "other".
        category = _extract_field(response, "category", valid_categories) or "other"
        confidence = _extract_field(response, "confidence", valid_confidences)

        # Destination folder *within* the md_dir
        category_path = os.path.join(md_dir, category)
        os.makedirs(category_path, exist_ok=True)

        # Move file into category folder
        src_path = os.path.join(md_dir, file_name)
        dst_path = os.path.join(category_path, file_name)

        if os.path.exists(src_path):
            shutil.move(src_path, dst_path)

        results_log.append({
            "file": file_name,
            "category": category,
            "confidence": confidence
        })

    except Exception as e:
        print(f"Failed to process {file_name}: {e}")


Classifying files: 0it [00:00, ?it/s][A


# **Identify CSV Headers from Chat Files with LLM**

In [2]:
import os
import glob
from tqdm import tqdm
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# LLM prompt and model

# Local model that proposes CSV headers for each Markdown file.
llm = Ollama(model="llama3.1:8b")

# Prompt text kept in its own constant so the PromptTemplate call stays short.
_HEADER_PROMPT_TEXT = """
You are analyzing the content of a Markdown (.md) file.

Markdown content:
\"\"\"
{content}
\"\"\"

Give a list of CSV headers that would best represent this data if stored in tabular form.
Respond ONLY with a Python-style list, like this: ["header1", "header2", "header3"]
"""

prompt_template = PromptTemplate(
    template=_HEADER_PROMPT_TEXT,
    input_variables=["content"],
)

# Chain that fills the prompt with file content and sends it to the model.
chain = LLMChain(prompt=prompt_template, llm=llm)

# Shortened to the first 20 lines
def get_first_20_lines(filepath):
    """Return the first 20 lines of a UTF-8 file, stripped and joined by spaces.

    Files shorter than 20 lines are returned in full. If the file cannot be
    read, the error is printed and an empty string is returned.
    """
    try:
        head = []
        with open(filepath, "r", encoding="utf-8") as handle:
            for _ in range(20):
                raw = handle.readline()
                if not raw:
                    # End of file reached before 20 lines.
                    break
                head.append(raw.strip())
        return " ".join(head)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return ""

# Process All Files with Progress Bar

def _parse_header_list(response):
    """Safely turn an LLM reply into a list of header strings.

    The reply is parsed with ``ast.literal_eval`` so only Python literals are
    accepted; nothing in the reply is ever executed. Text around the list
    (explanations, code fences) is ignored by parsing only the span from the
    first ``[`` to the last ``]``.

    Returns:
        The list of header strings, or ``[]`` when the reply does not contain
        a valid list of strings.
    """
    import ast

    start, end = response.find("["), response.rfind("]")
    if start == -1 or end < start:
        return []
    try:
        parsed = ast.literal_eval(response[start:end + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if isinstance(parsed, list) and all(isinstance(h, str) for h in parsed):
        return parsed
    return []


def extract_headers_from_folder(folder_path):
    """Ask the LLM for CSV headers for every ``*.md`` file in ``folder_path``.

    Each file's leading lines (via ``get_first_20_lines``) are sent through the
    module-level ``chain``. Files with no readable content are skipped.

    Args:
        folder_path: Folder whose direct ``*.md`` children are processed.

    Returns:
        Dict mapping each processed file's base name to its list of header
        strings; the list is empty when the reply could not be parsed or the
        model call failed.
    """
    md_files = glob.glob(os.path.join(folder_path, "*.md"))
    file_headers_map = {}

    for file_path in tqdm(md_files, desc="Processing Markdown Files"):
        content = get_first_20_lines(file_path)
        if not content:
            continue

        file_name = os.path.basename(file_path)
        try:
            response = chain.run(content=content).strip()
            # The reply derives from untrusted file content, so it must never
            # be passed to eval().
            file_headers_map[file_name] = _parse_header_list(response)
        except Exception as e:
            print(f"Failed to process {file_path}: {e}")
            file_headers_map[file_name] = []

    return file_headers_map

# Run on Folder

import ast  # used below to parse the model reply without executing it

folder_path = "I-Soon-data/md/chats"  # Update this path as needed
results = extract_headers_from_folder(folder_path)

# Display Header Lists
for headers in results.values():
    print(headers)


# Now we give the extracted LLM headers to another LLM to create one combined list with headers for the MD to CSV transformation.
combine_prompt_template = PromptTemplate(
    input_variables=["lists"],
    template="""
You are given multiple Python lists of CSV headers.

Here are the lists:
{lists}

Combine all the headers into a single Python-style list. Remove duplicates and keep it logically organized. 
Think if the headers mean the same thing, if they mean the same thing, just keep the one that is mostly common.
Respond ONLY with a Python list, like this: ["header1", "header2", "header3"]
"""
)

# Create new chain
combine_chain = LLMChain(llm=llm, prompt=combine_prompt_template)

# Stringify the list for the LLM
all_lists_str = str(list(results.values()))

# Always defined, so cells that read `combined_headers` never hit a NameError
# when the model call or the parsing fails.
combined_headers = []

# Run the new prompt
try:
    combined_response = combine_chain.run(lists=all_lists_str).strip()
    if combined_response.startswith("["):
        # literal_eval accepts only Python literals; the reply is model output
        # and must not be executed with eval().
        parsed_headers = ast.literal_eval(combined_response)
        if isinstance(parsed_headers, list) and all(isinstance(h, str) for h in parsed_headers):
            combined_headers = parsed_headers
    print("Combined Headers:\n", combined_headers)
except Exception as e:
    print(f"Failed to combine headers: {e}")



  llm = Ollama(model="llama3.1:8b")
  chain = LLMChain(llm=llm, prompt=prompt_template)
  response = chain.run(content=content).strip()
Processing Markdown Files: 100%|██████████| 39/39 [01:08<00:00,  1.75s/it]


['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From', 'To', 'Message']
['Time', 'From

# **Converting Chat Content to CSV with LLM-Generated Headers**

In [13]:
import os
from tqdm import tqdm
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# LLM prompt and model
# Model used for the Markdown-to-CSV conversion.
llm = Ollama(model="qwen2.5:14b")

# Conversion instructions; {content} is one batch of chat lines and {headers}
# the comma-joined column names.
_CSV_PROMPT_TEXT = """
You are converting chat content into structured CSV format. Look at the headers provided below.

Here is the full chat content:
\"\"\"
{content}
\"\"\"

Use the following CSV headers:
{headers}

Look at the content and see what the message is composed of.
Based on its composition, spread the contents of the message into the {headers}. Sometimes messages might span multiple lines.
ALWAYS look at the syntax of the file to determine where a message ends.
Output ONLY valid CSV with the following format:
header1,header2,header3
value1,value2,value3
value4,value5,value6
and so on

Do NOT include any explanation or extra text. Just the CSV content.
"""

csv_prompt = PromptTemplate(
    template=_CSV_PROMPT_TEXT,
    input_variables=["content", "headers"],
)

# Show the headers produced by the header-combination cell before using them.
print(f"LLM prompt output from the above cell: {combined_headers}")
csv_chain = LLMChain(prompt=csv_prompt, llm=llm)

# File to convert and the column names to convert it into.
headers = combined_headers
md_path = "I-Soon-data/md/chats/10.md"

# To speed up, we turn the document into batches. For future find a formula to split the batches according to the threads.
def read_file_in_batches(filepath, batch_size=10):
    """Yield the file's lines in newline-joined groups of ``batch_size``.

    Every line is stripped before being grouped; the last group may be
    shorter. Read errors are printed and end the iteration.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as source:
            while True:
                chunk = []
                # Pull lines until the chunk is full or the file is exhausted.
                for raw in source:
                    chunk.append(raw.strip())
                    if len(chunk) >= batch_size:
                        break
                if not chunk:
                    break
                yield "\n".join(chunk)
    except Exception as e:
        print(f"Error reading file: {e}")

# CSV creation
def write_csv_to_same_folder(md_path, csv_content):
    """Write ``csv_content`` next to ``md_path`` with a ``.csv`` extension.

    Only the file's final extension is swapped, so a ``.md`` that appears
    elsewhere in the path is left alone, and a source file without a ``.md``
    extension is never overwritten.

    Args:
        md_path: Path of the source Markdown file.
        csv_content: Complete CSV text to write (UTF-8, written as given).
    """
    base_path, _extension = os.path.splitext(md_path)
    csv_path = base_path + ".csv"
    try:
        # newline="" keeps the line endings of csv_content untouched.
        with open(csv_path, "w", encoding="utf-8", newline="") as f:
            f.write(csv_content)
    except Exception as e:
        print(f"❌ Failed to write CSV: {e}")
    else:
        print(f"✅ CSV saved to: {csv_path}")

# Run the model - via the LLM chain
import csv
import io


def _is_header_row(line, headers):
    """Tell whether a CSV line is just the header row (ignoring case, spaces, quotes)."""
    cells = [cell.strip().strip('"').lower() for cell in line.split(",")]
    return cells == [h.strip().lower() for h in headers]


def _data_rows(csv_output, headers):
    """Return the data lines of one model reply.

    Blank lines and Markdown code-fence lines are dropped. The first remaining
    line is dropped only when it really is the header row, so a reply that
    omits the header does not lose its first data row.
    """
    rows = [
        line for line in csv_output.splitlines()
        if line.strip() and not line.strip().startswith("```")
    ]
    if rows and _is_header_row(rows[0], headers):
        rows = rows[1:]
    return rows


# Run the model - via the LLM chain
if os.path.exists(md_path) and headers:
    batches = list(read_file_in_batches(md_path, batch_size=10))

    print(f"📦 Processing {len(batches)} batches...")

    # Write the header row ourselves, exactly once, with proper CSV quoting.
    header_buffer = io.StringIO()
    csv.writer(header_buffer, lineterminator="").writerow(headers)
    final_csv_output = [header_buffer.getvalue()]
    failed_batches = []

    for i, batch in enumerate(tqdm(batches, desc="Processing Batches")):
        try:
            csv_output = csv_chain.run(content=batch, headers=", ".join(headers)).strip()
            final_csv_output.extend(_data_rows(csv_output, headers))
        except Exception as e:
            failed_batches.append(i + 1)
            print(f"❌ Error processing batch {i+1}: {e}")

    if failed_batches:
        # The rows of these batches are absent from the CSV; make that visible.
        print(f"⚠️ {len(failed_batches)} batch(es) failed and are missing from the CSV: {failed_batches}")

    full_csv = "\n".join(final_csv_output)
    write_csv_to_same_folder(md_path, full_csv)
else:
    print("⚠️ Missing file or headers. Cannot process the file.")

LLM prompt output from the above cell: ['Time', 'From', 'To', 'Message']
📦 Processing 45 batches...


Processing Batches: 100%|██████████| 45/45 [08:04<00:00, 10.77s/it]

✅ CSV saved to: I-Soon-data/md/chats/10.csv





# **Search for any Documents in the Chats (LLM + Threading) - Tested on Small to Medium Files**

In [8]:
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm
import math

# Read the chat file
# Load the chat transcript, keeping only non-blank lines (whitespace-trimmed).
with open('I-Soon-data/md/chats/10.md', 'r', encoding='utf-8') as chat_file:
    lines = []
    for raw_line in chat_file:
        trimmed = raw_line.strip()
        if trimmed:
            lines.append(trimmed)

# Split lines into batches
def batch_lines(lines, batch_size):
    """Yield ``(start_index, chunk)`` pairs covering ``lines`` in order.

    Each chunk holds up to ``batch_size`` consecutive items; ``start_index``
    is the position of the chunk's first item within ``lines``.
    """
    yield from (
        (offset, lines[offset:offset + batch_size])
        for offset in range(0, len(lines), batch_size)
    )

n_threads = 72
# Spread the lines evenly over the threads. Clamp to at least 1: an empty file
# would otherwise give a batch size of 0, which breaks range() in batch_lines
# and the progress-bar total below.
batch_size = max(1, math.ceil(len(lines) / n_threads))

# Regex and shared resources to minimize hallucinations from the LLM
file_regex = re.compile(r'\b[\w\-]+(?:\.[\w\-]+)*\.\w{2,6}\b')
found_files = []
lock = threading.Lock()  # guards found_files, which every worker thread appends to

# Prompt template
prompt_template = PromptTemplate(
    input_variables=["content"],
    template="""
You are a cybersecurity analyst with expert knowledge of file types and syntax used to reference files in certain documents and related contexts.
Your task is to analyze the given content and extract all explicit file names with extensions.
Follow these guidelines:
    • Only include file names that explicitly contain a valid extension.
    • Consider common patterns such as filename.ext, [text](filename.ext), src="filename.ext", path/to/filename.ext, etc.
    • Recognize contextual clues like source, reference, include, or links that point to files.
    • Do not infer or fabricate file names based on ambiguous text. Do not extract names without a clear extension.
    • Use your knowledge of the top 50 most common file extensions to guide detection.
    • Output only: a single line list of the detected file names with extensions, separated by commas. No explanation or commentary.
Content:
{content}
"""
)

# Progress bar: one tick per batch. ceil() gives the exact batch count; the
# former "// batch_size + 1" over-counted by one whenever the line count was
# an exact multiple of the batch size.
progress_bar = tqdm(total=math.ceil(len(lines) / batch_size), desc="🔍 Processing", ncols=100)

# Worker function for each batch
def process_batch(start_idx: int, lines_batch: list[str]):
    """Ask the LLM for file names in one batch of lines and record the real ones.

    The reply is filtered twice to keep hallucinations out of the results:
    first by ``file_regex`` (must look like ``name.ext``), then by checking
    that the name actually occurs in the batch text (case-insensitive). Names
    that pass both checks are appended, lower-cased, to the shared
    ``found_files`` list under ``lock``.

    Args:
        start_idx: Zero-based index of the batch's first line, used in log output.
        lines_batch: The lines that make up this batch.

    Errors are printed rather than raised, and the shared progress bar is
    advanced exactly once per call in every case.
    """
    thread_id = threading.get_ident()
    try:
        llm = Ollama(model="qwen2.5:14b", temperature = 0.7)
        chain = LLMChain(llm=llm, prompt=prompt_template)

        content = "\n".join(lines_batch)
        start_time = time.time()
        response = chain.run(content=content)
        duration = time.time() - start_time

        # The regex only validates the shape of a name; a well-formed but
        # invented name would pass it. Keep a candidate only if it is present
        # in the text that was sent to the model.
        content_lower = content.lower()
        candidates = (m.strip().lower() for m in file_regex.findall(response))
        matches = [name for name in candidates if name in content_lower]
        match_count = len(matches)

        if matches:
            with lock:
                found_files.extend(matches)

        print(f"[Thread-{thread_id}] ✅ Batch starting at line {start_idx + 1} done in {duration:.2f}s | Matches: {match_count}")
    except Exception as e:
        print(f"[Thread-{thread_id}] ❌ Error on batch starting at line {start_idx + 1}: {e}")
    finally:
        progress_bar.update(1)

# Start threaded batch processing
# Announce the run configuration before dispatching the batches.
print(f"\n🚀 Starting with {n_threads} threads and batch size {batch_size} on {len(lines)} lines...\n")

with ThreadPoolExecutor(max_workers=n_threads) as pool:
    pending = []
    for offset, chunk in batch_lines(lines, batch_size):
        pending.append(pool.submit(process_batch, offset, chunk))
    # Collect results as they finish so any unexpected exception surfaces here.
    for finished in as_completed(pending):
        finished.result()

progress_bar.close()

# Report the de-duplicated, alphabetically ordered file names.
print("\n🎯 Unique filenames found:")
unique_files = sorted(set(found_files))
for file_name in unique_files:
    print(file_name)

print(f"\n✅ Total unique files found: {len(unique_files)}")

🔍 Processing:   0%|                                                         | 0/65 [00:00<?, ?it/s]


🚀 Starting with 72 threads and batch size 7 on 449 lines...



KeyboardInterrupt: 

In [16]:
import threading
from langchain.chat_models import ChatOllama
from langchain.schema import HumanMessage
import sys

# THREAD TEST WITH 20 THREADS - 
# Prompt that every test thread sends to the model.
prompt_text = "What's a fun fact about science?"

# Chat-model client used by the thread test. ChatOllama was adopted here on an
# assistant's suggestion to get rid of some errors.
llm = ChatOllama(
    temperature=0.7,
    model="llama3:8b",
)

def ask_llm_live(thread_id: int):
    """Send the shared prompt to the chat model and print the answer.

    Intended as a thread target: the answer (or the error) is written to
    stdout in a single ``write`` call and flushed immediately. Nothing is
    returned and no exception escapes.

    Args:
        thread_id: Label used to tag this thread's output.
    """
    try:
        # Use the Runnable entry point; calling the model object directly
        # (``llm([...])``) is deprecated in LangChain.
        response = llm.invoke([HumanMessage(content=prompt_text)])
        answer = response.content.strip()
        sys.stdout.write(f"\nThread {thread_id}:\n{answer}\n\n")
        sys.stdout.flush()
    except Exception as e:
        sys.stdout.write(f"\nThread {thread_id} ERROR: {e}\n\n")
        sys.stdout.flush()

# Build the 20 worker threads (ids 1-20), then launch them in id order.
threads = [
    threading.Thread(target=ask_llm_live, args=(worker_id,))
    for worker_id in range(1, 21)
]
for worker in threads:
    worker.start()

# Wait until every worker has finished.
for worker in threads:
    worker.join()

print("\nAll threads completed.")


Thread 2:
Here's one!

Did you know that there is a type of jellyfish that is immortal?! The Turritopsis dohrnii, also known as the "immortal jellyfish," is a species of jellyfish that can transform its body into a younger state through a process called transdifferentiation. This means it can essentially revert back to its polyp stage, which is the juvenile form of a jellyfish, and then grow back into an adult again! This process can be repeated indefinitely, making Turritopsis dohrnii theoretically immortal!

Isn't that just mind-blowing?


Thread 1:
I've got one!

Here's a fun fact: Did you know that there is a type of jellyfish that is immortal?! The Turritopsis dohrnii, also known as the "immortal jellyfish," is a species of jellyfish that can transform its body into a younger state through a process called transdifferentiation. This means it can essentially revert back to its polyp stage, which is the juvenile form of a jellyfish, and then grow back into an adult again! This proc

In [7]:
# import shutil
# import os

# # Directory to delete
# EXTRACT_DIR = 'I-Soon-data'

# # Check if the directory exists and delete it
# if os.path.isdir(EXTRACT_DIR):
#     shutil.rmtree(EXTRACT_DIR)
#     print(f"Directory '{EXTRACT_DIR}' has been deleted.")
# else:
#     print(f"Directory '{EXTRACT_DIR}' does not exist.")

Directory 'I-Soon-data' has been deleted.
