# **IMPORTS & PATHS** 

In [None]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil


# Paths and model settings
# NOTE(review): the original comment here was garbled and appears to contain a stale
# alternative from an earlier revision: ZIP_PATH = 'I-Soon-data.zip' — confirm which archive is intended.
ZIP_PATH = '0.zip'    # Path to the downloaded zip archive that gets extracted
EXTRACT_DIR = 'I-Soon-data'        # Directory to extract contents

# **ZIP FILE EXTRACTION**

In [67]:
# Unpack the archive only when the target directory is missing; an existing
# directory is taken as "already extracted" and left untouched.
if os.path.isdir(EXTRACT_DIR):
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")
else:
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")

Extracted archive to 'I-Soon-data'


# **DATA TYPE CATEGORIZATION**

In [68]:
# Flatten the extracted tree into one subfolder per file extension
# (e.g. I-Soon-data/md, I-Soon-data/png), then prune folders left empty.

# Define the parent directory
parent_directory = "I-Soon-data"

# Full path to the __MACOSX folder
macosx_folder = os.path.join(parent_directory, "__MACOSX")

# Remove __MACOSX if present (isdir() is already False for a missing path)
if os.path.isdir(macosx_folder):
    shutil.rmtree(macosx_folder)
    print(f"Deleted: {macosx_folder}")
else:
    print(f"Folder not found: {macosx_folder}")

# Organize files by extension into subfolders
for root, dirs, files in os.walk(parent_directory):
    for file in files:
        # Skip hidden files and __MACOSX if any reappear
        if file.startswith('.') or '__MACOSX' in root:
            continue

        # Extension in lowercase without the dot; files without one share a bucket
        file_extension = os.path.splitext(file)[1].lower().lstrip('.') or "no_extension"

        # Create the per-extension subfolder if it doesn't exist
        subfolder_path = os.path.join(parent_directory, file_extension)
        os.makedirs(subfolder_path, exist_ok=True)

        # Define source and destination paths
        source_path = os.path.join(root, file)
        destination_path = os.path.join(subfolder_path, file)

        # Already in its extension folder: nothing to do
        if os.path.abspath(source_path) == os.path.abspath(destination_path):
            continue

        # Two source folders may hold files with the same name. shutil.move would
        # silently overwrite the earlier one (POSIX rename semantics), so pick a
        # free "<stem>_<n><ext>" name instead of losing data.
        if os.path.exists(destination_path):
            stem, suffix = os.path.splitext(file)
            counter = 1
            while os.path.exists(destination_path):
                destination_path = os.path.join(subfolder_path, f"{stem}_{counter}{suffix}")
                counter += 1
            print(f"Name collision for {source_path}; stored as {destination_path}")

        shutil.move(source_path, destination_path)

# Remove any empty folders within the parent directory.
# Walking bottom-up visits children first; the dirnames/filenames lists yielded by
# os.walk were captured before files were moved and children deleted, so the live
# directory listing is checked instead to catch parents that have just become empty.
for dirpath, dirnames, filenames in os.walk(parent_directory, topdown=False):
    try:
        if not os.listdir(dirpath):
            os.rmdir(dirpath)
            print(f"Removed empty folder: {dirpath}")
    except OSError as err:
        # Best effort: report and carry on (e.g. permissions, concurrent changes)
        print(f"Could not remove folder {dirpath}: {err}")

Deleted: I-Soon-data/__MACOSX


# **Markdown File Classification Using Local LLM (Ollama + LangChain)**

In [69]:
import os
import glob
import shutil
import concurrent.futures
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm

# === Step 1: Set up LLM and Prompt ===

# Prompt text: asks for a one-sentence summary, one of three categories
# (chats / images / other) and a confidence level, in a fixed three-line format.
stage1_prompt_text = """
You are analyzing the content of a Markdown (.md) file.

Markdown content:
\"\"\"
{content}
\"\"\"

1. Summarize the file in one short and precise sentence.
2. Classify it into one of the following categories ONLY: chats, images, other.
3. State your confidence in the classification as one of: high, medium, or low.

Respond in the following format:
Summary: <your summary>
Category: <chats|images|other>
Confidence: <high|medium|low>
"""

# Local model served through Ollama (e.g., llama3.2); replace with the model of your choice
llm = Ollama(model="llama3.2")

# The only placeholder in the prompt is {content}
prompt_template = PromptTemplate(
    template=stage1_prompt_text,
    input_variables=["content"],
)

# Chain that fills the prompt and sends it to the LLM
chain = LLMChain(prompt=prompt_template, llm=llm)

# === Step 2: Preprocessing Function ===

# Reads the first 20 lines of a Markdown file and joins them into a single string
def preprocess_first_20_lines(file_path):
    """Return ``(base name, text)`` for the head of a UTF-8 text file.

    The text is the first 20 lines (fewer if the file is shorter), each
    stripped of surrounding whitespace and joined with single spaces.
    If the file cannot be opened or decoded, the text is an empty string.
    """
    base_name = os.path.basename(file_path)
    head = []
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            for _ in range(20):
                raw_line = handle.readline()
                if not raw_line:  # readline() returns "" only at end of file
                    break
                head.append(raw_line.strip())
    except Exception:
        return base_name, ""
    return base_name, " ".join(head)

# === Step 3: Load .md Files ===

# Folder holding the original Markdown files
md_dir = "I-Soon-data/md"

# Every *.md path directly inside md_dir
md_pattern = os.path.join(md_dir, "*.md")
md_files = glob.glob(md_pattern)

# Read the file heads on a thread pool; map() yields results in md_files order
with concurrent.futures.ThreadPoolExecutor() as pool:
    file_data = list(pool.map(preprocess_first_20_lines, md_files))

# === Step 4: Classify Each File with LLM ===

# Directory to store the filtered and organized files
filtered_dir = "Filtered-markdowns"
os.makedirs(filtered_dir, exist_ok=True)

# Allow only these categories for folder placement
valid_categories = {"chats", "images", "other"}

# This will store a structured log of the results
results_log = []


def _extract_field(lines, label):
    """Return the text following the first ``<label>:`` line, or "" if absent.

    The label is matched case-insensitively at the start of the stripped line and
    only that leading label is removed, so the value keeps its original casing
    and any later occurrence of the label text inside the value is untouched.
    """
    prefix = f"{label}:"
    for line in lines:
        stripped = line.strip()
        if stripped.lower().startswith(prefix):
            return stripped[len(prefix):].strip()
    return ""


# Iterate over all files and classify each one
for file_name, content in tqdm(file_data, desc="Classifying files"):
    if not content:
        continue  # Skip empty files or failed reads

    try:
        # Run the LLM prompt with the extracted content
        response = chain.run(content=content).strip()
        lines = response.splitlines()

        # Parse the LLM's structured response. Only category/confidence are
        # lower-cased for comparison; the summary is stored as the model wrote it.
        summary = _extract_field(lines, "summary")
        category = _extract_field(lines, "category").lower().strip(" .,;*`'\"<>")
        confidence = _extract_field(lines, "confidence").lower().strip(" .,;*`'\"<>")

        # Normalize category (fallback to 'other' if invalid)
        if category not in valid_categories:
            category = "other"

        # Create subfolder for this category if it doesn't exist
        category_path = os.path.join(filtered_dir, category)
        os.makedirs(category_path, exist_ok=True)

        # Copy the original .md file to the appropriate category folder
        src_path = os.path.join(md_dir, file_name)
        dst_path = os.path.join(category_path, file_name)
        if os.path.exists(src_path):
            shutil.copy2(src_path, dst_path)

        # Append results to the log
        results_log.append({
            "file": file_name,
            "summary": summary,
            "category": category,
            "confidence": confidence
        })

    except Exception as e:
        # Per-file boundary: one failing LLM call or copy must not abort the whole run
        print(f"Failed to process {file_name}: {e}")

# === Optional Step: Save log as JSON or CSV ===
# Uncomment the below lines to save results for auditing

# import json
# with open("classification_results.json", "w", encoding="utf-8") as f:
#     json.dump(results_log, f, indent=2)

# import csv
# with open("classification_results.csv", "w", newline="", encoding="utf-8") as f:
#     writer = csv.DictWriter(f, fieldnames=["file", "summary", "category", "confidence"])
#     writer.writeheader()
#     writer.writerows(results_log)

Classifying files: 100%|██████████| 70/70 [01:24<00:00,  1.21s/it]


# **Stage 2: LLM-Based Categorization and File Organization by Description**

In [64]:
import os
import shutil
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import Ollama
from tqdm import tqdm

# Prompt text for the second pass: classify from a short description only and
# answer with exactly one of the three category words.
stage2_prompt_text = """
You are a strict content classifier.

Given the following short description of a Markdown (.md) file:

\"\"\"
{description}
\"\"\"

Classify the content into one of these categories only:
- chats
- images
- other

Return only one of those three exact words (in lowercase). Do not use synonyms or explanations. Do not make up new categories.
"""

# Second model, also served through Ollama
llm2 = Ollama(model="taozhiyuai/llama-3-refueled:q4_k_m")

# The only placeholder in the prompt is {description}
prompt_template_stage2 = PromptTemplate(
    template=stage2_prompt_text,
    input_variables=["description"],
)

# Chain that fills the stage-2 prompt and sends it to llm2
chain2 = LLMChain(prompt=prompt_template_stage2, llm=llm2)

# Directories
original_dir = "I-Soon-data/md"
filtered_dir = "Filtered-markdowns"
os.makedirs(filtered_dir, exist_ok=True)

# Allowed categories
valid_categories = {"chats", "images", "other"}

# Reclassify and copy files.
# NOTE(review): `results` is not defined in the cells visible here; this loop assumes
# it maps file name -> short description — confirm where it is built before running.
for file_name, description in tqdm(results.items()):
    try:
        # Tolerate decoration around the single-word answer (e.g. "Chats." or `chats`)
        answer = chain2.run(description=description)
        category = answer.strip().lower().strip(" .,:;*`'\"<>")
        if category not in valid_categories:
            category = "other"  # fallback to default

        # Create destination folder
        category_path = os.path.join(filtered_dir, category)
        os.makedirs(category_path, exist_ok=True)

        # Copy the file
        src_path = os.path.join(original_dir, file_name)
        dst_path = os.path.join(category_path, file_name)

        if os.path.exists(src_path):
            shutil.copy2(src_path, dst_path)

            # A file that changed category would otherwise stay in its old category
            # folder as well; drop those stale copies so each file lives in exactly
            # one category. The original under original_dir is never touched.
            for stale_category in valid_categories - {category}:
                stale_path = os.path.join(filtered_dir, stale_category, file_name)
                if os.path.isfile(stale_path):
                    os.remove(stale_path)

    except Exception as e:
        # Per-file boundary: one failing LLM call or copy must not abort the whole run
        print(f"Failed to classify or copy {file_name}: {e}")

100%|██████████| 70/70 [00:47<00:00,  1.47it/s]


# **Finding connections between the MD files - reduced size due to performance issues**

Idea: Have the LLM search the markdown files and look for any files linked outside the chats.

# *RegEx based identification of linked files within the chat*

In [112]:
import os
from collections import defaultdict

# Expects document_reference_dict = { "chat1.md": ["file1.png", "log.txt"], ... }

base_dir = "I-Soon-data"

# === Step 1: Index ALL files (with their full folder path) ===
# lower-cased filename -> every folder (relative to base_dir) that contains it
file_locations = defaultdict(list)

for current_dir, _, names in os.walk(base_dir):
    folder_rel = os.path.relpath(current_dir, base_dir)
    for name in names:
        file_locations[name.lower()].append(folder_rel)

print(f"📦 Indexed {len(file_locations)} unique filenames from I-Soon-data.\n")

# === Step 2: Compare extracted references to indexed files ===
detailed_matches = []

for chat_file, refs in document_reference_dict.items():
    for ref in refs:
        # .get() avoids creating empty entries in the defaultdict for unknown names
        hit_folders = file_locations.get(ref.lower())
        if not hit_folders:
            detailed_matches.append(f"{ref}: ❌ NOT FOUND (referenced in chat: {chat_file})")
            continue
        detailed_matches.extend(
            f"{ref}: found in folder: {folder} and in chat: {chat_file}"
            for folder in hit_folders
        )

# === Step 3: Display results ===
print("🔍 File Match Results:\n")
for entry in detailed_matches:
    print(entry)

📦 Indexed 578 unique filenames from I-Soon-data.

🔍 File Match Results:

7e198208-6603-438d-836d-d24a47670b8d.md: ❌ NOT FOUND (referenced in chat: 18.md)
0dd1a611-58dd-4012-abc4-076bd01ffae9.md: ❌ NOT FOUND (referenced in chat: 1.md)
1188a898-edb0-40f9-a05c-d59664095e3b.md: ❌ NOT FOUND (referenced in chat: 1.md)
1979c730-7ed1-4058-b8ad-a992784ff16e.md: ❌ NOT FOUND (referenced in chat: 1.md)
27ab5f96-629f-4a93-8682-4991ce1eca50.md: ❌ NOT FOUND (referenced in chat: 1.md)
28d70cf7-5244-4834-8169-d491781f260c.md: ❌ NOT FOUND (referenced in chat: 1.md)
2a92145d-eec2-4373-9db6-8ec19894037f.md: ❌ NOT FOUND (referenced in chat: 1.md)
2ae2a05b-eea1-4367-8d14-e4a34d19577c.md: ❌ NOT FOUND (referenced in chat: 1.md)
2ef9c906-8b3b-4b96-af3e-b1a4226245d5.md: ❌ NOT FOUND (referenced in chat: 1.md)
351180cd-d982-40a2-8881-8f954cae8fd4.md: ❌ NOT FOUND (referenced in chat: 1.md)
3e339adc-f10e-442c-b4e8-2a95a14caebb.md: ❌ NOT FOUND (referenced in chat: 1.md)
4093fa6f-a7fb-4a3a-8d68-d92b85036931.md: ❌ NOT

# *Search which files are in the leaked data*

In [110]:
import os

# Uses the reference dictionary from the previous step
# document_reference_dict = {"1.md": ["abc.png", "log1.log", ...], ...}

base_dir = "I-Soon-data"

# === Step 1: Dynamically collect all actual filenames from subfolders ===

# First-level subfolders of base_dir (e.g., log, png, txt, md)
folders_to_check = [
    entry for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
]

# Lower-cased names of everything listed directly inside those subfolders
actual_files = {
    name.lower()
    for subfolder in folders_to_check
    for name in os.listdir(os.path.join(base_dir, subfolder))
}

print(f"📁 Indexed {len(actual_files)} files from {len(folders_to_check)} folders: {folders_to_check}\n")

# === Step 2: Compare against referenced files from the markdowns ===
comparison_results = {}

for doc, refs in document_reference_dict.items():
    # Single pass over refs, sorting each one into its bucket
    buckets = {"found": [], "missing": []}
    for ref in refs:
        bucket = "found" if ref.lower() in actual_files else "missing"
        buckets[bucket].append(ref)
    comparison_results[doc] = buckets

# === Step 3: Display a few examples ===
preview = list(comparison_results.items())[:5]
for doc, result in preview:
    print(f"\n📄 {doc}")
    print(f"✅ Found: {result['found']}")
    print(f"❌ Missing: {result['missing']}")

📁 Indexed 578 files from 5 folders: ['txt', '0', 'png', 'md', 'log']


📄 18.md
✅ Found: []
❌ Missing: ['7e198208-6603-438d-836d-d24a47670b8d.md']

📄 1.md
✅ Found: []
❌ Missing: ['0dd1a611-58dd-4012-abc4-076bd01ffae9.md', '1188a898-edb0-40f9-a05c-d59664095e3b.md', '1979c730-7ed1-4058-b8ad-a992784ff16e.md', '27ab5f96-629f-4a93-8682-4991ce1eca50.md', '28d70cf7-5244-4834-8169-d491781f260c.md', '2a92145d-eec2-4373-9db6-8ec19894037f.md', '2ae2a05b-eea1-4367-8d14-e4a34d19577c.md', '2ef9c906-8b3b-4b96-af3e-b1a4226245d5.md', '351180cd-d982-40a2-8881-8f954cae8fd4.md', '3e339adc-f10e-442c-b4e8-2a95a14caebb.md', '4093fa6f-a7fb-4a3a-8d68-d92b85036931.md', '4887236f-21ed-49dd-9ed3-465875df3554.md', '4b32e3c0-0887-49b2-a164-aac04b1d3da4.md', '4f29810a-bff7-451a-bae8-9285cbfa0592.md', '5c84f2e2-414e-4716-bc96-0218b1bc964f.md', '5c91055f-f8d7-421a-9e67-82ca5fe67e86.md', '5ce4ba5d-9eb3-4056-831f-9542d7c9d72b.md', '5f6a537a-d9b3-48c5-a1d8-6307d8b78bdd.md', '660e4ce8-4b44-41eb-acc4-5a88475bab49.md', '671e