In [6]:
import sys
print(sys.executable)

import os
os.cpu_count()

/home/jovyan/IsoonAI/bin/python


144

In [7]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil


# Paths and model settings\ nZIP_PATH = 'I-Soon-data.zip'      # Path to your downloaded zip file
ZIP_PATH = '0.zip'    # Directory to extract contents
EXTRACT_DIR = 'I-Soon-data'        # Directory to extract contents
print("Checkout git")
print("kernel is working")

Checkout git
kernel is working


In [8]:
if not os.path.isdir(EXTRACT_DIR):
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")
else:
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")

Extracted archive to 'I-Soon-data'


# **DATA TYPE CATEGORIZATION**

In [9]:
# Define the parent directory
parent_directory = "I-Soon-data"

# Full path to the __MACOSX folder
macosx_folder = os.path.join(parent_directory, "__MACOSX")

# Check if __MACOSX exists and remove it
if os.path.exists(macosx_folder) and os.path.isdir(macosx_folder):
    shutil.rmtree(macosx_folder)
    print(f"Deleted: {macosx_folder}")
else:
    print(f"Folder not found: {macosx_folder}")

# Organize files by extension into subfolders
for root, dirs, files in os.walk(parent_directory):
    for file in files:
        # Skip hidden files and __MACOSX if any reappear
        if file.startswith('.') or '__MACOSX' in root:
            continue

        # Get the file extension (in lowercase, without the dot)
        file_extension = os.path.splitext(file)[1].lower().lstrip('.')
        if not file_extension:
            file_extension = "no_extension"

        # Define the new subfolder path
        subfolder_path = os.path.join(parent_directory, file_extension)

        # Create the subfolder if it doesn't exist
        os.makedirs(subfolder_path, exist_ok=True)

        # Define source and destination paths
        source_path = os.path.join(root, file)
        destination_path = os.path.join(subfolder_path, file)

        # Move the file if source and destination are not the same
        if os.path.abspath(source_path) != os.path.abspath(destination_path):
            shutil.move(source_path, destination_path)

# Remove any empty folders within the parent directory
for dirpath, dirnames, filenames in os.walk(parent_directory, topdown=False):
    if not dirnames and not filenames:
        try:
            os.rmdir(dirpath)
            print(f"Removed empty folder: {dirpath}")
        except OSError:
            pass  # Ignore errors (e.g., if directory is not empty due to permissions)

Deleted: I-Soon-data/__MACOSX


# **Markdown File Classification Using Local LLM (Ollama + LangChain)**

In [11]:
import os
import glob
import shutil
import concurrent.futures
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm

# === Step 1: Set up LLM and Prompt ===

llm = Ollama(model="llama3.1:8b")

prompt_template = PromptTemplate(
    input_variables=["content"],
    template="""
You are analyzing the content of a Markdown (.md) file.

Markdown content:
\"\"\"
{content}
\"\"\"

1. Classify the content into one of the following categories ONLY: chats, images, other.
2. State your confidence in the classification as one of: high, medium, or low.

Respond in the following format:
Category: <chats|images|other>
Confidence: <high|medium|low>
"""
)

chain = LLMChain(llm=llm, prompt=prompt_template)

# === Step 2: Preprocessing Function ===

def preprocess_first_20_lines(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = [line.strip() for _, line in zip(range(20), f)]
            content = " ".join(lines)
        return os.path.basename(file_path), content
    except Exception:
        return os.path.basename(file_path), ""

# === Step 3: Load .md Files ===

md_dir = "I-Soon-data/md"
md_files = glob.glob(os.path.join(md_dir, "*.md"))

with concurrent.futures.ThreadPoolExecutor() as executor:
    file_data = list(executor.map(preprocess_first_20_lines, md_files))

# === Step 4: Classify Each File and Move ===

valid_categories = {"chats", "images", "other"}
results_log = []

for file_name, content in tqdm(file_data, desc="Classifying files"):
    if not content:
        continue

    try:
        response = chain.run(content=content).strip().lower()
        lines = response.splitlines()

        category = next((line.replace("category:", "").strip() for line in lines if line.startswith("category:")), "")
        confidence = next((line.replace("confidence:", "").strip() for line in lines if line.startswith("confidence:")), "")

        if category not in valid_categories:
            category = "other"

        # Destination folder *within* the md_dir
        category_path = os.path.join(md_dir, category)
        os.makedirs(category_path, exist_ok=True)

        # Move file into category folder
        src_path = os.path.join(md_dir, file_name)
        dst_path = os.path.join(category_path, file_name)

        if os.path.exists(src_path):
            shutil.move(src_path, dst_path)

        results_log.append({
            "file": file_name,
            "category": category,
            "confidence": confidence
        })

    except Exception as e:
        print(f"Failed to process {file_name}: {e}")

# === Optional: Save results to file ===

# import json
# with open("classification_results.json", "w", encoding="utf-8") as f:
#     json.dump(results_log, f, indent=2)

# import csv
# with open("classification_results.csv", "w", newline="", encoding="utf-8") as f:
#     writer = csv.DictWriter(f, fieldnames=["file", "category", "confidence"])
#     writer.writeheader()
#     writer.writerows(results_log)

Classifying files: 100%|██████████| 70/70 [02:37<00:00,  2.24s/it]


# **Stage 2: LLM-Based Categorization and File Organization by Description**

In [12]:
# import os
# import shutil
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain
# from langchain.llms import Ollama
# from tqdm import tqdm

# # Initialize second LLM
# llm2 = Ollama(model="taozhiyuai/llama-3-refueled:q4_k_m")

# # Prompt for classification based on description
# prompt_template_stage2 = PromptTemplate(
#     input_variables=["description"],
#     template="""
# You are a strict content classifier.

# Given the following short description of a Markdown (.md) file:

# \"\"\"
# {description}
# \"\"\"

# Classify the content into one of these categories only:
# - chats
# - images
# - other

# Return only one of those three exact words (in lowercase). Do not use synonyms or explanations. Do not make up new categories.
# """
# )

# # Set up LangChain chain
# chain2 = LLMChain(llm=llm2, prompt=prompt_template_stage2)

# # Directories
# original_dir = "I-Soon-data/md"
# filtered_dir = "Filtered-markdowns"
# os.makedirs(filtered_dir, exist_ok=True)

# # Allowed categories
# valid_categories = {"chats", "images", "other"}

# # Reclassify and copy files
# for file_name, description in tqdm(results.items()):
#     try:
#         category = chain2.run(description=description).strip().lower()
#         if category not in valid_categories:
#             category = "other"  # fallback to default

#         # Create destination folder
#         category_path = os.path.join(filtered_dir, category)
#         os.makedirs(category_path, exist_ok=True)

#         # Copy the file
#         src_path = os.path.join(original_dir, file_name)
#         dst_path = os.path.join(category_path, file_name)

#         if os.path.exists(src_path):
#             shutil.copy2(src_path, dst_path)

#     except Exception as e:
#         print(f"Failed to classify or copy {file_name}: {e}")

# **Finding connetions between the MD files - reduced size due to performance issues**

Idea: Have the LLM search the markdown files and look for any files linked outside the chats.

# *RegEx based identification of linked files within the chat*

In [2]:
import os
import re
import ast
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Step 1: Initialize LLM
llm2 = Ollama(model="llama3.1:8b")

# Step 2: Define the prompt template
prompt_template_file_types = PromptTemplate(
    input_variables=["content"],
    template="""
List the top 50 file extensions most commonly found in cybersecurity leaked data, including data from breaches, ransomware leaks, and dark web dumps. THINK LIKE A CYBERSECURITY EXPERT.
Focus on file types that typically contain credentials, configurations, databases, personal data, internal documentation, archives, or images (e.g., screenshots of sensitive material). 

OUTPUT INSTRUCTIONS:
ONLY OUTPUT the extensions as a clean Python list format, like [<'file_extension'>, <'file_extension'>, etc.] 
Don't include "." and ALWAYS use "'" in the list. 
Do not include any explanations, comments, or extra text. 
JUST GIVE THE LIST.
"""
)

# Step 3: Run the chain to get extensions
chain = LLMChain(llm=llm2, prompt=prompt_template_file_types)
llm_response = chain.run(content="")

# Step 4: Print the LLM output
print("🔍 LLM-generated file extensions list:")
print(llm_response)

# Step 5: Parse LLM response into a Python list
try:
    # Extract only the list portion using regex
    match = re.search(r"\[(.*?)\]", llm_response, re.DOTALL)
    if match:
        list_str = "[" + match.group(1) + "]"
        common_extensions = ast.literal_eval(list_str)
    else:
        raise ValueError("❌ No list found in LLM response.")
except Exception as e:
    raise ValueError("❌ Failed to parse LLM response into a list.") from e

# Step 6: Build dynamic regex
ext_pattern = '|'.join(common_extensions)
file_pattern = re.compile(r'[\w\-/\.]{8,}\.(?:' + ext_pattern + r')', re.IGNORECASE)

# Step 7: Define chats path and extract files
chats_path = 'I-Soon-data/md/chats'
extracted_files_from_chats = {}

for filename in os.listdir(chats_path):
    if filename.endswith('.md'):
        full_path = os.path.join(chats_path, filename)
        with open(full_path, 'r', encoding='utf-8') as file:
            content = file.read()
            matches = file_pattern.findall(content)
            if matches:
                extracted_files_from_chats[filename] = matches

# Step 8: Display results
print("\n📄 Extracted filenames from chats:")
for chat, files in extracted_files_from_chats.items():
    print(f"{chat}:")
    for f in files:
        print(f"  - {f}")

🔍 LLM-generated file extensions list:
[txt, json, cfg, db, mp3, wav, mp4, flv, jpg, jpeg, png, gif, pdf, docx, xlsx, pptx, rar, zip, 7z, tar, gz, bz2, sqlite, mdb, accdb, xml, html, css, js, sql, mdf, bak, dat, log, eml, pst, msg, odt, odp, odg, docs, xls, xla, ppt, xlsm]


ValueError: ❌ Failed to parse LLM response into a list.

# *Search which files are in the leaked data with Threads*

In [1]:
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm
import math

# Load and clean lines
with open('I-Soon-data/md/chats/10.md', 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

# Split lines into batches
def batch_lines(lines, batch_size):
    for i in range(0, len(lines), batch_size):
        yield i, lines[i:i+batch_size]

# Parameters
n_threads = 72  
batch_size = math.ceil(len(lines)/n_threads)
        
# Regex and shared resources
file_regex = re.compile(r'\b[\w\-]+(?:\.[\w\-]+)*\.\w{2,6}\b')
found_files = []
lock = threading.Lock()

# Prompt template (adjusted for batches)
prompt_template = PromptTemplate(
    input_variables=["content"],
    template="""
You are a cybersecurity analyst with expert knowledge of file types and syntax used to reference files in certain documents and related contexts.
Your task is to analyze the given content and extract all explicit file names with extensions.
Follow these guidelines:
    • Only include file names that explicitly contain a valid extension.
    • Consider common patterns such as filename.ext, [text](filename.ext), src="filename.ext", path/to/filename.ext, etc.
    • Recognize contextual clues like source, reference, include, or links that point to files.
    • Use your knowledge of the top 50 most common file extensions to guide detection.
    • Do not infer or fabricate file names based on ambiguous text. Do not extract names without a clear extension.
    •  Use your knowledge of the top 50 most common file extensions to guide detection.
    • Output only: a single line list of the detected file names with extensions, separated by commas. No explanation or commentary.
Content:
{content}

"""
)

# Progress bar
progress_bar = tqdm(total=(len(lines) // batch_size) + 1, desc="🔍 Processing", ncols=100)

# Worker function for each batch
def process_batch(start_idx: int, lines_batch: list[str]):
    thread_id = threading.get_ident()
    try:
        llm = Ollama(model="qwen2.5:14b")
        chain = LLMChain(llm=llm, prompt=prompt_template)

        content = "\n".join(lines_batch)
        start_time = time.time()
        response = chain.run(content=content)
        duration = time.time() - start_time

        matches = file_regex.findall(response)
        match_count = len(matches)

        if matches:
            with lock:
                found_files.extend([m.strip().lower() for m in matches])

        print(f"[Thread-{thread_id}] ✅ Batch starting at line {start_idx + 1} done in {duration:.2f}s | Matches: {match_count}")
    except Exception as e:
        print(f"[Thread-{thread_id}] ❌ Error on batch starting at line {start_idx + 1}: {e}")
    finally:
        progress_bar.update(1)

# Start threaded batch processing
print(f"\n🚀 Starting with {n_threads} threads and batch size {batch_size} on {len(lines)} lines...\n")

with ThreadPoolExecutor(max_workers=n_threads) as executor:
    futures = [executor.submit(process_batch, i, batch) for i, batch in batch_lines(lines, batch_size)]
    for future in as_completed(futures):
        future.result()

progress_bar.close()

# Output results
print("\n🎯 Unique filenames found:")
unique_files = sorted(set(found_files))
for f in unique_files:
    print(f)

print(f"\n✅ Total unique files found: {len(unique_files)}")

  llm = Ollama(model="qwen2.5:14b")
  chain = LLMChain(llm=llm, prompt=prompt_template)
  response = chain.run(content=content)



🚀 Starting with 72 threads and batch size 7 on 449 lines...



🔍 Processing:   2%|▊                                                | 1/65 [00:32<34:21, 32.22s/it]

[Thread-140575202334272] ✅ Batch starting at line 8 done in 32.20s | Matches: 0
[Thread-140575177156160] ✅ Batch starting at line 29 done in 32.21s | Matches: 0


🔍 Processing:   5%|██▎                                              | 3/65 [00:35<09:47,  9.48s/it]

[Thread-140574581569088] ✅ Batch starting at line 71 done in 35.25s | Matches: 0


🔍 Processing:   6%|███                                              | 4/65 [00:36<06:42,  6.60s/it]

[Thread-140575210726976] ✅ Batch starting at line 1 done in 36.40s | Matches: 0
[Thread-140574573176384] ✅ Batch starting at line 78 done in 36.49s | Matches: 0


🔍 Processing:   9%|████▌                                            | 6/65 [00:38<03:43,  3.79s/it]

[Thread-140574564783680] ✅ Batch starting at line 85 done in 38.30s | Matches: 0


🔍 Processing:  11%|█████▎                                           | 7/65 [00:39<02:58,  3.08s/it]

[Thread-140571930785344] ✅ Batch starting at line 288 done in 39.36s | Matches: 0


🔍 Processing:  12%|██████                                           | 8/65 [00:40<02:24,  2.53s/it]

[Thread-140570303387200] ✅ Batch starting at line 449 done in 40.40s | Matches: 0


🔍 Processing:  14%|██████▊                                          | 9/65 [00:41<01:58,  2.12s/it]

[Thread-140573533005376] ✅ Batch starting at line 148 done in 41.50s | Matches: 0


🔍 Processing:  15%|███████▍                                        | 10/65 [00:42<01:43,  1.87s/it]

[Thread-140573516219968] ✅ Batch starting at line 162 done in 42.76s | Matches: 0


🔍 Processing:  17%|████████                                        | 11/65 [00:44<01:31,  1.69s/it]

[Thread-140572987741760] ✅ Batch starting at line 204 done in 44.00s | Matches: 0


🔍 Processing:  18%|████████▊                                       | 12/65 [00:44<01:18,  1.48s/it]

[Thread-140570815080000] ✅ Batch starting at line 421 done in 44.92s | Matches: 0


🔍 Processing:  20%|█████████▌                                      | 13/65 [00:46<01:11,  1.37s/it]

[Thread-140572996134464] ✅ Batch starting at line 197 done in 46.06s | Matches: 0


🔍 Processing:  22%|██████████▎                                     | 14/65 [00:47<01:10,  1.39s/it]

[Thread-140571897214528] ✅ Batch starting at line 316 done in 47.47s | Matches: 0


🔍 Processing:  23%|███████████                                     | 15/65 [00:48<01:02,  1.25s/it]

[Thread-140571880429120] ✅ Batch starting at line 330 done in 48.42s | Matches: 0


🔍 Processing:  25%|███████████▊                                    | 16/65 [00:49<01:02,  1.28s/it]

[Thread-140574606747200] ✅ Batch starting at line 50 done in 49.78s | Matches: 0


🔍 Processing:  26%|████████████▌                                   | 17/65 [00:51<01:06,  1.38s/it]

[Thread-140574027912768] ✅ Batch starting at line 134 done in 51.41s | Matches: 0


🔍 Processing:  28%|█████████████▎                                  | 18/65 [00:52<01:01,  1.30s/it]

[Thread-140573541398080] ✅ Batch starting at line 141 done in 52.51s | Matches: 0


🔍 Processing:  29%|██████████████                                  | 19/65 [00:53<01:01,  1.33s/it]

[Thread-140570311779904] ✅ Batch starting at line 442 done in 53.88s | Matches: 0


🔍 Processing:  31%|██████████████▊                                 | 20/65 [00:55<00:56,  1.27s/it]

[Thread-140573507827264] ✅ Batch starting at line 169 done in 55.02s | Matches: 0


🔍 Processing:  32%|███████████████▌                                | 21/65 [00:56<01:01,  1.40s/it]

[Thread-140572417300032] ✅ Batch starting at line 281 done in 56.72s | Matches: 0


🔍 Processing:  34%|████████████████▏                               | 22/65 [00:57<00:57,  1.33s/it]

[Thread-140574061483584] ✅ Batch starting at line 106 done in 57.91s | Matches: 0


🔍 Processing:  35%|████████████████▉                               | 23/65 [00:58<00:51,  1.22s/it]

[Thread-140573491041856] ✅ Batch starting at line 183 done in 58.87s | Matches: 0


🔍 Processing:  37%|█████████████████▋                              | 24/65 [01:00<00:54,  1.34s/it]

[Thread-140575185548864] ✅ Batch starting at line 22 done in 60.50s | Matches: 0


🔍 Processing:  38%|██████████████████▍                             | 25/65 [01:02<00:57,  1.43s/it]

[Thread-140571385521728] ✅ Batch starting at line 344 done in 62.11s | Matches: 0


🔍 Processing:  40%|███████████████████▏                            | 26/65 [01:03<00:54,  1.40s/it]

[Thread-140570831865408] ✅ Batch starting at line 407 done in 63.42s | Matches: 0


🔍 Processing:  42%|███████████████████▉                            | 27/65 [01:04<00:48,  1.27s/it]

[Thread-140572459263552] ✅ Batch starting at line 246 done in 64.42s | Matches: 0


🔍 Processing:  43%|████████████████████▋                           | 28/65 [01:05<00:43,  1.17s/it]

[Thread-140572954170944] ✅ Batch starting at line 232 done in 65.35s | Matches: 0


🔍 Processing:  45%|█████████████████████▍                          | 29/65 [01:05<00:32,  1.12it/s]

[Thread-140574615139904] ✅ Batch starting at line 43 done in 65.61s | Matches: 4


🔍 Processing:  46%|██████████████████████▏                         | 30/65 [01:07<00:40,  1.17s/it]

[Thread-140571360343616] ✅ Batch starting at line 365 done in 67.40s | Matches: 0


🔍 Processing:  48%|██████████████████████▉                         | 31/65 [01:08<00:39,  1.17s/it]

[Thread-140571913999936] ✅ Batch starting at line 302 done in 68.58s | Matches: 0


🔍 Processing:  49%|███████████████████████▋                        | 32/65 [01:13<01:13,  2.24s/it]

[Thread-140572450870848] ✅ Batch starting at line 253 done in 73.31s | Matches: 2


🔍 Processing:  51%|████████████████████████▎                       | 33/65 [01:14<01:01,  1.91s/it]

[Thread-140573524612672] ✅ Batch starting at line 155 done in 74.47s | Matches: 0


🔍 Processing:  52%|█████████████████████████                       | 34/65 [01:15<00:53,  1.73s/it]

[Thread-140573499434560] ✅ Batch starting at line 176 done in 75.78s | Matches: 0


🔍 Processing:  54%|█████████████████████████▊                      | 35/65 [01:16<00:44,  1.49s/it]

[Thread-140572425692736] ✅ Batch starting at line 274 done in 76.69s | Matches: 0


🔍 Processing:  55%|██████████████████████████▌                     | 36/65 [01:17<00:38,  1.33s/it]

[Thread-140571343558208] ✅ Batch starting at line 379 done in 77.64s | Matches: 0


🔍 Processing:  58%|████████████████████████████                    | 38/65 [01:19<00:26,  1.01it/s]

[Thread-140574053090880] ✅ Batch starting at line 113 done in 78.98s | Matches: 0
[Thread-140572442478144] ✅ Batch starting at line 260 done in 79.16s | Matches: 2


🔍 Processing:  60%|████████████████████████████▊                   | 39/65 [01:20<00:29,  1.15s/it]

[Thread-140571888821824] ✅ Batch starting at line 323 done in 80.69s | Matches: 0


🔍 Processing:  62%|█████████████████████████████▌                  | 40/65 [01:21<00:29,  1.16s/it]

[Thread-140570848650816] ✅ Batch starting at line 393 done in 81.87s | Matches: 0
[Thread-140574078268992] ✅ Batch starting at line 92 done in 81.92s | Matches: 0


🔍 Processing:  65%|███████████████████████████████                 | 42/65 [01:23<00:23,  1.04s/it]

[Thread-140574589961792] ✅ Batch starting at line 64 done in 83.71s | Matches: 0
[Thread-140572434085440] ✅ Batch starting at line 267 done in 83.69s | Matches: 0


🔍 Processing:  68%|████████████████████████████████▍               | 44/65 [01:25<00:19,  1.09it/s]

[Thread-140574598354496] ✅ Batch starting at line 57 done in 85.16s | Matches: 0
[Thread-140573004527168] ✅ Batch starting at line 190 done in 85.23s | Matches: 0


🔍 Processing:  71%|█████████████████████████████████▉              | 46/65 [01:26<00:16,  1.18it/s]

[Thread-140570806687296] ✅ Batch starting at line 428 done in 86.58s | Matches: 0


🔍 Processing:  72%|██████████████████████████████████▋             | 47/65 [01:27<00:16,  1.11it/s]

[Thread-140572467656256] ✅ Batch starting at line 239 done in 87.70s | Matches: 0
[Thread-140570823472704] ✅ Batch starting at line 414 done in 87.69s | Matches: 0


🔍 Processing:  75%|████████████████████████████████████▏           | 49/65 [01:29<00:14,  1.11it/s]

[Thread-140574036305472] ✅ Batch starting at line 127 done in 89.54s | Matches: 0


🔍 Processing:  77%|████████████████████████████████████▉           | 50/65 [01:30<00:14,  1.04it/s]

[Thread-140571905607232] ✅ Batch starting at line 309 done in 90.68s | Matches: 0


🔍 Processing:  78%|█████████████████████████████████████▋          | 51/65 [01:31<00:14,  1.02s/it]

[Thread-140575193941568] ✅ Batch starting at line 15 done in 91.93s | Matches: 0


🔍 Processing:  80%|██████████████████████████████████████▍         | 52/65 [01:32<00:13,  1.02s/it]

[Thread-140572962563648] ✅ Batch starting at line 225 done in 92.91s | Matches: 0


🔍 Processing:  82%|███████████████████████████████████████▏        | 53/65 [01:34<00:12,  1.03s/it]

[Thread-140571377129024] ✅ Batch starting at line 351 done in 93.97s | Matches: 0


🔍 Processing:  83%|███████████████████████████████████████▉        | 54/65 [01:35<00:12,  1.15s/it]

[Thread-140570840258112] ✅ Batch starting at line 400 done in 95.44s | Matches: 0


🔍 Processing:  85%|████████████████████████████████████████▌       | 55/65 [01:36<00:11,  1.15s/it]

[Thread-140574069876288] ✅ Batch starting at line 99 done in 96.64s | Matches: 0


🔍 Processing:  86%|█████████████████████████████████████████▎      | 56/65 [01:37<00:09,  1.07s/it]

[Thread-140571922392640] ✅ Batch starting at line 295 done in 97.47s | Matches: 0


🔍 Processing:  88%|██████████████████████████████████████████      | 57/65 [01:38<00:08,  1.03s/it]

[Thread-140570320172608] ✅ Batch starting at line 435 done in 98.40s | Matches: 0


🔍 Processing:  89%|██████████████████████████████████████████▊     | 58/65 [01:39<00:07,  1.01s/it]

[Thread-140571351950912] ✅ Batch starting at line 372 done in 99.38s | Matches: 0


🔍 Processing:  91%|███████████████████████████████████████████▌    | 59/65 [01:42<00:09,  1.58s/it]

[Thread-140575168763456] ✅ Batch starting at line 36 done in 102.34s | Matches: 2


🔍 Processing:  92%|████████████████████████████████████████████▎   | 60/65 [01:43<00:07,  1.58s/it]

[Thread-140570857043520] ✅ Batch starting at line 386 done in 103.88s | Matches: 0


🔍 Processing:  94%|█████████████████████████████████████████████   | 61/65 [01:45<00:05,  1.43s/it]

[Thread-140571368736320] ✅ Batch starting at line 358 done in 104.97s | Matches: 0


🔍 Processing:  95%|█████████████████████████████████████████████▊  | 62/65 [01:46<00:04,  1.36s/it]

[Thread-140572970956352] ✅ Batch starting at line 218 done in 106.16s | Matches: 0


🔍 Processing:  97%|██████████████████████████████████████████████▌ | 63/65 [01:48<00:03,  1.77s/it]

[Thread-140574044698176] ✅ Batch starting at line 120 done in 108.90s | Matches: 0


🔍 Processing:  98%|███████████████████████████████████████████████▎| 64/65 [01:49<00:01,  1.51s/it]

[Thread-140571393914432] ✅ Batch starting at line 337 done in 109.80s | Matches: 0


🔍 Processing: 100%|████████████████████████████████████████████████| 65/65 [01:50<00:00,  1.70s/it]

[Thread-140572979349056] ✅ Batch starting at line 211 done in 110.74s | Matches: 2

🎯 Unique filenames found:
0-4ea07c23-a1a6-411b-bcfb-552d095b66c9.png
0-5a84cde3-7175-4044-8c88-d4c883a8fd38.png
0-6bcc0131-e4ad-421e-bb1f-d8ebe5eeec7b.png
0-79d9b7f2-cfe4-4615-9b75-8fea33fc0c9d.png
0-e705d192-90ee-4fd1-9dcd-061958d1817f.png
12756724-394c-4576-b373-7c53f1abbd94.md
4ea07c23-a1a6-411b-bcfb-552d095b66c9.png
5a84cde3-7175-4044-8c88-d4c883a8fd38.png
6bcc0131-e4ad-421e-bb1f-d8ebe5eeec7b.png
79d9b7f2-cfe4-4615-9b75-8fea33fc0c9d.png
e705d192-90ee-4fd1-9dcd-061958d1817f.png
安洵.7z

✅ Total unique files found: 12





In [None]:
# import shutil
# import os

# # Directory to delete
# EXTRACT_DIR = 'I-Soon-data'

# # Check if the directory exists and delete it
# if os.path.isdir(EXTRACT_DIR):
#     shutil.rmtree(EXTRACT_DIR)
#     print(f"Directory '{EXTRACT_DIR}' has been deleted.")
# else:
#     print(f"Directory '{EXTRACT_DIR}' does not exist.")