In [6]:
import os
import sys

# Which Python interpreter is backing this kernel?
print(sys.executable)

# Logical CPU count, displayed as the cell's output value.
os.cpu_count()

/home/jovyan/IsoonAI/bin/python


144

In [7]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil


# Paths
# Previous value, kept for reference: ZIP_PATH = 'I-Soon-data.zip'
ZIP_PATH = '0.zip'    # Path to the downloaded zip archive
EXTRACT_DIR = 'I-Soon-data'        # Directory the archive is extracted into
# Sanity-check output confirming the cell ran
print("Checkout git")
print("kernel is working")

Checkout git
kernel is working


In [8]:
# Unpack the archive once; an existing extraction directory is left untouched.
if os.path.isdir(EXTRACT_DIR):
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")
else:
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")

Extracted archive to 'I-Soon-data'


# **DATA TYPE CATEGORIZATION**

In [9]:
# Define the parent directory
parent_directory = "I-Soon-data"

# Full path to the __MACOSX folder
macosx_folder = os.path.join(parent_directory, "__MACOSX")


def unique_destination(folder, file_name):
    """Return a path inside ``folder`` for ``file_name`` that is not taken yet.

    When ``folder/file_name`` already exists, ``_1``, ``_2``, ... is inserted
    before the extension until a free name is found, so moving a file there
    can never overwrite another one.
    """
    candidate = os.path.join(folder, file_name)
    stem, suffix = os.path.splitext(file_name)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{counter}{suffix}")
        counter += 1
    return candidate


def organize_by_extension(base_dir):
    """Move every file below ``base_dir`` into ``base_dir/<extension>/``.

    Hidden files (leading ``.``) and anything under a ``__MACOSX`` path are
    skipped.  Files without an extension go to ``no_extension``.  Files that
    already sit directly in their extension folder are left alone, and a name
    clash in the target folder gets a numeric suffix instead of overwriting.

    Returns:
        The number of files that were moved.
    """
    # Snapshot the tree first so it is not modified while os.walk iterates it.
    pending = []
    for root, _dirs, files in os.walk(base_dir):
        if '__MACOSX' in root:
            continue
        pending.extend((root, file) for file in files if not file.startswith('.'))

    moved = 0
    for root, file in pending:
        # Extension in lowercase, without the dot
        file_extension = os.path.splitext(file)[1].lower().lstrip('.')
        if not file_extension:
            file_extension = "no_extension"

        subfolder_path = os.path.join(base_dir, file_extension)
        os.makedirs(subfolder_path, exist_ok=True)

        # Already in the right place: nothing to do
        if os.path.abspath(root) == os.path.abspath(subfolder_path):
            continue

        shutil.move(os.path.join(root, file), unique_destination(subfolder_path, file))
        moved += 1
    return moved


def remove_empty_dirs(base_dir):
    """Delete empty folders below ``base_dir``, deepest first.

    Emptiness is checked on disk at visit time, so a folder whose only
    children were empty folders is removed as well.  ``base_dir`` itself is
    never deleted.  Failures are reported and skipped.

    Returns:
        The list of removed folder paths.
    """
    removed = []
    root_path = os.path.abspath(base_dir)
    for dirpath, _dirnames, _filenames in os.walk(base_dir, topdown=False):
        if os.path.abspath(dirpath) == root_path:
            continue
        try:
            if os.listdir(dirpath):
                continue
            os.rmdir(dirpath)
        except OSError as exc:
            # Best effort: report and carry on (e.g. permission problems)
            print(f"Could not remove folder {dirpath}: {exc}")
            continue
        removed.append(dirpath)
        print(f"Removed empty folder: {dirpath}")
    return removed


# Check if __MACOSX exists and remove it
if os.path.isdir(macosx_folder):
    shutil.rmtree(macosx_folder)
    print(f"Deleted: {macosx_folder}")
else:
    print(f"Folder not found: {macosx_folder}")

# Organize files by extension into subfolders
moved_file_count = organize_by_extension(parent_directory)

# Remove any empty folders within the parent directory
removed_folders = remove_empty_dirs(parent_directory)

Deleted: I-Soon-data/__MACOSX


# **Markdown File Classification Using Local LLM (Ollama + LangChain)**

In [11]:
import os
import glob
import shutil
import concurrent.futures
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm

# === Step 1: Set up LLM and Prompt ===

# Prompt sent for every file; {content} is replaced with the file excerpt.
classification_template = """
You are analyzing the content of a Markdown (.md) file.

Markdown content:
\"\"\"
{content}
\"\"\"

1. Classify the content into one of the following categories ONLY: chats, images, other.
2. State your confidence in the classification as one of: high, medium, or low.

Respond in the following format:
Category: <chats|images|other>
Confidence: <high|medium|low>
"""

# Model served through Ollama.
llm = Ollama(model="llama3.1:8b")

prompt_template = PromptTemplate(
    template=classification_template,
    input_variables=["content"],
)

# Chain that fills the prompt and queries the model.
chain = LLMChain(prompt=prompt_template, llm=llm)

# === Step 2: Preprocessing Function ===

def preprocess_first_20_lines(file_path):
    """Read the first 20 lines of a text file and flatten them into one string.

    Args:
        file_path: Path of the file to sample.

    Returns:
        A ``(file_name, content)`` tuple.  ``file_name`` is the base name of
        ``file_path``; ``content`` is the first 20 lines, each stripped of
        surrounding whitespace and joined with single spaces.  ``content`` is
        ``""`` when the file cannot be opened or read (the failure is
        printed so skipped files are visible).
    """
    file_name = os.path.basename(file_path)
    try:
        # errors="replace": a stray non-UTF-8 byte becomes U+FFFD instead of
        # making the whole file unreadable and silently skipping it.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            # zip stops after 20 items without consuming a 21st line
            lines = [line.strip() for _, line in zip(range(20), f)]
    except OSError as exc:
        print(f"Could not read {file_name}: {exc}")
        return file_name, ""
    return file_name, " ".join(lines)

# === Step 3: Load .md Files ===

md_dir = "I-Soon-data/md"
md_files = glob.glob(os.path.join(md_dir, "*.md"))

# Read the excerpts in parallel; each item is a (file_name, content) tuple
with concurrent.futures.ThreadPoolExecutor() as executor:
    file_data = list(executor.map(preprocess_first_20_lines, md_files))

# === Step 4: Classify Each File and Move ===

valid_categories = {"chats", "images", "other"}
results_log = []


def parse_response_field(response_lines, label):
    """Return the first word the model gave for ``label``, or ``""``.

    ``response_lines`` are the lower-cased lines of the reply.  A line counts
    when, after removing indentation and Markdown emphasis markers, it starts
    with ``"<label>:"`` -- so "  Category: chats" and "**category:** chats"
    are both understood.  Decoration around the value (angle brackets copied
    from the format hint, quotes, a trailing period) is stripped.
    """
    prefix = f"{label}:"
    for raw_line in response_lines:
        cleaned = raw_line.replace("*", "").strip()
        if not cleaned.startswith(prefix):
            continue
        words = cleaned[len(prefix):].split()
        return words[0].strip("<>\"'`.,") if words else ""
    return ""


for file_name, content in tqdm(file_data, desc="Classifying files"):
    # Files that could not be read (or are empty) stay where they are
    if not content:
        continue

    try:
        response = chain.run(content=content).strip().lower()
        lines = response.splitlines()

        category = parse_response_field(lines, "category")
        confidence = parse_response_field(lines, "confidence")

        # Anything outside the allowed labels falls back to "other"
        if category not in valid_categories:
            category = "other"

        # Destination folder *within* the md_dir
        category_path = os.path.join(md_dir, category)
        os.makedirs(category_path, exist_ok=True)

        # Move file into category folder
        src_path = os.path.join(md_dir, file_name)
        dst_path = os.path.join(category_path, file_name)

        if os.path.exists(src_path):
            shutil.move(src_path, dst_path)

        results_log.append({
            "file": file_name,
            "category": category,
            "confidence": confidence
        })

    except Exception as e:
        # Keep going: one failed model call should not abort the whole batch
        print(f"Failed to process {file_name}: {e}")

# === Optional: Save results to file ===

# import json
# with open("classification_results.json", "w", encoding="utf-8") as f:
#     json.dump(results_log, f, indent=2)

# import csv
# with open("classification_results.csv", "w", newline="", encoding="utf-8") as f:
#     writer = csv.DictWriter(f, fieldnames=["file", "category", "confidence"])
#     writer.writeheader()
#     writer.writerows(results_log)

Classifying files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70/70 [02:37<00:00,  2.24s/it]


# **Stage 2: LLM-Based Categorization and File Organization by Description**

In [12]:
# import os
# import shutil
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain
# from langchain.llms import Ollama
# from tqdm import tqdm

# # Initialize second LLM
# llm2 = Ollama(model="taozhiyuai/llama-3-refueled:q4_k_m")

# # Prompt for classification based on description
# prompt_template_stage2 = PromptTemplate(
#     input_variables=["description"],
#     template="""
# You are a strict content classifier.

# Given the following short description of a Markdown (.md) file:

# \"\"\"
# {description}
# \"\"\"

# Classify the content into one of these categories only:
# - chats
# - images
# - other

# Return only one of those three exact words (in lowercase). Do not use synonyms or explanations. Do not make up new categories.
# """
# )

# # Set up LangChain chain
# chain2 = LLMChain(llm=llm2, prompt=prompt_template_stage2)

# # Directories
# original_dir = "I-Soon-data/md"
# filtered_dir = "Filtered-markdowns"
# os.makedirs(filtered_dir, exist_ok=True)

# # Allowed categories
# valid_categories = {"chats", "images", "other"}

# # Reclassify and copy files
# for file_name, description in tqdm(results.items()):
#     try:
#         category = chain2.run(description=description).strip().lower()
#         if category not in valid_categories:
#             category = "other"  # fallback to default

#         # Create destination folder
#         category_path = os.path.join(filtered_dir, category)
#         os.makedirs(category_path, exist_ok=True)

#         # Copy the file
#         src_path = os.path.join(original_dir, file_name)
#         dst_path = os.path.join(category_path, file_name)

#         if os.path.exists(src_path):
#             shutil.copy2(src_path, dst_path)

#     except Exception as e:
#         print(f"Failed to classify or copy {file_name}: {e}")

# **Finding connections between the MD files - reduced size due to performance issues**

Idea: Have the LLM search the markdown files and look for any files linked outside the chats.

# *RegEx based identification of linked files within the chat*

In [13]:
import os
import re
import ast
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Step 2 (defined first): prompt asking the model for a Python-style list of extensions
extension_list_template = """
List the top 50 file extensions most commonly found in cybersecurity leaked data, including data from breaches, ransomware leaks, and dark web dumps. THINK LIKE A CYBERSECURITY EXPERT.
Focus on file types that typically contain credentials, configurations, databases, personal data, internal documentation, archives, or images (e.g., screenshots of sensitive material). 

OUTPUT INSTRUCTIONS:
ONLY OUTPUT the extensions as a clean Python list format, like [<'file_extension'>, <'file_extension'>, etc.] 
Don't include "." and ALWAYS use "'" in the list. 
Do not include any explanations, comments, or extra text. 
JUST GIVE THE LIST.
"""

prompt_template_file_types = PromptTemplate(
    template=extension_list_template,
    input_variables=["content"],
)

# Step 1: model used for this stage
llm2 = Ollama(model="llama3.1:8b")

# Step 3: run the chain once; the template has no placeholder, so the
# declared "content" variable is passed as an empty string
chain = LLMChain(prompt=prompt_template_file_types, llm=llm2)
llm_response = chain.run(content="")

# Step 4: Print the LLM output
print("🔍 LLM-generated file extensions list:")
print(llm_response)

# Step 5: Parse LLM response into a Python list
try:
    # Extract only the list portion using regex
    match = re.search(r"\[(.*?)\]", llm_response, re.DOTALL)
    if not match:
        raise ValueError("❌ No list found in LLM response.")
    list_body = match.group(1)

    try:
        parsed_items = ast.literal_eval("[" + list_body + "]")
    except (ValueError, SyntaxError):
        # The prompt's format hint shows <'ext'> placeholders; if the model
        # copies the angle brackets the text is not a valid literal, so fall
        # back to collecting the quoted tokens.
        parsed_items = re.findall(r"""['"]([^'"]+)['"]""", list_body)

    # Normalise (no leading dot, lowercase) and drop duplicates, keeping order
    common_extensions = []
    for item in parsed_items:
        extension = str(item).strip().lstrip(".").lower()
        if extension and extension not in common_extensions:
            common_extensions.append(extension)

    if not common_extensions:
        raise ValueError("❌ LLM response contained no usable extensions.")
except Exception as e:
    raise ValueError("❌ Failed to parse LLM response into a list.") from e

# Step 6: Build dynamic regex
# De-duplicate, then try longer extensions first so 'dbf' is not cut to 'db'.
unique_extensions = sorted(
    dict.fromkeys(str(ext) for ext in common_extensions if str(ext)),
    key=len,
    reverse=True,
)
if not unique_extensions:
    # An empty alternation would match any text that merely ends in a dot
    raise ValueError("No file extensions available to build the filename pattern.")

# re.escape: the extensions come from an LLM and must be treated as literal text
ext_pattern = '|'.join(re.escape(ext) for ext in unique_extensions)
# The trailing look-ahead rejects a match that stops in the middle of a longer
# ASCII extension; it is ASCII-only so a name directly followed by CJK text
# still matches.
file_pattern = re.compile(
    r'[\w\-/\.]{8,}\.(?:' + ext_pattern + r')(?![A-Za-z0-9_])',
    re.IGNORECASE,
)

# Step 7: Define chats path and extract files
chats_path = 'I-Soon-data/md/chats'
extracted_files_from_chats = {}

# The folder only exists if at least one file was classified as a chat.
# Sorting makes the scan (and the printed report) order deterministic.
if os.path.isdir(chats_path):
    chat_filenames = sorted(os.listdir(chats_path))
else:
    chat_filenames = []
    print(f"Chats folder not found: {chats_path}")

for filename in chat_filenames:
    if not filename.endswith('.md'):
        continue
    full_path = os.path.join(chats_path, filename)
    # errors='replace' keeps one badly encoded byte from aborting the scan
    with open(full_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()
    matches = file_pattern.findall(content)
    if matches:
        extracted_files_from_chats[filename] = matches

# Step 8: Display results
print("\n📄 Extracted filenames from chats:")
for chat, files in extracted_files_from_chats.items():
    print(f"{chat}:")
    for f in files:
        print(f"  - {f}")

üîç LLM-generated file extensions list:
['mdb', 'xlsb', 'xlsm', 'xlsx', 'xltx', 'xltm', 'pptm', 'pptx', 'sldm', 'sldx', 'docm', 'docx', 'dotx', 'odp', 'ods', 'odt', 'mp4', 'avi', 'flv', 'mov', 'wmv', 'jpg', 'jpeg', 'png', 'gif', 'bmp', 'tif', 'tiff', 'txt', 'csv', 'json', 'xml', 'ini', 'config', 'db', 'sqlite3', 'mdb', 'accdb', 'mdf', 'dbf', 'dwg', 'dxf', 'doc', 'xls', 'ppt', 'rar', 'zip', '7z', 'tar', 'gz', 'bz2', 'xz', 'lzma', 'sql', 'dat', 'log', 'tmp', 'bak', 'cfg', 'conf', 'key']

üìÑ Extracted filenames from chats:
1.md:
  - ÂÆâÊ¥µÈ°πÁõÆ-ÊäïËµÑÊÑèÂêë‰π¶-200730.DOCX
  - ËØæÁ®ãË°®2020_Ë•øÂÆâ_ÂÜÖÁâà.docx
  - c404_indictment_0.docx
  - wong_indictment_redacted_0.docx
  - zhr_indictment_redacted_0.docx
  - ‰∫ßÊïôËûçÂêàÂª∫ËÆæÈ°πÁõÆÁî≥Êä•‰π¶20201028.docx
  - tpyrced_ÊâìÂáªÊï¥Ê≤ªÊ∂âÁΩëÁäØÁΩ™ÊäÄÊúØÊúçÂä°ÂêàÂêå0201-Â§ßÁêÜ.docx
  - ‰∏™‰∫∫ÁÆÄÂéÜË°®-ÂæêÂ≠êËØë.docx
  - Â§™ÊûÅÂÖ¨Âè∏Âü∫Êú¨ÊÉÖÂÜµ‰ªãÁªç20210809.pptx
  - 20220110.doc
  - Â∏ÇÂú∫ÂØπÂ§ñ2022-Êµ∑ÂçóÂÆâÊ¥µ.pptx
  - ÂâçÁ´ØÈîÄÂîÆÂèçÈ¶àÈó

# *Search which files are in the leaked data with Threads*

In [4]:
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm

# Read the chat file, trimming each line and discarding the blank ones
with open('I-Soon-data/md/chats/10.md', 'r', encoding='utf-8') as chat_file:
    lines = list(filter(None, map(str.strip, chat_file)))

# State shared by the worker threads: the result list and the lock guarding it
found_files = []
lock = threading.Lock()

# Pattern used to pull filename-like tokens out of the model's reply
file_regex = re.compile(r'\b[\w\-]+(?:\.[\w\-]+)*\.\w{2,6}\b')

# Instruction text for the extraction prompt; {content} is one chat line
extraction_template = """
You are a cybersecurity analyst with expert knowledge of file types and syntax used to reference files in certain documents and related contexts.
Your task is to analyze the given content and extract all explicit file names with extensions.
Follow these guidelines:
    ‚Ä¢ Only include file names that explicitly contain a valid extension.
    ‚Ä¢ Consider common patterns such as filename.ext, [text](filename.ext), src="filename.ext", path/to/filename.ext, etc.
    ‚Ä¢ Recognize contextual clues like source, reference, include, or links that point to files.
    ‚Ä¢ Use your knowledge of the top 50 most common file extensions to guide detection.
    ‚Ä¢ Do not infer or fabricate file names based on ambiguous text. Do not extract names without a clear extension.
    ‚Ä¢ Output only: a single line list of the detected file names with extensions, separated by commas. No explanation or commentary.
Content:
{content}
"""

prompt_template = PromptTemplate(
    template=extraction_template,
    input_variables=["content"],
)

# Progress bar setup
progress_bar = tqdm(total=len(lines), desc="🔍 Processing", ncols=100)

# Worker function
def process_line(i: int, content: str):
    """Ask the LLM which file names one chat line mentions and record them.

    Args:
        i: Zero-based index of the line (reported one-based in the log).
        content: Text of the line, inserted into ``prompt_template``.

    Side effects:
        Extends ``found_files`` (under ``lock``) with the lower-cased matches
        of ``file_regex`` in the reply, prints a status line, and advances
        ``progress_bar``.  Exceptions are caught and reported, never raised.
    """
    thread_id = threading.get_ident()
    try:
        # A client/chain pair is built per call, so nothing is shared between threads
        llm = Ollama(model="llama3.3:70b")
        chain = LLMChain(llm=llm, prompt=prompt_template)

        # perf_counter is monotonic, so the duration cannot be skewed by clock changes
        start = time.perf_counter()
        response = chain.run(content=content)
        duration = time.perf_counter() - start

        matches = file_regex.findall(response)
        match_count = len(matches)

        if matches:
            with lock:
                found_files.extend(m.strip().lower() for m in matches)

        print(f"[Thread-{thread_id}] ✅ Line {i+1} done in {duration:.2f}s | Matches: {match_count}")
    except Exception as e:
        print(f"[Thread-{thread_id}] ❌ Error on line {i+1}: {e}")
    finally:
        progress_bar.update(1)

# Run with 2 threads
n_threads = 2

print(f"\n🚀 Starting with {n_threads} threads on {len(lines)} lines...\n")

executor = ThreadPoolExecutor(max_workers=n_threads)
futures = [executor.submit(process_line, i, line) for i, line in enumerate(lines)]
try:
    for future in as_completed(futures):
        future.result()
except BaseException:
    # On interrupt (or any failure) drop the queued lines; otherwise shutting
    # the pool down would still run every remaining line.  Results collected
    # so far remain available in found_files.
    for future in futures:
        future.cancel()
    executor.shutdown(wait=False)
    raise
else:
    executor.shutdown(wait=True)
finally:
    progress_bar.close()

# Output results
print("\n🎯 Unique filenames found:")
unique_files = sorted(set(found_files))
for f in unique_files:
    print(f)

print(f"\n✅ Total unique files found: {len(unique_files)}")

üîç Processing:   0%|                                                        | 0/449 [00:00<?, ?it/s]


üöÄ Starting with 2 threads on 449 lines...



üîç Processing:   0%|                                            | 1/449 [01:49<13:39:46, 109.79s/it]

[Thread-139673477314112] ‚úÖ Line 2 done in 109.78s | Matches: 0


üîç Processing:   0%|‚ñè                                             | 2/449 [01:50<5:39:49, 45.61s/it]

[Thread-139673895675456] ‚úÖ Line 1 done in 110.48s | Matches: 0


üîç Processing:   1%|‚ñé                                             | 3/449 [02:03<3:49:35, 30.89s/it]

[Thread-139673477314112] ‚úÖ Line 3 done in 14.05s | Matches: 0


üîç Processing:   1%|‚ñç                                             | 4/449 [02:04<2:20:36, 18.96s/it]

[Thread-139673895675456] ‚úÖ Line 4 done in 14.03s | Matches: 0


üîç Processing:   1%|‚ñå                                             | 5/449 [02:18<2:06:23, 17.08s/it]

[Thread-139673477314112] ‚úÖ Line 5 done in 14.42s | Matches: 0


üîç Processing:   1%|‚ñå                                             | 6/449 [02:18<1:24:56, 11.50s/it]

[Thread-139673895675456] ‚úÖ Line 6 done in 14.43s | Matches: 0


üîç Processing:   2%|‚ñã                                             | 7/449 [02:34<1:34:18, 12.80s/it]

[Thread-139673477314112] ‚úÖ Line 7 done in 16.16s | Matches: 0


üîç Processing:   2%|‚ñä                                             | 8/449 [02:35<1:05:43,  8.94s/it]

[Thread-139673895675456] ‚úÖ Line 8 done in 16.15s | Matches: 0


üîç Processing:   2%|‚ñâ                                             | 9/449 [02:50<1:21:27, 11.11s/it]

[Thread-139673477314112] ‚úÖ Line 9 done in 16.54s | Matches: 0


üîç Processing:   2%|‚ñà                                              | 10/449 [02:51<57:42,  7.89s/it]

[Thread-139673895675456] ‚úÖ Line 10 done in 16.54s | Matches: 0


üîç Processing:   2%|‚ñà                                            | 11/449 [03:05<1:10:39,  9.68s/it]

[Thread-139673477314112] ‚úÖ Line 11 done in 14.42s | Matches: 0


üîç Processing:   3%|‚ñà‚ñé                                             | 12/449 [03:06<50:33,  6.94s/it]

[Thread-139673895675456] ‚úÖ Line 12 done in 14.42s | Matches: 0


üîç Processing:   3%|‚ñà‚ñé                                           | 13/449 [03:20<1:07:10,  9.24s/it]

[Thread-139673477314112] ‚úÖ Line 13 done in 15.22s | Matches: 0


üîç Processing:   3%|‚ñà‚ñç                                             | 14/449 [03:22<50:55,  7.02s/it]

[Thread-139673895675456] ‚úÖ Line 14 done in 16.44s | Matches: 0


üîç Processing:   3%|‚ñà‚ñå                                           | 15/449 [03:37<1:07:19,  9.31s/it]

[Thread-139673477314112] ‚úÖ Line 15 done in 16.49s | Matches: 0


üîç Processing:   4%|‚ñà‚ñã                                             | 16/449 [03:37<48:24,  6.71s/it]

[Thread-139673895675456] ‚úÖ Line 16 done in 15.27s | Matches: 0


üîç Processing:   4%|‚ñà‚ñã                                           | 17/449 [03:51<1:04:31,  8.96s/it]

[Thread-139673477314112] ‚úÖ Line 17 done in 14.87s | Matches: 0


üîç Processing:   4%|‚ñà‚ñâ                                             | 18/449 [03:52<46:28,  6.47s/it]

[Thread-139673895675456] ‚úÖ Line 18 done in 14.87s | Matches: 0


üîç Processing:   4%|‚ñà‚ñâ                                           | 19/449 [04:06<1:01:14,  8.54s/it]

[Thread-139673477314112] ‚úÖ Line 19 done in 14.05s | Matches: 0


üîç Processing:   4%|‚ñà‚ñà                                             | 20/449 [04:06<44:13,  6.18s/it]

[Thread-139673895675456] ‚úÖ Line 20 done in 14.06s | Matches: 0


üîç Processing:   5%|‚ñà‚ñà‚ñé                                            | 22/449 [04:20<55:06,  7.74s/it]

[Thread-139673477314112] ‚úÖ Line 21 done in 14.48s | Matches: 0
[Thread-139673895675456] ‚úÖ Line 22 done in 13.80s | Matches: 0


üîç Processing:   5%|‚ñà‚ñà‚ñé                                          | 23/449 [04:33<1:04:16,  9.05s/it]

[Thread-139673477314112] ‚úÖ Line 23 done in 13.02s | Matches: 0


üîç Processing:   5%|‚ñà‚ñà‚ñå                                            | 24/449 [04:34<48:37,  6.86s/it]

[Thread-139673895675456] ‚úÖ Line 24 done in 13.70s | Matches: 0


üîç Processing:   6%|‚ñà‚ñà‚ñå                                          | 25/449 [04:49<1:04:36,  9.14s/it]

[Thread-139673477314112] ‚úÖ Line 25 done in 15.92s | Matches: 0


üîç Processing:   6%|‚ñà‚ñà‚ñã                                            | 26/449 [04:50<47:46,  6.78s/it]

[Thread-139673895675456] ‚úÖ Line 26 done in 15.93s | Matches: 0


üîç Processing:   6%|‚ñà‚ñà‚ñã                                          | 27/449 [05:13<1:20:23, 11.43s/it]

[Thread-139673477314112] ‚úÖ Line 27 done in 23.76s | Matches: 0


üîç Processing:   6%|‚ñà‚ñà‚ñä                                          | 28/449 [05:19<1:08:49,  9.81s/it]

[Thread-139673895675456] ‚úÖ Line 28 done in 28.90s | Matches: 0


üîç Processing:   6%|‚ñà‚ñà‚ñâ                                          | 29/449 [05:29<1:09:35,  9.94s/it]

[Thread-139673477314112] ‚úÖ Line 29 done in 16.09s | Matches: 0


üîç Processing:   7%|‚ñà‚ñà‚ñà                                          | 30/449 [05:37<1:05:35,  9.39s/it]

[Thread-139673895675456] ‚úÖ Line 30 done in 18.35s | Matches: 0


üîç Processing:   7%|‚ñà‚ñà‚ñà                                          | 31/449 [05:57<1:27:04, 12.50s/it]

[Thread-139673477314112] ‚úÖ Line 31 done in 27.95s | Matches: 0


üîç Processing:   7%|‚ñà‚ñà‚ñà‚ñè                                         | 32/449 [06:01<1:10:28, 10.14s/it]

[Thread-139673895675456] ‚úÖ Line 32 done in 24.44s | Matches: 0


üîç Processing:   7%|‚ñà‚ñà‚ñà‚ñé                                         | 33/449 [06:11<1:10:18, 10.14s/it]

[Thread-139673477314112] ‚úÖ Line 33 done in 14.71s | Matches: 0


üîç Processing:   8%|‚ñà‚ñà‚ñà‚ñå                                           | 34/449 [06:15<55:42,  8.05s/it]

[Thread-139673895675456] ‚úÖ Line 34 done in 13.30s | Matches: 0


üîç Processing:   8%|‚ñà‚ñà‚ñà‚ñã                                           | 35/449 [06:24<57:19,  8.31s/it]

[Thread-139673477314112] ‚úÖ Line 35 done in 12.06s | Matches: 0


üîç Processing:   8%|‚ñà‚ñà‚ñà‚ñä                                           | 36/449 [06:27<46:32,  6.76s/it]

[Thread-139673895675456] ‚úÖ Line 36 done in 12.04s | Matches: 0


üîç Processing:   8%|‚ñà‚ñà‚ñà‚ñä                                           | 37/449 [06:37<52:58,  7.72s/it]

[Thread-139673477314112] ‚úÖ Line 37 done in 13.09s | Matches: 0


üîç Processing:   8%|‚ñà‚ñà‚ñà‚ñâ                                           | 38/449 [06:40<44:20,  6.47s/it]

[Thread-139673895675456] ‚úÖ Line 38 done in 13.52s | Matches: 0


üîç Processing:   9%|‚ñà‚ñà‚ñà‚ñà                                           | 39/449 [06:54<59:31,  8.71s/it]

[Thread-139673477314112] ‚úÖ Line 39 done in 17.51s | Matches: 0


üîç Processing:   9%|‚ñà‚ñà‚ñà‚ñà                                         | 40/449 [07:21<1:37:09, 14.25s/it]

[Thread-139673895675456] ‚úÖ Line 40 done in 41.13s | Matches: 2


üîç Processing:   9%|‚ñà‚ñà‚ñà‚ñà                                         | 41/449 [07:34<1:32:56, 13.67s/it]

[Thread-139673477314112] ‚úÖ Line 41 done in 39.49s | Matches: 0


üîç Processing:   9%|‚ñà‚ñà‚ñà‚ñà‚ñè                                        | 42/449 [07:39<1:14:54, 11.04s/it]

[Thread-139673895675456] ‚úÖ Line 42 done in 17.22s | Matches: 0


üîç Processing:  10%|‚ñà‚ñà‚ñà‚ñà‚ñé                                        | 43/449 [08:19<2:15:18, 20.00s/it]

[Thread-139673477314112] ‚úÖ Line 43 done in 45.81s | Matches: 2


üîç Processing:  10%|‚ñà‚ñà‚ñà‚ñà‚ñç                                        | 44/449 [08:24<1:44:33, 15.49s/it]

[Thread-139673895675456] ‚úÖ Line 44 done in 45.86s | Matches: 0


üîç Processing:  10%|‚ñà‚ñà‚ñà‚ñà‚ñå                                        | 45/449 [08:40<1:43:52, 15.43s/it]

[Thread-139673477314112] ‚úÖ Line 45 done in 20.25s | Matches: 0


üîç Processing:  10%|‚ñà‚ñà‚ñà‚ñà‚ñå                                        | 46/449 [09:15<2:23:22, 21.35s/it]

[Thread-139673895675456] ‚úÖ Line 46 done in 50.44s | Matches: 2


üîç Processing:  10%|‚ñà‚ñà‚ñà‚ñà‚ñã                                        | 47/449 [09:19<1:48:53, 16.25s/it]

[Thread-139673477314112] ‚úÖ Line 47 done in 39.52s | Matches: 0


üîç Processing:  11%|‚ñà‚ñà‚ñà‚ñà‚ñä                                        | 48/449 [09:30<1:38:28, 14.73s/it]

[Thread-139673895675456] ‚úÖ Line 48 done in 15.56s | Matches: 0


üîç Processing:  11%|‚ñà‚ñà‚ñà‚ñà‚ñâ                                        | 49/449 [09:37<1:21:05, 12.16s/it]

[Thread-139673477314112] ‚úÖ Line 49 done in 17.35s | Matches: 0


üîç Processing:  11%|‚ñà‚ñà‚ñà‚ñà‚ñà                                        | 50/449 [09:45<1:13:14, 11.01s/it]

[Thread-139673895675456] ‚úÖ Line 50 done in 14.50s | Matches: 0


üîç Processing:  11%|‚ñà‚ñà‚ñà‚ñà‚ñà                                        | 51/449 [09:50<1:01:07,  9.22s/it]

[Thread-139673477314112] ‚úÖ Line 51 done in 13.35s | Matches: 0


üîç Processing:  12%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                       | 52/449 [10:02<1:06:04,  9.99s/it]

[Thread-139673895675456] ‚úÖ Line 52 done in 16.80s | Matches: 0


üîç Processing:  12%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                         | 53/449 [10:06<54:39,  8.28s/it]

[Thread-139673477314112] ‚úÖ Line 53 done in 16.09s | Matches: 0


üîç Processing:  12%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                                         | 54/449 [10:16<58:44,  8.92s/it]

[Thread-139673895675456] ‚úÖ Line 54 done in 14.72s | Matches: 0


üîç Processing:  12%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                         | 55/449 [10:25<57:14,  8.72s/it]

[Thread-139673477314112] ‚úÖ Line 55 done in 18.66s | Matches: 0


üîç Processing:  12%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                         | 56/449 [10:33<56:48,  8.67s/it]

[Thread-139673895675456] ‚úÖ Line 56 done in 16.80s | Matches: 0


üîç Processing:  13%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ                                         | 57/449 [10:37<47:10,  7.22s/it]

[Thread-139673477314112] ‚úÖ Line 57 done in 12.40s | Matches: 0


üîç Processing:  13%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                       | 58/449 [10:54<1:05:05,  9.99s/it]

[Thread-139673895675456] ‚úÖ Line 58 done in 20.28s | Matches: 0


üîç Processing:  13%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ                                       | 59/449 [11:02<1:01:53,  9.52s/it]

[Thread-139673477314112] ‚úÖ Line 59 done in 24.88s | Matches: 0


üîç Processing:  13%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                       | 60/449 [11:14<1:05:52, 10.16s/it]

[Thread-139673895675456] ‚úÖ Line 60 done in 20.09s | Matches: 0


üîç Processing:  14%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                        | 61/449 [11:20<58:41,  9.08s/it]

[Thread-139673477314112] ‚úÖ Line 61 done in 18.19s | Matches: 0


üîç Processing:  14%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                        | 62/449 [11:30<59:08,  9.17s/it]

[Thread-139673895675456] ‚úÖ Line 62 done in 15.93s | Matches: 0


üîç Processing:  14%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                        | 63/449 [11:34<49:28,  7.69s/it]

[Thread-139673477314112] ‚úÖ Line 63 done in 13.63s | Matches: 0


üîç Processing:  14%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                                        | 64/449 [11:47<59:54,  9.34s/it]

[Thread-139673895675456] ‚úÖ Line 64 done in 17.42s | Matches: 0


üîç Processing:  14%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                        | 65/449 [11:51<49:10,  7.68s/it]

[Thread-139673477314112] ‚úÖ Line 65 done in 17.00s | Matches: 0


üîç Processing:  15%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ                                        | 66/449 [11:59<49:40,  7.78s/it]

[Thread-139673895675456] ‚úÖ Line 66 done in 11.84s | Matches: 0


üîç Processing:  15%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                        | 67/449 [12:05<46:10,  7.25s/it]

[Thread-139673477314112] ‚úÖ Line 67 done in 14.03s | Matches: 0


üîç Processing:  15%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                        | 68/449 [12:15<51:41,  8.14s/it]

[Thread-139673895675456] ‚úÖ Line 68 done in 16.23s | Matches: 0


üîç Processing:  15%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                       | 69/449 [12:20<45:01,  7.11s/it]

[Thread-139673477314112] ‚úÖ Line 69 done in 14.92s | Matches: 0


üîç Processing:  16%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                                       | 70/449 [12:31<53:39,  8.50s/it]

[Thread-139673895675456] ‚úÖ Line 70 done in 16.43s | Matches: 0


üîç Processing:  16%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                       | 71/449 [12:35<44:40,  7.09s/it]

[Thread-139673477314112] ‚úÖ Line 71 done in 15.54s | Matches: 0


üîç Processing:  16%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                       | 72/449 [12:43<46:22,  7.38s/it]

[Thread-139673895675456] ‚úÖ Line 72 done in 11.87s | Matches: 0


üîç Processing:  16%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                                       | 73/449 [12:50<44:17,  7.07s/it]

[Thread-139673477314112] ‚úÖ Line 73 done in 14.39s | Matches: 0


üîç Processing:  16%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                                       | 74/449 [13:00<49:39,  7.94s/it]

[Thread-139673895675456] ‚úÖ Line 74 done in 16.32s | Matches: 0


üîç Processing:  17%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                       | 75/449 [13:04<42:33,  6.83s/it]

[Thread-139673477314112] ‚úÖ Line 75 done in 14.21s | Matches: 0


üîç Processing:  17%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ                                       | 76/449 [13:17<53:37,  8.63s/it]

[Thread-139673895675456] ‚úÖ Line 76 done in 17.05s | Matches: 0


üîç Processing:  17%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                       | 77/449 [13:23<49:03,  7.91s/it]

[Thread-139673477314112] ‚úÖ Line 77 done in 19.07s | Matches: 0


üîç Processing:  17%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                      | 78/449 [13:34<54:20,  8.79s/it]

[Thread-139673895675456] ‚úÖ Line 78 done in 17.08s | Matches: 0


üîç Processing:  18%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                                      | 79/449 [13:40<50:03,  8.12s/it]

[Thread-139673477314112] ‚úÖ Line 79 done in 17.39s | Matches: 0


üîç Processing:  18%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                                      | 80/449 [13:51<54:36,  8.88s/it]

[Thread-139673895675456] ‚úÖ Line 80 done in 17.21s | Matches: 0


üîç Processing:  18%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                      | 81/449 [13:55<45:56,  7.49s/it]

[Thread-139673477314112] ‚úÖ Line 81 done in 14.91s | Matches: 0


üîç Processing:  18%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                    | 82/449 [14:12<1:02:36, 10.24s/it]

[Thread-139673895675456] ‚úÖ Line 82 done in 20.90s | Matches: 0


üîç Processing:  18%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                                      | 83/449 [14:19<57:16,  9.39s/it]

[Thread-139673477314112] ‚úÖ Line 83 done in 24.05s | Matches: 0


üîç Processing:  19%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                    | 84/449 [14:30<1:00:03,  9.87s/it]

[Thread-139673895675456] ‚úÖ Line 84 done in 18.41s | Matches: 0


KeyboardInterrupt: 

In [3]:
# import shutil
# import os

# # Directory to delete
# EXTRACT_DIR = 'I-Soon-data'

# # Check if the directory exists and delete it
# if os.path.isdir(EXTRACT_DIR):
#     shutil.rmtree(EXTRACT_DIR)
#     print(f"Directory '{EXTRACT_DIR}' has been deleted.")
# else:
#     print(f"Directory '{EXTRACT_DIR}' does not exist.")