# **IMPORTS & PATHS** 

In [24]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil


# Paths and model settings\ nZIP_PATH = 'I-Soon-data.zip'      # Path to your downloaded zip file
ZIP_PATH = '0.zip'    # Directory to extract contents
EXTRACT_DIR = 'I-Soon-data'        # Directory to extract contents
ORGANIZED_DIR = 'organized_data'  # Directory to group files by extension
MODEL_NAME = 'llama3.2'             # Local Ollama model identifier
OUTPUT_JSON = 'parsed_md.json'    # Aggregated JSON output
OUTPUT_CSV = 'parsed_md.csv'      # CSV output for DataFrame

# Initialize Ollama LLM via LangChain
model = OllamaLLM(model=MODEL_NAME)

# **ZIP FILE EXTRACTION**

In [25]:
if not os.path.isdir(EXTRACT_DIR):
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")
else:
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")

Extracted archive to 'I-Soon-data'


# **DATA TYPE CATEGORIZATION**

In [27]:
# Define the parent directory
parent_directory = "I-Soon-data"

# Full path to the __MACOSX folder
macosx_folder = os.path.join(parent_directory, "__MACOSX")

# Check if __MACOSX exists and remove it
if os.path.exists(macosx_folder) and os.path.isdir(macosx_folder):
    shutil.rmtree(macosx_folder)
    print(f"Deleted: {macosx_folder}")
else:
    print(f"Folder not found: {macosx_folder}")

# Organize files by extension into subfolders
for root, dirs, files in os.walk(parent_directory):
    for file in files:
        # Skip hidden files and __MACOSX if any reappear
        if file.startswith('.') or '__MACOSX' in root:
            continue

        # Get the file extension (in lowercase, without the dot)
        file_extension = os.path.splitext(file)[1].lower().lstrip('.')
        if not file_extension:
            file_extension = "no_extension"

        # Define the new subfolder path
        subfolder_path = os.path.join(parent_directory, file_extension)

        # Create the subfolder if it doesn't exist
        os.makedirs(subfolder_path, exist_ok=True)

        # Define source and destination paths
        source_path = os.path.join(root, file)
        destination_path = os.path.join(subfolder_path, file)

        # Move the file if source and destination are not the same
        if os.path.abspath(source_path) != os.path.abspath(destination_path):
            shutil.move(source_path, destination_path)

# Remove any empty folders within the parent directory
for dirpath, dirnames, filenames in os.walk(parent_directory, topdown=False):
    if not dirnames and not filenames:
        try:
            os.rmdir(dirpath)
            print(f"Removed empty folder: {dirpath}")
        except OSError:
            pass  # Ignore errors (e.g., if directory is not empty due to permissions)

Folder not found: I-Soon-data/__MACOSX


In [None]:
import os
import glob
import concurrent.futures
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm  # For progress bar

# Initialize local LLM (e.g., qwen:14b via Ollama)
llm = Ollama(model="qwen:14b")

# Prompt template (adapted for short content)
prompt_template = PromptTemplate(
    input_variables=["content"],
    template="""
You are analyzing the content of a Markdown (.md) file.

Markdown content:
\"\"\"
{content}
\"\"\"

How would you classify the markdown file based on the above content? 
Return a single-word category like: chat, image, table, message, document, or text. Do not add explanations.

IMPORTANT: PLEASE PROVIDE YOUR CONDIFENCE LEVEL (HIGH, MEDIUM, LOW) IN PARENTHESES AFTER THE CLASSIFICATION.
"""
)

# Set up LangChain classification chain
chain = LLMChain(llm=llm, prompt=prompt_template)

# Collect .md file paths
md_dir = "I-Soon-data/md"
md_files = glob.glob(os.path.join(md_dir, "*.md"))

# Preprocess: Read only the first line
def preprocess_first_line(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            first_line = f.readline().strip()
        return os.path.basename(file_path), first_line
    except Exception as e:
        return os.path.basename(file_path), ""

# Use thread pool to preprocess
with concurrent.futures.ThreadPoolExecutor() as executor:
    file_data = list(executor.map(preprocess_first_line, md_files))

# Classify using LLM (sequential to prevent concurrency issues)
results = {}
for file_name, first_line in tqdm(file_data):
    if not first_line:
        results[file_name] = "error"
        continue
    classification = chain.run(content=first_line).strip().lower()
    results[file_name] = classification

# Output results
for fname, label in results.items():
    print(f"{fname}: {label}")

100%|██████████| 70/70 [03:22<00:00,  2.89s/it]

dbc9c90e-a3e6-4d71-bb93-5fb8394095ac.md: image
28.md: table
5.md: table
38.md: table
9d7bc879-3250-4013-ac04-5ff9bd6dff40.md: document
18.md: table
9fd06037-11f1-4ad5-9a7d-cbfb3fa4193b.md: image
3348953d-66e9-4cac-8675-65bb5f2ef929.md: image
1.md: table
19.md: table
07f179c5-5705-4dbd-94a7-66eed1e066b0.md: image
29.md: table
01cdc26f-e773-4ad7-8808-d04abf16aae7.md: image
585875ff-f8c5-4a02-acd7-fef37dc9ff11.md: image
4.md: table
39.md: table
64bba692-d430-440c-9f1e-2575f45770af.md: image
178e3898-903d-47cf-bfbe-061e7dc18895.md: image
16.md: table
22.md: table
32.md: table
9fe6b262-9944-417d-a0c4-9f2de1de2994.md: image
f7205881-3904-42ec-ab2c-04f36fa24785.md: image
54990932-71af-48dd-9a7a-2617b1407c54.md: image
26.md: table
12.md: table
36.md: table
eda5b003-9250-4913-b724-74cca86240af.md: image
27.md: table
13.md: table
37.md: table
5a6b122c-39c1-4581-8c1f-2d6f36a9f8a0.md: document
17.md: table
23.md: table
33.md: table
5387a301-0af8-4e24-a197-20189f87b9ef.md: image
48fd4c79-41ca-459e-


