In [4]:
import zipfile
import shutil
import os

In [5]:
def extract_zip(zip_path):
    """Extract a ZIP archive into a sibling folder named after the archive.

    The output folder sits next to the archive and carries the archive's file
    name without its extension (``data/0.zip`` -> ``data/0``). If the archive
    has no extension, that name would be the archive's own path, so
    ``_extracted`` is appended instead.

    Args:
        zip_path: Path to the ZIP archive.

    Returns:
        Path of the directory the archive was extracted into.

    Raises:
        ValueError: If ``zip_path`` is not a valid ZIP archive.
    """
    if not zipfile.is_zipfile(zip_path):
        raise ValueError(f"The file at {zip_path} is not a valid ZIP archive.")

    # Determine the output directory name from the zip file name
    base_dir = os.path.dirname(zip_path)
    folder_name = os.path.splitext(os.path.basename(zip_path))[0]
    extract_to = os.path.join(base_dir, folder_name)

    # An extension-less archive would otherwise be used as its own output
    # directory, which makes extraction fail because the path is a file.
    if os.path.abspath(extract_to) == os.path.abspath(zip_path):
        extract_to += "_extracted"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(extract_to, exist_ok=True)

    # Extract the ZIP file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(path=extract_to)

    return extract_to

# Example usage: extract the archive '0.zip' (path is relative to the working
# directory) and print the folder it was unpacked into. The names assigned here
# are module-level, so they stay available to the rest of the notebook.
if __name__ == "__main__":
    zip_file_path = '0.zip'  # Replace with your actual ZIP file path
    extracted_dir = extract_zip(zip_file_path)
    print(f"Extracted to: {extracted_dir}")

Extracted to: 0


In [6]:
def unique_destination(folder, filename):
    """Return a path inside ``folder`` for ``filename`` that does not exist yet.

    ``folder/filename`` is returned unchanged when it is free; otherwise a
    numeric suffix is inserted before the extension (``name_1.ext``,
    ``name_2.ext``, ...) until an unused path is found.
    """
    candidate = os.path.join(folder, filename)
    stem, extension = os.path.splitext(filename)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{counter}{extension}")
        counter += 1
    return candidate


def organize_by_extension(parent_directory):
    """Move every visible file under ``parent_directory`` into ``parent_directory/<ext>/``.

    The subfolder is the lower-cased extension without the dot, or
    ``no_extension`` for files that have none. Hidden files (leading dot) and
    anything below a path containing ``__MACOSX`` are left alone. Files that
    already sit at their destination are not touched, so re-running is safe.

    When two files from different folders share a name, the later one gets a
    numeric suffix instead of silently overwriting the earlier one.

    Returns:
        Number of files that were moved.
    """
    moved = 0
    for root, dirs, files in os.walk(parent_directory):
        # Skip __MACOSX if any reappear
        if '__MACOSX' in root:
            continue

        for file in files:
            # Skip hidden files
            if file.startswith('.'):
                continue

            # Get the file extension (in lowercase, without the dot)
            file_extension = os.path.splitext(file)[1].lower().lstrip('.')
            if not file_extension:
                file_extension = "no_extension"

            # Create the per-extension subfolder if it doesn't exist
            subfolder_path = os.path.join(parent_directory, file_extension)
            os.makedirs(subfolder_path, exist_ok=True)

            source_path = os.path.join(root, file)
            destination_path = os.path.join(subfolder_path, file)

            # Already in place: nothing to do
            if os.path.abspath(source_path) == os.path.abspath(destination_path):
                continue

            # shutil.move would overwrite an existing file of the same name,
            # losing data; pick a free name instead.
            if os.path.exists(destination_path):
                destination_path = unique_destination(subfolder_path, file)

            shutil.move(source_path, destination_path)
            moved += 1
    return moved


def remove_empty_folders(parent_directory):
    """Remove empty folders under (and including) ``parent_directory``, bottom-up.

    Emptiness is checked on disk at visit time. The ``dirnames`` list produced
    by ``os.walk(..., topdown=False)`` is built before the children are
    visited, so it still lists subfolders that were just removed and cannot be
    used to decide whether a parent has become empty.

    Returns:
        List of the folder paths that were removed.
    """
    removed = []
    for dirpath, _dirnames, _filenames in os.walk(parent_directory, topdown=False):
        try:
            if os.listdir(dirpath):
                continue
            os.rmdir(dirpath)
        except OSError as exc:
            # Best effort: report and carry on rather than abort the cleanup
            print(f"Could not remove folder {dirpath}: {exc}")
            continue
        print(f"Removed empty folder: {dirpath}")
        removed.append(dirpath)
    return removed


# Parent directory
parent_directory = "0"

# Full path to the __MACOSX folder
macosx_folder = os.path.join(parent_directory, "__MACOSX")

# Remove __MACOSX if it exists: macOS creates it automatically, it isn't needed,
# and it causes issues when analyzing the data.
if os.path.isdir(macosx_folder):
    shutil.rmtree(macosx_folder)
    print(f"Deleted: {macosx_folder}")
else:
    print(f"Folder not found: {macosx_folder}")

# Organize files by extension into subfolders
# (results are assigned so the cell does not display a return value)
moved_file_count = organize_by_extension(parent_directory)

# Remove any empty folders within the parent directory
removed_folders = remove_empty_folders(parent_directory)

Deleted: 0/__MACOSX


In [7]:
# Directory to delete
EXTRACT_DIR = '0/0'

# Report when there is nothing to remove; otherwise delete the whole tree
if not os.path.isdir(EXTRACT_DIR):
    print(f"Directory '{EXTRACT_DIR}' does not exist.")
else:
    shutil.rmtree(EXTRACT_DIR)
    print(f"Directory '{EXTRACT_DIR}' has been deleted.")

Directory '0/0' has been deleted.


In [10]:
import os
import pandas as pd

# DataFrame with all file locations

In [42]:
def get_folder_file_dataframe(root_dir):
    """Build a DataFrame with one column per immediate subfolder of ``root_dir``.

    Each column is named after a subfolder and lists the paths
    (``root_dir/subfolder/file``) of the non-hidden regular files directly
    inside it. Columns are padded with NaN to the length of the longest one.
    Folder and file order follow ``os.listdir`` and are therefore not sorted.

    Args:
        root_dir: Directory whose immediate subfolders are indexed.

    Returns:
        pandas.DataFrame of file paths; an empty DataFrame when ``root_dir``
        contains no subfolders.
    """
    folders = [
        entry
        for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    ]

    series_list = []
    for folder in folders:
        folder_path = os.path.join(root_dir, folder)
        files = [
            os.path.join(folder_path, file)
            for file in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, file)) and not file.startswith('.')
        ]
        series_list.append(pd.Series(files, name=folder))

    # pd.concat raises ValueError on an empty list, so handle "no subfolders" here
    if not series_list:
        return pd.DataFrame()

    return pd.concat(series_list, axis=1)

# Example usage: index the per-extension folders under "0".
# `df` is a module-level name; the classification cell further down reads df['md'].
root_directory = "0"
df = get_folder_file_dataframe(root_directory)
df.head(30)        # Show up to the first 30 rows


Unnamed: 0,md,png,log,txt
0,0/md/dbc9c90e-a3e6-4d71-bb93-5fb8394095ac.md,0/png/64bba692-d430-440c-9f1e-2575f45770af_6.png,0/log/77010155050.log,0/txt/IDNET.txt
1,0/md/28.md,0/png/12756724-394c-4576-b373-7c53f1abbd94_0.png,0/log/77753527617.log,0/txt/IDTV.txt
2,0/md/5.md,0/png/f179eb06-0c53-44df-a13f-570be23355bb_1.png,0/log/tele2-lbs.log,0/txt/beeline-77774042222.txt
3,0/md/38.md,0/png/5a6b122c-39c1-4581-8c1f-2d6f36a9f8a0_24.png,0/log/tele2-cdr.log,0/txt/beeline-77051056626.txt
4,0/md/9d7bc879-3250-4013-ac04-5ff9bd6dff40.md,0/png/5a6b122c-39c1-4581-8c1f-2d6f36a9f8a0_30.png,0/log/tele2-crm.log,0/txt/beeline-crm.txt
5,0/md/18.md,0/png/5a6b122c-39c1-4581-8c1f-2d6f36a9f8a0_18.png,0/log/77783030133.log,0/txt/UBSCRIBER.txt
6,0/md/9fd06037-11f1-4ad5-9a7d-cbfb3fa4193b.md,0/png/0-adaf869e-920a-4a17-91bd-e2ef3125c10e.png,,0/txt/beeline-cdr.txt
7,0/md/3348953d-66e9-4cac-8675-65bb5f2ef929.md,0/png/5387a301-0af8-4e24-a197-20189f87b9ef_8.png,,0/txt/CRM.txt
8,0/md/1.md,0/png/0-32eb7662-f212-4811-a7c1-1cfeb121cd99.png,,0/txt/LAC.txt
9,0/md/19.md,0/png/912204cb-8ab7-48b8-9abf-d803f3804d08_11.png,,0/txt/beeline-lbs.txt


# Script to check for queryable files

In [46]:
import os
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# ✅ Initialize Ollama
# Creates the LangChain Ollama client for the "qwen2.5:14b" model tag.
# NOTE(review): presumably needs a running Ollama server with this model available — confirm.
llm = Ollama(model="qwen2.5:14b")  # Ensure Ollama is running with OLLAMA_NUM_PARALLEL=8

# ✅ Preview first 20 lines
def preview_file(file_path, max_lines=20):
    """Return up to ``max_lines`` leading lines of a UTF-8 text file.

    Each line is stripped of surrounding whitespace and the lines are joined
    with newlines. Files shorter than ``max_lines`` yield all of their lines.

    Args:
        file_path: Path of the file to preview.
        max_lines: Maximum number of lines to read (default 20).

    Returns:
        The preview text, or the fixed message
        "Could not preview (binary, missing, or unreadable)." when the file
        cannot be opened or decoded.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # zip stops at whichever runs out first, so a short file simply
            # yields fewer lines instead of raising StopIteration.
            lines = [line.strip() for _, line in zip(range(max_lines), f)]
    except (OSError, ValueError, TypeError):
        # OSError: missing/unreadable; ValueError: includes UnicodeDecodeError
        # for binary content; TypeError: path that is not path-like.
        return "Could not preview (binary, missing, or unreadable)."
    return "\n".join(lines)

# ✅ Prompt template
# Asks the model for a YES/NO verdict on whether a file is queryable, given its
# name, extension and a content preview. The three placeholders must match the
# keys passed to the chain's run() call.
# NOTE(review): the text names "Mistral or GPT-4" while the configured model is
# qwen2.5:14b, and it hard-codes "first 20 lines" although the preview length is
# a parameter of preview_file — confirm both are intended.
prompt_template = PromptTemplate(
    input_variables=["filename", "extension", "content"],
    template="""
You are evaluating whether a file is suitable for querying with a large language model (LLM) like Mistral or GPT-4.

Filename: {filename}
Extension: {extension}
Content Preview (first 20 lines):
---
{content}
---

Should this file be considered queryable by an LLM?
Reply with "YES" or "NO". REMEMBER, sometimes there might be smaller conversations and these conversations might have references to other documents. 
In that case look at the overall structure, and then decide. If the file contains fully unqueryable elements then conclude.
"""
)

# ✅ LLMChain
# Binds the prompt template to the Ollama model. The classifier calls chain.run()
# with the template's three variables (filename, extension, content).
chain = LLMChain(llm=llm, prompt=prompt_template)

# ✅ Classifier
def classify_file_queryability(file_path):
    """Ask the LLM chain whether the file at ``file_path`` is queryable.

    Args:
        file_path: Path of the file to classify. Values that are not strings
            (e.g. NaN padding from a DataFrame column) or that do not exist
            on disk are skipped.

    Returns:
        A ``(file_path, label)`` tuple where ``label`` is one of:
        ``"SKIPPED"`` (invalid or missing path), ``"YES"`` / ``"NO"`` (model
        verdict), or ``"ERROR – <message>"`` when the chain call fails.
    """
    import re  # local import keeps this cell self-contained

    if not isinstance(file_path, str) or not os.path.exists(file_path):
        return (file_path, "SKIPPED")

    filename = os.path.basename(file_path)
    extension = os.path.splitext(filename)[1]
    content = preview_file(file_path)

    try:
        result = chain.run({
            "filename": filename,
            "extension": extension,
            "content": content
        })
        # Take the first standalone yes/no word as the verdict. A plain
        # substring test would read "eyes" or "yesterday" in a NO reply as YES.
        verdict = re.search(r"\b(yes|no)\b", result, flags=re.IGNORECASE)
        decision = "YES" if verdict and verdict.group(1).lower() == "yes" else "NO"
        return (file_path, decision)
    except Exception as e:
        return (file_path, f"ERROR – {str(e)}")

# ✅ Prepare inputs
# Only real paths: dropna() removes the NaN padding of the shorter columns
md_paths = df['md'].dropna().tolist()
total_paths = len(md_paths)

# ✅ Multithreading with timing
# Results are collected in completion order, not in the order of md_paths.
start_time = time.time()
results = []

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(classify_file_queryability, path): path for path in md_paths}

    for completed, future in enumerate(as_completed(futures), start=1):
        file_path = futures[future]
        try:
            result = future.result()
        except Exception as e:
            # Bind the error tuple to `result` so the progress line below
            # reports this file rather than a stale value from the previous one.
            result = (file_path, f"ERROR – {str(e)}")
        results.append(result)

        # ETA = average time per finished file × files still outstanding
        elapsed = time.time() - start_time
        avg_time = elapsed / completed
        eta = avg_time * (total_paths - completed)
        print(f"[{completed}/{total_paths}] {os.path.basename(file_path)} → {result[1]} | ⏱️ ETA: {eta:.1f}s")

# ✅ Build result DataFrame
# `results` is a list of (path, label) tuples. Unpacking zip(*results) raises
# ValueError when nothing was classified, so fall back to two empty tuples.
paths, classifications = zip(*results) if results else ((), ())
columns = pd.MultiIndex.from_product([['md'], ['Path', 'Queryable']])
if results:
    md_result_df = pd.DataFrame(list(zip(paths, classifications)), columns=columns)
else:
    # Keep the same two-level column layout for the empty case
    md_result_df = pd.DataFrame(columns=columns)

# ✅ Show result
md_result_df.head()

[1/70] dbc9c90e-a3e6-4d71-bb93-5fb8394095ac.md → NO | ⏱️ ETA: 1364.0s
[2/70] 28.md → NO | ⏱️ ETA: 844.1s
[3/70] 3348953d-66e9-4cac-8675-65bb5f2ef929.md → NO | ⏱️ ETA: 574.4s
[4/70] 5.md → YES | ⏱️ ETA: 595.7s
[5/70] 9d7bc879-3250-4013-ac04-5ff9bd6dff40.md → YES | ⏱️ ETA: 522.6s
[6/70] 38.md → YES | ⏱️ ETA: 516.4s
[7/70] 9fd06037-11f1-4ad5-9a7d-cbfb3fa4193b.md → NO | ⏱️ ETA: 475.6s
[8/70] 18.md → YES | ⏱️ ETA: 436.7s
[9/70] 1.md → YES | ⏱️ ETA: 441.9s


KeyboardInterrupt: 