<a href="https://colab.research.google.com/github/calyaconsult/juypter-notebooks/blob/main/File_Archive_Clusters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the input file and the CSV output
# path used further down resolve inside this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the input file on the mounted Google Drive.
# The loader reads it line by line and treats every non-blank line
# (after stripping surrounding whitespace) as one filename.
file_path = '/content/drive/My Drive/Data/unique_filenames.txt'

In [None]:
import os
# Install rapidfuzz first if needed:
# !pip install rapidfuzz
from rapidfuzz import fuzz

# --- Configuration ---
SIMILARITY_THRESHOLD = 85   # minimum fuzz.ratio score for a filename to join a cluster; adjust as needed
MIN_CLUSTER_SIZE = 5        # clusters with fewer files than this are dropped (neither printed nor exported)
# Extensions to exclude before clustering; compared against the lowercased
# extension including its leading dot.
SKIP_EXTENSIONS = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'}
# Skip certain filename patterns
SKIP_PATTERNS = ["lorem-","ipsum"]  # any filename containing one of these substrings (case-sensitive) is ignored

In [None]:
# --- Helper function: Longest Common Substring ---
def longest_common_substring(a, b):
    """Return the longest contiguous run of characters found in both inputs.

    Uses the classic dynamic-programming recurrence, but keeps only the
    previous and current rows of the table instead of the full
    (len(a)+1) x (len(b)+1) matrix, so extra memory is proportional to
    len(b).

    Args:
        a: First sequence (normally a filename string).
        b: Second sequence.

    Returns:
        The slice of ``a`` holding the longest common substring. When several
        candidates share the maximum length, the one ending earliest in ``a``
        is returned. An empty slice of ``a`` is returned if nothing is shared
        or if either input is empty.
    """
    width = len(b) + 1
    prev_row = [0] * width          # match-run lengths ending at a[i-2]
    best_len, best_end = 0, 0       # best_end is an exclusive index into a
    for i, ch_a in enumerate(a, 1):
        curr_row = [0] * width
        for j, ch_b in enumerate(b, 1):
            if ch_a == ch_b:
                run = prev_row[j - 1] + 1
                curr_row[j] = run
                # Strict '>' keeps the first maximal run encountered.
                if run > best_len:
                    best_len, best_end = run, i
        prev_row = curr_row
    return a[best_end - best_len:best_end]

# --- Load filenames and apply filters ---
# Read with an explicit encoding so non-ASCII filenames decode the same way
# regardless of the runtime's locale (the CSV export also writes UTF-8).
with open(file_path, encoding="utf-8") as fh:
    # One filename per line; surrounding whitespace is stripped and blank
    # lines are dropped.
    filenames = [name for name in (line.strip() for line in fh) if name]

# Filter out unwanted extensions and patterns
filtered_filenames = [
    name for name in filenames
    # Extension check is case-insensitive ('.PDF' is skipped like '.pdf').
    if os.path.splitext(name)[1].lower() not in SKIP_EXTENSIONS
    # Pattern check is a plain, case-sensitive substring test.
    and not any(pat in name for pat in SKIP_PATTERNS)
]

# --- Group filenames by fuzzy similarity ---
# Greedy single pass: each not-yet-assigned filename becomes the seed of a new
# group and collects every later, still-unassigned filename whose fuzz.ratio
# against the seed reaches SIMILARITY_THRESHOLD. Members are compared with the
# seed only, so the result depends on input order. Names placed in a group
# that ends up smaller than MIN_CLUSTER_SIZE stay marked as used and are not
# reconsidered for later groups.
groups = []
used = set()

for i, f1 in enumerate(filtered_filenames):
    if f1 in used:
        continue
    used.add(f1)
    group = [f1]
    for f2 in filtered_filenames[i+1:]:
        if f2 in used:
            continue
        # score_cutoff makes rapidfuzz return 0 for pairs below the threshold,
        # letting it stop early instead of computing the full score.
        if fuzz.ratio(f1, f2, score_cutoff=SIMILARITY_THRESHOLD) >= SIMILARITY_THRESHOLD:
            group.append(f2)
            used.add(f2)
    if len(group) >= MIN_CLUSTER_SIZE:
        groups.append(group)

# --- Print clusters with longest common substring ---
for cluster_no, members in enumerate(groups, start=1):
    # Narrow the shared substring pairwise across the cluster. The result is
    # contained in every member, though pairwise narrowing is not guaranteed
    # to find the longest substring common to all of them.
    shared = members[0]
    for name in members[1:]:
        shared = longest_common_substring(shared, name)

    print(f"Cluster {cluster_no} (LCS: '{shared}') → {len(members)} files")
    for name in members:
        print("   ", name)
    print()


In [None]:
import csv

# Destination CSV on the mounted Drive; one row is written per cluster.
output_csv = "/content/drive/My Drive/Data/filename_clusters.csv"

with open(output_csv, "w", newline="", encoding="utf-8") as out_file:
    writer = csv.writer(out_file)
    # Header
    writer.writerow(["cluster_index", "lcs", "num_files", "file_list"])

    for cluster_no, members in enumerate(groups, start=1):
        # Same pairwise narrowing as used when printing the clusters.
        shared = members[0]
        for name in members[1:]:
            shared = longest_common_substring(shared, name)
        # All member filenames go into a single cell, separated by "; ".
        file_list = "; ".join(members)
        writer.writerow([cluster_no, shared, len(members), file_list])

print(f"Clusters exported to '{output_csv}'")


Clusters exported to '/content/drive/My Drive/Data/filename_clusters.csv'
