<a href="https://colab.research.google.com/github/biruk50/Medium_articles/blob/main/Clustering_%2B_DP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m3.0/4.2 MB[0m [31m86.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.2/4.2 MB[0m [31m87.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hdbscan
Successfully installed hdbscan-0.8.40


In [2]:
from collections import deque,namedtuple
from google.colab import files  # For file upload in Google Colab
from hdbscan import HDBSCAN
from typing import List, Dict
import numpy as np
import math

In [36]:
# Define the Item named tuple
Item = namedtuple("Item", ['index', 'value', 'weight', 'cluster_id'])

# Upload and read file
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

with open(file_name, 'r') as file:
    input_data = file.read().strip()

# Parse the input data
lines = input_data.split('\n')
first_line = lines[0].split()
item_count = int(first_line[0])
capacity = int(first_line[1])

print(f"capacity {capacity}")

items = []
for i, line in enumerate(lines[1:], start=1):
    value, weight = map(int, line.split())
    items.append(Item(i, value, weight, -1))  # Initialize with no cluster

# Clustering with HDBSCAN
min_cluster_size = int(math.log10(item_count)) + 1
hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=1)
X = np.array([(item.weight, item.value) for item in items])
cluster_labels = hdbscan_clusterer.fit_predict(X)

# Organize items into clusters and noise
cluster_dict: Dict[int, List[Item]] = {}
noise_items = []

for label, item in zip(cluster_labels, items):
    if label == -1:  # Noise point
        noise_items.append(item)
    else:
        if label not in cluster_dict:
            cluster_dict[label] = []
        cluster_dict[label].append(item._replace(cluster_id=label))

# Print cluster and noise information
print(f"\nClusters number: {len(cluster_dict)}")

print(f"\nNoise Items number: {len(noise_items)}")
#print([item.index for item in noise_items])

# Initialize knapsack values
final_value = 0
total_weight = 0
taken = [0] * item_count
remaining_capacity = capacity

top_half_noise_items = noise_items[:int(len(noise_items)//2)]

# Calculate value-to-weight ratio for noise items and clusters
noise_ratio = sum(item.value for item in noise_items) / sum(item.weight for item in top_half_noise_items) if top_half_noise_items else 0
cluster_ratio = sum(sum(item.value for item in cluster) for cluster in cluster_dict.values()) / sum(sum(item.weight for item in cluster) for cluster in cluster_dict.values()) if cluster_dict else 0

# Allocate capacity based on ratio comparison
noise_capacity =  int(remaining_capacity * ( noise_ratio / (noise_ratio + cluster_ratio)) )

print(f"noise_capacity {noise_capacity}")

# Step 2: Dynamic Programming for Noise Items
if top_half_noise_items:
    step_size = max(1, noise_capacity // (5 * len(top_half_noise_items)))
    noise_columns = noise_capacity // step_size + 1
    noise_dp_table = [[0] * (noise_columns + 1) for _ in range(len(top_half_noise_items) + 1)]

    for i in range(1, len(top_half_noise_items) + 1):
        for j in range(1, noise_columns + 1):
            item = top_half_noise_items[i - 1]
            column_capacity = j * step_size
            current = 0
            if item.weight <= column_capacity:
                previous_column_index = (column_capacity - item.weight) // step_size
                current = item.value + noise_dp_table[i - 1][previous_column_index]
            noise_dp_table[i][j] = max(noise_dp_table[i - 1][j], current)

    # Backtrack to Identify Selected Noise Items
    selected_noise_items = []
    remaining_noise_capacity = noise_capacity
    for i in range(len(top_half_noise_items), 0, -1):
        column_index = remaining_noise_capacity // step_size
        if noise_dp_table[i][column_index] != noise_dp_table[i - 1][column_index]:
            selected_noise_items.append(top_half_noise_items[i - 1])
            remaining_noise_capacity -= top_half_noise_items[i - 1].weight

    # Mark selected noise items
    for item in selected_noise_items:
        taken[item.index - 1] = 1
        final_value += item.value
        total_weight += item.weight
        remaining_capacity -= item.weight

print(f"remaining capacity after noise: {remaining_capacity} ")
print(f"min_cluster_size : { min_cluster_size }")
# Adjust cluster weights and initialize DP table
if cluster_dict and remaining_capacity > 0:
    adjusted_capacity = remaining_capacity // min_cluster_size
    print(f"adjusted_capacity {adjusted_capacity}")
    step_size = max(1, adjusted_capacity // (5 * len(cluster_dict)))
    print(f"step_size {step_size}")
    columns = adjusted_capacity // step_size +1
    print(f"columns {columns}")

    # Prepare cluster representatives using average value and weight
    cluster_representatives = []
    for cluster_id, cluster_items in cluster_dict.items():
    # Compute average value and weight of the cluster items
      avg_value = sum(item.value for item in cluster_items) // len(cluster_items)
      avg_weight = sum(item.weight for item in cluster_items) // len(cluster_items)
      cluster_representatives.append({
        "index": cluster_id,
        "value": avg_value,
        "weight": avg_weight,
        "cluster_id": cluster_id
      })

    # Initialize DP table
    dp_table = [[0] * (columns + 1) for _ in range(len(cluster_representatives) + 1)]
    # Fill DP table
    for i in range(1, len(cluster_representatives) + 1):
        for j in range(1, columns + 1):
            current_item = cluster_representatives[i - 1]
            column_capacity = j * step_size

            current = 0
            if current_item["weight"] <= column_capacity:
                previous_column_index = (column_capacity - current_item["weight"]) // step_size
                current = current_item["value"] + dp_table[i - 1][previous_column_index]
            dp_table[i][j] = max(dp_table[i - 1][j], current)


    # Print DP table (optional)
    print("\nDynamic Programming Table:")
    print (f"columns {len(dp_table[0])}")

    # Backtrack to Identify Selected Clusters
    selected_clusters = []
    remaining_cap = adjusted_capacity
    orginal_cap=adjusted_capacity

    for i in range(len(cluster_representatives), 0, -1):
        current_column_index = remaining_cap // step_size
        if orginal_cap==remaining_cap:
          current_column_index +=1

        if dp_table[i][current_column_index] != dp_table[i - 1][current_column_index]:
            selected_clusters.append(cluster_representatives[i - 1]["cluster_id"])
            remaining_cap -= cluster_representatives[i - 1]["weight"]

    print("Clusters number: {selected_clusters}")

selected_clusters.sort(key=lambda cid: sum(item.weight for item in cluster_dict[cid]))
# Round-robin selection of items from selected clusters
selected_cluster_items = {
    cluster_id: deque(sorted(cluster_dict[cluster_id], key=lambda item: item.weight))
    for cluster_id in selected_clusters
}

progress = True
while remaining_capacity > 0 and progress and any(selected_cluster_items.values()):
    progress = False
    for cluster_id, queue in selected_cluster_items.items():
        if queue:
            item = queue.popleft()
            if remaining_capacity >= item.weight:
                taken[item.index - 1] = 1
                final_value += item.value
                total_weight += item.weight
                remaining_capacity -= item.weight
                progress = True

# Output results
print("\nSelected items (0 = not taken, 1 = taken):")
print(taken)
print(f"Final knapsack value: {final_value}")
print(f"Total weight: {total_weight}")
print(f"Remaining capacity after selection: {remaining_capacity}")


Saving ks_10000_0 to ks_10000_0 (25)
capacity 1000000

Clusters number: 882

Noise Items number: 1496
noise_capacity 666738
remaining capacity after noise: 333568 
min_cluster_size : 5
adjusted_capacity 66713
step_size 15
columns 4448

Dynamic Programming Table:
columns 4449
Clusters number: {selected_clusters}

Selected items (0 = not taken, 1 = taken):
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [13]:
import heapq
class Node:
    def __init__(self, level, value, weight, bound, taken):
        self.level = level  # Current level in decision tree
        self.value = value  # Total value so far
        self.weight = weight  # Total weight so far
        self.bound = bound  # Upper bound of the value
        self.taken = taken  # Items taken so far

    def __lt__(self, other):
        return self.bound > other.bound  # Max-heap for priority queue


def calculate_bound(node, capacity, items):
    if node.weight >= capacity:
        return 0  # Exceeded capacity, bound is 0

    bound = node.value
    total_weight = node.weight
    level = node.level

    while level < len(items) and total_weight + items[level].weight <= capacity:
        total_weight += items[level].weight
        bound += items[level].value
        level += 1

    if level < len(items):
        bound += (capacity - total_weight) * (items[level].value / items[level].weight)  # Fractional value

    return bound


def branch_and_bound_knapsack(items, capacity):
    items = sorted(items, key=lambda x: x.value / x.weight, reverse=True)  # Sort by value-to-weight ratio
    pq = []  # Priority queue (max-heap)
    root = Node(level=-1, value=0, weight=0, bound=calculate_bound(Node(-1, 0, 0, 0, []), capacity, items), taken=[])
    heapq.heappush(pq, root)
    max_value = 0
    best_taken = []

    while pq:
        current = heapq.heappop(pq)

        if current.bound > max_value and current.level < len(items) - 1:
            next_level = current.level + 1

            # Branch where we take the item
            if current.weight + items[next_level].weight <= capacity:
                taken_with = current.taken + [1]
                node_with = Node(
                    level=next_level,
                    value=current.value + items[next_level].value,
                    weight=current.weight + items[next_level].weight,
                    bound=calculate_bound(Node(next_level, current.value + items[next_level].value,
                                               current.weight + items[next_level].weight, 0, []), capacity, items),
                    taken=taken_with,
                )
                if node_with.value > max_value:
                    max_value = node_with.value
                    best_taken = node_with.taken
                heapq.heappush(pq, node_with)

            # Branch where we don't take the item
            taken_without = current.taken + [0]
            node_without = Node(
                level=next_level,
                value=current.value,
                weight=current.weight,
                bound=calculate_bound(Node(next_level, current.value, current.weight, 0, []), capacity, items),
                taken=taken_without,
            )
            heapq.heappush(pq, node_without)

    return max_value, best_taken


In [5]:
# Define the Item named tuple
Item = namedtuple("Item", ['index', 'value', 'weight', 'cluster_id'])

# Upload and read file
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

with open(file_name, 'r') as file:
    input_data = file.read().strip()

# Parse the input data
lines = input_data.split('\n')
first_line = lines[0].split()
item_count = int(first_line[0])
capacity = int(first_line[1])

print(f"capacity {capacity}")

items = []
for i, line in enumerate(lines[1:], start=1):
    value, weight = map(int, line.split())
    items.append(Item(i, value, weight, -1))

# Branch and Bound
max_value, taken = branch_and_bound_knapsack(items, capacity)
print("Branch and Bound Solution:")
print("Max Value:", max_value)
print("Items Taken:", taken)

Saving ks_10000_0 to ks_10000_0
capacity 1000000
Branch and Bound Solution:
Max Value: 1099893
Items Taken: [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [66]:
# Define the Item named tuple
Item = namedtuple("Item", ['index', 'value', 'weight', 'cluster_id'])

# Upload and read file
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

with open(file_name, 'r') as file:
    input_data = file.read().strip()

# Parse the input data
lines = input_data.split('\n')
first_line = lines[0].split()
item_count = int(first_line[0])
capacity = int(first_line[1])

print(f"capacity {capacity}")

items = []
for i, line in enumerate(lines[1:], start=1):
    value, weight = map(int, line.split())
    items.append(Item(i, value, weight, -1))  # Initialize with no cluster

# Clustering with HDBSCAN
min_cluster_size = int(math.log10(item_count)) + 1
hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=1)
X = np.array([(item.weight, item.value) for item in items])
cluster_labels = hdbscan_clusterer.fit_predict(X)

# Organize items into clusters and noise
cluster_dict: Dict[int, List[Item]] = {}
noise_items = []

for label, item in zip(cluster_labels, items):
    if label == -1:  # Noise point
        noise_items.append(item)
    else:
        if label not in cluster_dict:
            cluster_dict[label] = []
        cluster_dict[label].append(item._replace(cluster_id=label))

# Print cluster and noise information
print(f"\nClusters number: {len(cluster_dict)}")

print(f"\nNoise Items number: {len(noise_items)}")
#print([item.index for item in noise_items])

# Initialize knapsack values
final_value = 0
total_weight = 0
taken = [0] * item_count
remaining_capacity = capacity


noise_items.sort(key=lambda x: x.value / (x.weight), reverse=True)
cut_off_index = int(math.log2(len(noise_items)) )
print(f"cut_off_index { cut_off_index }")

selected_noise_items = noise_items[:cut_off_index]
for item in selected_noise_items:
    if remaining_capacity >= item.weight:
          taken[item.index - 1] = 1
          final_value += item.value
          total_weight += item.weight
          remaining_capacity -= item.weight


print(f"remaining capacity after greedy: {remaining_capacity} ")
print(f"min_cluster_size : {min_cluster_size}")
# Adjust cluster weights and initialize DP table
if cluster_dict:
    adjusted_capacity = remaining_capacity // min_cluster_size
    print(f"adjusted_capacity {adjusted_capacity}")
    step_size = max(1, adjusted_capacity // (4 * len(cluster_dict)))
    print(f"step_size {step_size}")
    columns = adjusted_capacity // step_size +1
    print(f"columns {columns}")

    # Prepare cluster representatives using average value and weight
    cluster_representatives = []
    for cluster_id, cluster_items in cluster_dict.items():
    # Compute average value and weight of the cluster items
      avg_value = sum(item.value for item in cluster_items) // len(cluster_items)
      avg_weight = sum(item.weight for item in cluster_items) // len(cluster_items)
      cluster_representatives.append({
        "index": cluster_id,
        "value": avg_value,
        "weight": avg_weight,
        "cluster_id": cluster_id
      })

    # Initialize DP table
    dp_table = [[0] * (columns + 1) for _ in range(len(cluster_representatives) + 1)]
    # Fill DP table
    for i in range(1, len(cluster_representatives) + 1):
        for j in range(1, columns + 1):
            current_item = cluster_representatives[i - 1]
            column_capacity = j * step_size

            current = 0
            if current_item["weight"] <= column_capacity:
                previous_column_index = (column_capacity - current_item["weight"]) // step_size
                current = current_item["value"] + dp_table[i - 1][previous_column_index]
            dp_table[i][j] = max(dp_table[i - 1][j], current)


    # Print DP table (optional)
    print("\nDynamic Programming Table:")
    print (f"columns {len(dp_table[0])}")

    # Backtrack to Identify Selected Clusters
    selected_clusters = []
    remaining_cap = adjusted_capacity
    orginal_cap=adjusted_capacity

    for i in range(len(cluster_representatives), 0, -1):
        current_column_index = remaining_cap // step_size
        if orginal_cap==remaining_cap:
          current_column_index +=1

        if dp_table[i][current_column_index] != dp_table[i - 1][current_column_index]:
            selected_clusters.append(cluster_representatives[i - 1]["cluster_id"])
            remaining_cap -= cluster_representatives[i - 1]["weight"]

    print("Clusters number: {selected_clusters}")

selected_clusters.sort(key=lambda cid: sum(item.weight for item in cluster_dict[cid]))
# Round-robin selection of items from selected clusters
selected_cluster_items = {
    cluster_id: deque(sorted(cluster_dict[cluster_id], key=lambda item: item.weight))
    for cluster_id in selected_clusters
}

progress = True
while remaining_capacity > 0 and progress and any(selected_cluster_items.values()):
    progress = False
    for cluster_id, queue in selected_cluster_items.items():
        if queue:
            item = queue.popleft()
            if remaining_capacity >= item.weight:
                taken[item.index - 1] = 1
                final_value += item.value
                total_weight += item.weight
                remaining_capacity -= item.weight
                progress = True

# Output results
print("\nSelected items (0 = not taken, 1 = taken):")
print(taken)
print(f"Final knapsack value: {final_value}")
print(f"Total weight: {total_weight}")
print(f"Remaining capacity after selection: {remaining_capacity}")

Saving ks_10000_0 to ks_10000_0 (42)
capacity 1000000

Clusters number: 882

Noise Items number: 1496
cut_off_index 10
remaining capacity after greedy: 74039 
min_cluster_size : 5
adjusted_capacity 14807
step_size 4
columns 3702

Dynamic Programming Table:
columns 3703
Clusters number: {selected_clusters}

Selected items (0 = not taken, 1 = taken):
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [64]:
# Define the Item named tuple
Item = namedtuple("Item", ['index', 'value', 'weight', 'cluster_id'])

# Upload and read file
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

with open(file_name, 'r') as file:
    input_data = file.read().strip()

# Parse the input data
lines = input_data.split('\n')
first_line = lines[0].split()
item_count = int(first_line[0])
capacity = int(first_line[1])

print(f"capacity {capacity}")

items = []
for i, line in enumerate(lines[1:], start=1):
    value, weight = map(int, line.split())
    items.append(Item(i, value, weight, -1))  # Initialize with no cluster

# Clustering with HDBSCAN
min_cluster_size = int(math.log10(item_count)) + 1
hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=1)
X = np.array([(item.weight, item.value) for item in items])
cluster_labels = hdbscan_clusterer.fit_predict(X)

# Organize items into clusters and noise
cluster_dict: Dict[int, List[Item]] = {}
noise_items = []

for label, item in zip(cluster_labels, items):
    if label == -1:  # Noise point
        noise_items.append(item)
    else:
        if label not in cluster_dict:
            cluster_dict[label] = []
        cluster_dict[label].append(item._replace(cluster_id=label))

# Print cluster and noise information
print(f"\nClusters number: {len(cluster_dict)}")

print(f"\nNoise Items number: {len(noise_items)}")
#print([item.index for item in noise_items])

# Initialize knapsack values
final_value = 0
total_weight = 0
taken = [0] * item_count
remaining_capacity = capacity


noise_items.sort(key=lambda x: x.value / (x.weight), reverse=True)
top_noise_half = noise_items[:int(len(noise_items)* 0.1)]

print(f"min_cluster_size : {min_cluster_size}")
# Adjust cluster weights and initialize DP table
if cluster_dict:
    adjusted_capacity = capacity
    print(f"adjusted_capacity {adjusted_capacity}")
    step_size = max(1, adjusted_capacity // (4 * len(cluster_dict)))
    print(f"step_size {step_size}")
    columns = adjusted_capacity // step_size +1
    print(f"columns {columns}")

    # Prepare cluster representatives using average value and weight
    cluster_representatives = []
    for cluster_id, cluster_items in cluster_dict.items():
    # Compute average value and weight of the cluster items
      avg_value = sum(item.value for item in cluster_items) // len(cluster_items)
      avg_weight = sum(item.weight for item in cluster_items) // len(cluster_items)
      cluster_representatives.append({
        "index": cluster_id,
        "value": avg_value,
        "weight": avg_weight,
        "cluster_id": cluster_id
      })

      # Treat each noise item as its own cluster
    start_num=len(cluster_representatives) +1
    noise_cluster_dict = {}  # Create a separate dictionary for noise items

    for noise_item in top_noise_half:
      cluster_representatives.append({
        "index": noise_item.index,
        "value": noise_item.value,
        "weight": noise_item.weight,
        "cluster_id": start_num  # Assign a special cluster ID for individual noise items
        })
      noise_cluster_dict[start_num] = [noise_item]
      start_num+=1

    # Initialize DP table
    dp_table = [[0] * (columns + 1) for _ in range(len(cluster_representatives) + 1)]
    # Fill DP table
    for i in range(1, len(cluster_representatives) + 1):
        for j in range(1, columns + 1):
            current_item = cluster_representatives[i - 1]
            column_capacity = j * step_size

            current = 0
            if current_item["weight"] <= column_capacity:
                previous_column_index = (column_capacity - current_item["weight"]) // step_size
                current = current_item["value"] + dp_table[i - 1][previous_column_index]
            dp_table[i][j] = max(dp_table[i - 1][j], current)


    # Print DP table (optional)
    print("\nDynamic Programming Table:")
    print (f"rows {len(dp_table)}")
    print (f"columns {len(dp_table[0])}")

    # Backtrack to Identify Selected Clusters
    selected_clusters = []
    remaining_cap = adjusted_capacity
    orginal_cap=adjusted_capacity

    for i in range(len(cluster_representatives), 0, -1):
        current_column_index = remaining_cap // step_size
        if orginal_cap==remaining_cap:
          current_column_index +=1

        if dp_table[i][current_column_index] != dp_table[i - 1][current_column_index]:
            selected_clusters.append(cluster_representatives[i - 1]["cluster_id"])
            remaining_cap -= cluster_representatives[i - 1]["weight"]

    print("Clusters number: {selected_clusters}")

# Fix sorting and round-robin logic to handle both cluster_dict and noise_cluster_dict
selected_clusters.sort(
    key=lambda cid: sum(item.weight for item in (cluster_dict.get(cid, []) or noise_cluster_dict.get(cid, [])))
)

# Prepare selected_cluster_items from both cluster_dict and noise_cluster_dict
selected_cluster_items = {
    cluster_id: deque(sorted(
        cluster_dict.get(cluster_id, noise_cluster_dict.get(cluster_id, [])),
        key=lambda item: item.weight
    ))
    for cluster_id in selected_clusters
}

progress = True
while remaining_capacity > 0 and progress and any(selected_cluster_items.values()):
    progress = False
    for cluster_id, queue in selected_cluster_items.items():
        if queue:
            item = queue.popleft()
            if remaining_capacity >= item.weight:
                taken[item.index - 1] = 1
                final_value += item.value
                total_weight += item.weight
                remaining_capacity -= item.weight
                progress = True

# Output results
print("\nSelected items (0 = not taken, 1 = taken):")
print(taken)
print(f"Final knapsack value: {final_value}")
print(f"Total weight: {total_weight}")
print(f"Remaining capacity after selection: {remaining_capacity}")

Saving ks_19_0 to ks_19_0 (3)
capacity 31181

Clusters number: 7

Noise Items number: 1
min_cluster_size : 2
adjusted_capacity 31181
step_size 1113
columns 29

Dynamic Programming Table:
rows 8
columns 30
Clusters number: {selected_clusters}

Selected items (0 = not taken, 1 = taken):
[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]
Final knapsack value: 11798
Total weight: 31096
Remaining capacity after selection: 85
