In [1]:
from concurrent.futures import ProcessPoolExecutor
import numpy as np  
from libKMCUDA import kmeans_cuda


def run_kmeans_on_device(data, clusters=16, seed=3, device=1):
    """Cluster ``data`` with ``kmeans_cuda`` and return only the centroids.

    Args:
        data: Samples, forwarded unchanged as the first positional argument
            of ``kmeans_cuda``.
        clusters: Number of clusters, forwarded as the second positional
            argument.
        seed: Forwarded as the ``seed`` keyword.
        device: Forwarded as the ``device`` keyword. Callers in this notebook
            pass power-of-two values, one per GPU.

    Returns:
        The first element of the pair returned by ``kmeans_cuda`` (the
        centroids). The second element (the assignments) is discarded.
    """
    kmeans_options = {"verbosity": 1, "seed": seed, "device": device}
    centroids, _assignments = kmeans_cuda(data, clusters, **kmeans_options)
    return centroids

# Device values are powers of two; 1, 2, 4, 8 correspond to GPUs 1, 2, 3, 4 respectively.
devices = [1 << gpu_index for gpu_index in range(4)]
# Alternative that pairs GPUs (1+2, 3+4, 5+6, 7+8):
# devices = [3, 12, 48, 192]
results = [None for _ in range(12)]  # one slot per clustering job

In [1]:
import timm
# Load the pretrained DeiT-III small model.
FLOAT_MODEL_NAME = 'deit3_small_patch16_224.fb_in1k'
float_model = timm.create_model(FLOAT_MODEL_NAME, pretrained=True)

# Resolve the model's data configuration and build the matching
# evaluation-time (is_training=False) transform from it.
data_config = timm.data.resolve_model_data_config(float_model)
val_transform = timm.data.create_transform(is_training=False, **data_config)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Collect the names of all timm models matching the "deit*" pattern.
deit_models = timm.list_models("deit*")

In [16]:
# Display the matching model names.
deit_models

['deit3_base_patch16_224',
 'deit3_base_patch16_384',
 'deit3_huge_patch14_224',
 'deit3_large_patch16_224',
 'deit3_large_patch16_384',
 'deit3_medium_patch16_224',
 'deit3_small_patch16_224',
 'deit3_small_patch16_384',
 'deit_base_distilled_patch16_224',
 'deit_base_distilled_patch16_384',
 'deit_base_patch16_224',
 'deit_base_patch16_384',
 'deit_small_distilled_patch16_224',
 'deit_small_patch16_224',
 'deit_tiny_distilled_patch16_224',
 'deit_tiny_patch16_224']

In [19]:
# Load the pretrained distilled DeiT-base model (weights are downloaded if not yet cached).
DISTILLED_MODEL_NAME = "deit_base_distilled_patch16_224"
model = timm.create_model(DISTILLED_MODEL_NAME, pretrained=True)

Downloading model.safetensors: 100%|██████████| 349M/349M [00:18<00:00, 18.5MB/s] 


In [17]:
from torchinfo import summary

In [None]:
# Summarise `model` for a (1, 3, 224, 224) input.
# NOTE(review): presumably one 3-channel 224x224 image — confirm against the torchinfo API.
summary(model, (1,3,224,224))

In [21]:
# Display the module tree of `model`.
model

VisionTransformerDistilled(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): 

In [2]:
# Display the evaluation-time (is_training=False) preprocessing pipeline.
val_transform

Compose(
    Resize(size=248, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)

In [3]:
# Build the training-time (is_training=True) pipeline from the same data
# configuration, then display it.
train_transform = timm.data.create_transform(is_training=True, **data_config)
train_transform

Compose(
    RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
    RandomHorizontalFlip(p=0.5)
    ColorJitter(brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=None)
    ToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)

In [12]:
# Display the `blocks` submodule of float_model.
float_model.blocks

Sequential(
  (0): Block(
    (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): Linear(in_features=384, out_features=1152, bias=True)
      (q_norm): Identity()
      (k_norm): Identity()
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): Linear(in_features=384, out_features=384, bias=True)
      (proj_drop): Dropout(p=0.0, inplace=False)
    )
    (ls1): LayerScale()
    (drop_path1): Identity()
    (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
    (mlp): Mlp(
      (fc1): Linear(in_features=384, out_features=1536, bias=True)
      (act): GELU(approximate='none')
      (drop1): Dropout(p=0.0, inplace=False)
      (norm): Identity()
      (fc2): Linear(in_features=1536, out_features=384, bias=True)
      (drop2): Dropout(p=0.0, inplace=False)
    )
    (ls2): LayerScale()
    (drop_path2): Identity()
  )
  (1): Block(
    (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
  

In [2]:
# Synthetic benchmark input: 12 independent (2000000, 32) float32 arrays of
# uniform samples in [0, 1).
# A seeded Generator makes the data reproducible across kernel restarts, and
# sampling float32 directly avoids the temporary float64 array that
# `np.random.rand(...).astype(np.float32)` allocates for every data set.
_data_rng = np.random.default_rng(0)
data_sets = [_data_rng.random((2000000, 32), dtype=np.float32) for _ in range(12)]

In [4]:

# Run all 12 clustering jobs at once: data sets 3*g .. 3*g+2 go to devices[g].
with ProcessPoolExecutor(max_workers=12) as pool:
    pending = [
        (
            gpu * 3 + slot,  # index of the data set
            pool.submit(run_kmeans_on_device, data_sets[gpu * 3 + slot], device=devices[gpu]),
        )
        for gpu in range(4)  # four GPUs
        for slot in range(3)  # three data sets per GPU
    ]

    # Store each job's centroids at the index of the data set it was computed from.
    for set_idx, job in pending:
        results[set_idx] = job.result()

reassignments threshold: 20000
reassignments threshold: 20000
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 220000
iteration 1: 2000000 reassignments
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 220000
reassignments threshold: 20000
iteration 1: 2000000 reassignments
transposing the samples...
performing kmeans++...
step 1reassignments threshold: 20000
done            
running Lloyd until reassignments drop below 220000
iteration 1: 2000000 reassignments
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 220000
iteration 1: 2000000 reassignments
reassignments threshold: 20000
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 220000
iteration 1: 2000000 reassignments
reassignments threshold: 20000
transposing the samples...
reassignmen

iteration 8: 41988 reassignments
iteration 7: 42301 reassignments
iteration 7: 45518 reassignments


In [2]:
def prepare_data(n_samples=20000000, n_features=32, seed=None):
    """Return an ``(n_samples, n_features)`` float32 array of uniform samples in [0, 1).

    A fresh ``numpy.random.Generator`` is created on every call instead of
    using the global ``np.random`` state. This function is run inside
    ``ProcessPoolExecutor`` workers; when workers are created by forking, each
    one inherits an identical copy of the parent's global random state, so the
    "random" data sets produced by different workers could be identical.

    Sampling float32 directly also avoids the float64 temporary (twice the size
    of the result) that ``np.random.rand(...).astype(np.float32)`` allocates.

    Args:
        n_samples: Number of rows. Defaults to the original 20000000.
        n_features: Number of columns. Defaults to the original 32.
        seed: Optional seed for reproducible output. ``None`` (the default)
            seeds the generator from fresh OS entropy on every call.

    Returns:
        numpy.ndarray of dtype float32 and shape ``(n_samples, n_features)``.
    """
    rng = np.random.default_rng(seed)
    return rng.random((n_samples, n_features), dtype=np.float32)
# Generate 12 synthetic data sets in parallel using up to 12 worker processes.
data_set2 = []
with ProcessPoolExecutor(max_workers=12) as pool:
    pending = [pool.submit(prepare_data) for _ in range(12)]  # one task per data set
    # Collect the arrays in submission order.
    data_set2.extend(job.result() for job in pending)

In [5]:
# Number of generated data sets.
len(data_set2)

12

In [None]:
# Cluster the 12 data sets in `data_set2` with four worker processes.
# Data sets 3*i .. 3*i+2 are still assigned to devices[i]. Tasks are submitted
# slot-major (first one task per device, then the second task per device, ...)
# because the pool starts queued tasks in submission order: with only four
# workers, a device-major order would start three jobs on devices[0] and one on
# devices[1] while the remaining devices sit idle.
# NOTE(review): the original comment spoke of two GPUs per process, which
# matches the commented-out paired masks [3, 12, 48, 192] rather than the
# single-GPU masks in `devices` — confirm which configuration is intended.
results2 = [None] * 12
with ProcessPoolExecutor(max_workers=4) as executor:
    futures = []
    for j in range(3):  # j-th data set of every device
        for i in range(4):  # one task per entry of `devices`
            idx = i * 3 + j  # index of the data set
            data = data_set2[idx]
            future = executor.submit(run_kmeans_on_device, data, device=devices[i])
            futures.append((idx, future))

    for idx, future in futures:
        results2[idx] = future.result()  # centroids for data set `idx`

In [1]:
from concurrent.futures import ProcessPoolExecutor
import numpy as np  
from libKMCUDA import kmeans_cuda
# from tqdm import tqdm

def run_kmeans_on_device(data, clusters=16, seed=3, device=1):
    """Cluster ``data`` with ``kmeans_cuda`` and return only the centroids.

    Args:
        data: Samples, forwarded unchanged as the first positional argument
            of ``kmeans_cuda``.
        clusters: Number of clusters, forwarded as the second positional
            argument.
        seed: Forwarded as the ``seed`` keyword.
        device: Forwarded as the ``device`` keyword. Callers in this notebook
            pass power-of-two values, one per GPU.

    Returns:
        The first element of the pair returned by ``kmeans_cuda`` (the
        centroids). The second element (the assignments) is discarded.
    """
    kmeans_options = {"verbosity": 1, "seed": seed, "device": device}
    centroids, _assignments = kmeans_cuda(data, clusters, **kmeans_options)
    return centroids

# Device values are powers of two; 1, 2, 4, 8 correspond to GPUs 1, 2, 3, 4 respectively.
devices = [1 << gpu_index for gpu_index in range(4)]
# Alternative that pairs GPUs (1+2, 3+4, 5+6, 7+8):
# devices = [3, 12, 48, 192]
results = [None for _ in range(12)]  # one slot per clustering job
# NOTE(review): absolute, user-specific path; the notebook only runs where this file exists.
data_set = np.load("/work/u1887834/tensor_storage/blocks.0.attn.qkv.npy")

In [3]:

# Cluster 12 column slices (sub-vectors) of `data_set` with four worker processes.
# Slices 3*i .. 3*i+2 are still assigned to devices[i]. Tasks are submitted
# slot-major (first one task per GPU, then the second task per GPU, ...)
# because the pool starts queued tasks in submission order: with only four
# workers, a GPU-major order would start three jobs on devices[0] and one on
# devices[1] while the other two GPUs sit idle.
subvec_len = 32  # number of columns per sub-vector
with ProcessPoolExecutor(max_workers=4) as executor:
    futures = []
    for j in range(3):  # j-th sub-vector of every GPU
        for i in range(4):  # four GPUs
            idx = i * 3 + j  # index of the sub-vector
            # Columns [idx * subvec_len, (idx + 1) * subvec_len) of every row.
            tmp_data = data_set[:, idx * subvec_len: (idx + 1) * subvec_len]
            future = executor.submit(run_kmeans_on_device, tmp_data, device=devices[i])
            futures.append((idx, future))

    for idx, future in futures:
        results[idx] = future.result()  # centroids for sub-vector `idx`

Processing codebooks: 100%|██████████| 3/3 [00:00<00:00, 30.37it/s]
Processing codebooks: 100%|██████████| 3/3 [00:00<00:00, 2620.35it/s]
Processing codebooks: 100%|██████████| 3/3 [00:00<00:00, 11105.84it/s]
Processing codebooks: 100%|██████████| 3/3 [00:00<00:00, 21183.35it/s]


reassignments threshold: 236400
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 2600400
iteration 1: 23640000 reassignments
reassignments threshold: 236400
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 2600400
iteration 1: 23640000 reassignments
reassignments threshold: 236400
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 2600400
iteration 1: 23640000 reassignments
reassignments threshold: 236400
transposing the samples...
performing kmeans++...
done            
running Lloyd until reassignments drop below 2600400
iteration 1: 23640000 reassignments
iteration 2: 5693650 reassignments
iteration 3: 2922650 reassignments
iteration 4: 1979003 reassignments
transposing the samples...
performing kmeans++...
done            
iteration 1: 16 reassignments
iteration 2: 0 reassignments
transposing the s

iteration 7: 799088 reassignments
