# Load data

## Download data

In [1]:
!wget http://cam.yegor256.com/cam-2024-03-02.zip

--2025-02-24 15:29:49--  http://cam.yegor256.com/cam-2024-03-02.zip
16.182.105.181, 54.231.134.189, 52.216.209.237, ...
connected. to cam.yegor256.com (cam.yegor256.com)|16.182.105.181|:80... 
200 OKequest sent, awaiting response... 
Length: 2224871047 (2.1G) [application/zip]
Saving to: ‘cam-2024-03-02.zip’


2025-02-24 15:31:16 (24.6 MB/s) - ‘cam-2024-03-02.zip’ saved [2224871047/2224871047]



In [9]:
!unzip 'cam-2024-03-02.zip' data/all.csv

Archive:  cam-2024-03-02.zip
  inflating: data/all.csv            


## Prepare data

In [1]:
import pandas as pd

In [2]:
import os

csv_files = [os.path.basename(f)[:-4] for f in os.listdir('data/') if f.endswith('.csv')]
df = pd.read_csv('data/all.csv', on_bad_lines='skip', names=(['repo', 'class'] + list(range(49))), skiprows=1)

  df = pd.read_csv('data/all.csv', on_bad_lines='skip', names=(['repo', 'class'] + list(range(49))), skiprows=1)


In [3]:
from sklearn.impute import SimpleImputer
import numpy as np

imputer = SimpleImputer(strategy='mean')
imputed_df = imputer.fit_transform(df.replace('-', np.nan).drop(['repo', 'class'], axis = 1).astype("float32"))

  imputed_df = imputer.fit_transform(df.replace('-', np.nan).drop(['repo', 'class'], axis = 1).astype("float32"))


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], cmap='tab10', alpha=0.7)
plt.colorbar(scatter, label="Classes")
plt.title("t-SNE Visualization")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

# Dissimilarity

In [4]:
import tensorflow as tf
import gc
print(tf.config.list_physical_devices('GPU'))

def remove_unused_objects():
    tf.keras.backend.clear_session()
    gc.collect()

2025-02-24 20:08:04.820514: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-24 20:08:04.833994: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740427684.849132    4133 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740427684.853677    4133 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-24 20:08:04.868909: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [15]:
from abc import ABC, abstractmethod
import tensorflow as tf
import numpy as np

class DissimilarityFunc(ABC):

    @abstractmethod
    def calculate(tf1: np.ndarray, tf2: np.ndarray) -> float:
        pass


In [18]:
class KruskalDissimilarity(DissimilarityFunc):

    def __init__(self, batch_size: int = 3072):
        self.batch_size = batch_size

    def calculate(self, df1: np.ndarray, df2: np.ndarray) -> float:

        def distance(tf1: tf.Tensor, tf2: tf.Tensor):
            return np.sqrt(tf.reduce_sum(tf.square(tf1 - tf2)).numpy())
        
        def pairwise_function(pairs):
            return tf.reduce_sum(tf.abs(pairs[..., 0] - pairs[..., 1]))
    
        def combinations_distances(tensor, distance_func):
            n = tf.shape(tensor)[0]
    
            i_idx, j_idx = tf.meshgrid(tf.range(n), tf.range(n), indexing='ij')
            mask = i_idx < j_idx
            i_idx_upper = tf.boolean_mask(i_idx, mask)
            j_idx_upper = tf.boolean_mask(j_idx, mask)
    
            indices_upper = tf.stack([i_idx_upper, j_idx_upper], axis=-1)
            pairs = tf.gather(tensor, indices_upper)
            result = tf.vectorized_map(pairwise_function, pairs)
            del i_idx, j_idx, mask, i_idx_upper, j_idx_upper, indices_upper, pairs
            return result
    
        total_numerator = 0.0
        total_denominator = 0.0
    
        num_batches = (df1.shape[0] + self.batch_size - 1) // self.batch_size
    
        for batch_idx in range(num_batches):
            start_idx = batch_idx * self.batch_size
            end_idx = min((batch_idx + 1) * self.batch_size, df1.shape[0])
    
            df1_batch = tf.convert_to_tensor(df1[start_idx:end_idx], dtype=tf.float64)
            df2_batch = tf.convert_to_tensor(df2[start_idx:end_idx], dtype=tf.float64)
    
            df1_distances = combinations_distances(df1_batch, distance)
            df2_distances = combinations_distances(df2_batch, distance)
    
            batch_numerator = tf.reduce_sum(tf.square(df1_distances - df2_distances))
            batch_denominator = tf.reduce_sum(tf.square(df1_distances))
    
            total_numerator += batch_numerator
            total_denominator += batch_denominator
    
            del df1_batch, df2_batch, df1_distances, df2_distances, batch_numerator, batch_denominator
            tf.keras.backend.clear_session()
            gc.collect()
    
        dissimilarity = tf.sqrt(total_numerator) / tf.sqrt(total_denominator)
    
        return dissimilarity.numpy()

In [19]:
KruskalDissimilarity().calculate(
    np.array([[1, 2, 3, 4], [2, 4, 6, 8]]),
    np.array([[1, 2], [2, 55]])
)

np.float64(17.0)

# Optimization algorithm

In [23]:
from sko.PSO import PSO
from typing import Callable, Tuple
from pydantic import BaseModel
import time

class TrainingArguments(BaseModel):
  func: Callable[..., float]
  dim: int
  pop: int
  max_iter: int
  lower_bound: list[float]
  upper_bound: list[float]
  w: float | None = None
  c1: float | None = None
  c2: float | None = None

def trainingWithDim(
    func: Callable[[Tuple[int, ...], int], float],
    dim, target_dim, pop, max_iter, w = None, c1 = None, c2 = None
):
  return TrainingArguments(
      func=lambda x: func(x[:-1], x[-1]),
      dim=dim + 1,
      pop=pop,
      max_iter=max_iter,
      lower_bound = ([0] * dim) + [2],
      upper_bound = ([1] * dim) + [target_dim],
      w = w,
      c1 = c1,
      c2 = c2,
  )

# Probs = Probabilities
# returns function that can be supplied to training algorithm
def trainingFunction(
    df: np.ndarray,
    dissimilarityFunc: DissimilarityFunc
    ) -> float:
  def optimizationFunction(featuresProbs, dimNumberProbs):
    t1 = time.time()
    topProbColumnsIdxs = sorted(enumerate(featuresProbs), key=lambda x: x[1], reverse=True)
    idxs = list(sorted(list(map(lambda el: round(el[0]), topProbColumnsIdxs[:round(dimNumberProbs)]))))
    dissimilarity = dissimilarityFunc.calculate(df, df[:, idxs])
    t2 = time.time()
    print(f"Er: {dissimilarity}({round(dimNumberProbs)}) of {idxs}. Took {t2 - t1} to run optim func")
    return dissimilarity
  return optimizationFunction

In [24]:
def categoricalPso(trainingArguments: TrainingArguments):
  hyperparameters = dict()
  hyperparameters.update({"w": trainingArguments.w}) if trainingArguments.w is not None else ()
  hyperparameters.update({"c1": trainingArguments.c1}) if trainingArguments.c1 is not None else ()
  hyperparameters.update({"c2": trainingArguments.c2}) if trainingArguments.c2 is not None else ()
  return PSO(
      func=trainingArguments.func,
      n_dim=trainingArguments.dim,
      pop=trainingArguments.pop,
      max_iter=trainingArguments.max_iter,
      lb=trainingArguments.lower_bound,
      ub=trainingArguments.upper_bound,
      verbose=True,
      **hyperparameters
  )

In [None]:

catPsoWithDim = categoricalPso(
  trainingWithDim(
    func=trainingFunction(imputed_df[np.random.choice(imputed_df.shape[0], size=50000, replace=False)], KruskalDissimilarity()),
    dim=imputed_df.shape[1],
    target_dim=5,
    pop=100,
    max_iter=3
  )
)

In [13]:
import dill

with open('cat_pso_with_dim_init_50000.dill', 'wb') as f:
    dill.dump(catPsoWithDim, f)

In [42]:
catPsoWithDim.run()
print('best_x is ', catPsoWithDim.gbest_x, 'best_y is', catPsoWithDim.gbest_y)

Er: 0.9977779957360353(3) of [2, 3, 4]. Took 8.828739404678345 to run optim func
Er: 0.9938203166282895(2) of [2, 12]. Took 8.875041007995605 to run optim func
Er: 94588856.58225007(5) of [8, 9, 20, 31, 35]. Took 8.831591844558716 to run optim func
Er: 3.552939571225447(2) of [4, 5]. Took 8.864304304122925 to run optim func
Er: 3.4281690865381167(3) of [0, 4, 6]. Took 8.830193758010864 to run optim func
Er: 0.9927244001668996(5) of [1, 3, 11, 22, 29]. Took 8.7992103099823 to run optim func
Er: 0.03150536445301944(2) of [0, 2]. Took 8.806965827941895 to run optim func
Er: 94588889.8206531(5) of [9, 20, 22, 27, 37]. Took 8.997760772705078 to run optim func
Er: 0.028193402040551107(4) of [0, 3, 10, 20]. Took 8.928230047225952 to run optim func
Er: 0.028193402040551107(4) of [0, 3, 4, 7]. Took 8.908781290054321 to run optim func
Er: 3.442579533488147(2) of [4, 12]. Took 8.89612364768982 to run optim func
Er: 0.03150536445301944(2) of [0, 2]. Took 8.809745788574219 to run optim func
Er: 6.6

In [72]:
import dill

with open('cat_pso_with_dim_50000_after_3_iter.dill', 'wb') as f:
    dill.dump(catPsoWithDim, f)

In [32]:
metrics = "ACoCo AHF AoCiH BugNum CAMC CAMC-cvc CC CoCo CoCoMn CoCoMx DOER Final FOut Getters HSD HSE HSV IR IRC IRCA IRLoC LCOM5 LCOM5-cvc LoC MHF MIdx MMAC MMAC-cvc NAPC NCSS NHD NHD-cvc NoBL NoCA NoCC NoCL NoCM NoGA NoII NOM NoMP NOMPMx NOMR NoOA NoOM NOP NoSA NoSMP NOSMPMx NoTP NULLs PCN PVN PVNMN PVNMx RAF SAHF SCOM SCOM-cvc Setters SMHF".split(" ")

In [33]:
def map_metrics(probs, n):
    topProbColumnsIdxs = sorted(enumerate(probs), key=lambda x: x[1], reverse=True)
    idxs = list(sorted(list(map(lambda el: round(el[0]), topProbColumnsIdxs[:round(n)]))))
    return np.array(metrics)[idxs]

In [29]:
import dill
with open('cat_pso_with_dim_50000_after_3_iter.dill', 'rb') as f:
    catPsoWithDim = dill.load(f)

In [37]:
[f"{map_metrics(catPsoWithDim.pbest_x[i], 5)}, {x[0]}" for i, x in enumerate(catPsoWithDim.pbest_y) if x == 0]

["['ACoCo' 'AHF' 'CAMC' 'CoCoMn' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'AoCiH' 'BugNum' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'CAMC' 'Final' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'AoCiH' 'CAMC' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'CC' 'CoCoMx' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'CAMC' 'Final' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'AoCiH' 'BugNum' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'AoCiH' 'CoCo' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'CoCoMn' 'FOut' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'BugNum' 'CAMC-cvc' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'CAMC' 'CoCoMx' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'BugNum' 'CAMC-cvc' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'AoCiH' 'CAMC' 'NOSMPMx'], 0.0",
 "['ACoCo' 'AHF' 'BugNum' 'CAMC-cvc' 'NOSMPMx'], 0.0"]

In [None]:
# [(array(['ACoCo', 'AHF', 'CAMC', 'CoCoMn', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'AoCiH', 'BugNum', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'CAMC', 'Final', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'AoCiH', 'CAMC', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'CC', 'CoCoMx', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'CAMC', 'Final', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'AoCiH', 'BugNum', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'AoCiH', 'CoCo', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'CoCoMn', 'FOut', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'BugNum', 'CAMC-cvc', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'CAMC', 'CoCoMx', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'BugNum', 'CAMC-cvc', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'AoCiH', 'CAMC', 'NOSMPMx'], dtype='<U9'),
#   array([0.])),
#  (array(['ACoCo', 'AHF', 'BugNum', 'CAMC-cvc', 'NOSMPMx'], dtype='<U9'),
#   array([0.]))]