<a href="https://colab.research.google.com/github/atharva-mohite/CS_403_Machine_Learning_project/blob/main/CS_403_generate_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**CS 403 project: Density-weighted support vector machines for binary class imbalance learning**

**Result Generation Code**

---



**Instructions to run the notebook:**

Follow these steps:

1. Add a shortcut of the 'CS_403_project_files' folder ([link](https://drive.google.com/drive/folders/1dQugKuL-zx1mXT4NcWCNCz1W2tdqMYRR?usp=sharing)) and mount the drive.

2. Paste the path to the shortcut of the 'CS_403_project_files' folder into the `folder_path` variable in the configuration cell. A blank CSV file 'project_results.csv' has been added to this folder as a result sheet template.

3. Run all the cells in the notebook. Ignore the warnings (if any).

4. The results will be saved as 'project_results_complete.xlsx' in the 'CS_403_project_files' folder

In [1]:
# Mount Google Drive at /content/drive; the project folder path used later in
# this notebook (`folder_path`) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, KMeansSMOTE, ADASYN
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors
import time
from sklearn import svm
from sklearn.metrics import confusion_matrix

In [4]:
# KEEL data files, read from "<folder_path>/keel_datasets/" by the experiment loop.
datasets = [
    '04clover5z-600-5-50-BI.dat',
    '04clover5z-600-5-70-BI.dat',
    'paw02a-600-5-0-BI.dat',
    'paw02a-600-5-70-BI.dat',
    'paw02a-800-7-30-BI.dat',
]

# Resampling strategies; the experiment loop selects them by position in this list.
over_or_under = [
    'None',
    'SMOTE',
    'Borderline_SMOTE',
    'KMeans_SMOTE',
    'ADASYN',
    'Near-Miss',
    'Condensed_NN',
    'TomekLinks',
    'SMOTEENN',
    'SMOTETomek',
]

# Classifiers; the experiment loop selects them by position in this list.
models = ['SVM', 'LSSVM', 'IDLSSVM-CIL']

# Project folder on the mounted drive and the result-sheet template stored in it.
folder_path = "/content/drive/MyDrive/CS_403_project_files"
results = pd.read_csv(f"{folder_path}/project_results.csv", header=None)

# Fraction of each dataset held out for testing.
split_ratio = 0.3

# optimized_params[dataset_no][model_no] -> (train_IR, C, mu).
# Each dataset row holds four tuples; the experiment loop only reads the
# first three (one per entry of `models`).
optimized_params = [
    [(0.2034, 10**1, 2**-2), (0.2034, 10**2, 2**-4), (0.2034, 10**1, 2**-4), (0.2034, 10**4, 2**-4)],
    [(0.1932, 10**2, 2**-2), (0.1932, 10**0, 2**-2), (0.1932, 10**0, 2**-3), (0.1932, 10**2, 2**-2)],
    [(0.2068, 10**3, 2**-2), (0.2068, 10**1, 2**-3), (0.2068, 10**2, 2**-3), (0.2068, 10**1, 2**-3)],
    [(0.2, 10**2, 2**-1), (0.2, 10**1, 2**-2), (0.2, 10**1, 2**-2), (0.2, 10**1, 2**-2)],
    [(0.1429, 10**0, 2**-3), (0.1429, 10**3, 2**-3), (0.1429, 10**1, 2**-4), (0.1429, 10**2, 2**-3)],
]

# Constant added to the diagonal of the IDLSSVM-CIL system (passed as E_ to fit_idlssvmcil).
E = 0.1

In [6]:
def rbf(x_i, x_j, sigma):
  """Gaussian (RBF) kernel matrix between two sets of row vectors.

  Entry (p, q) of the result is exp(-||x_i[p] - x_j[q]||^2 / sigma^2).
  Note that the denominator is sigma^2, with no factor of 2.

  Args:
    x_i: 2-D array with one sample per row.
    x_j: 2-D array with one sample per row and the same number of columns.
    sigma: kernel width.

  Returns:
    Array of shape (len(x_i), len(x_j)) with the kernel values.
  """
  pairwise_dist = cdist(x_i, x_j)
  scaled_sq_dist = np.square(pairwise_dist) / sigma**2
  return np.exp(-scaled_sq_dist)

In [7]:
def compute_density_weight(X, n_neighbors=5):
  """Build the diagonal density-weight matrix for the samples in X.

  Each weight is 1 - (distance to the sample's farthest returned neighbour) /
  (largest such distance over all samples), so samples in dense regions get
  weights close to 1 and the most isolated sample gets weight 0.

  NOTE(review): X is passed explicitly to kneighbors(), so per scikit-learn's
  documented behaviour each sample is returned as its own nearest neighbour at
  distance 0; the last column is therefore the distance to the
  (n_neighbors - 1)-th *other* sample. Confirm this is the intended k.

  Args:
    X: 2-D array of training samples, one per row.
    n_neighbors: number of neighbours requested from NearestNeighbors
      (default 5, the value that was previously hard-coded).

  Returns:
    Square diagonal array with one density weight per sample.
  """
  nn_model = NearestNeighbors(n_neighbors=n_neighbors)
  nn_model.fit(X)
  distances, _ = nn_model.kneighbors(X)
  kth_neighbor_distances = distances[:, -1]

  max_distance = np.max(kth_neighbor_distances)
  if max_distance == 0:
    # Every sample coincides with its neighbours, so the ratio would be 0/0
    # (NaN). Treat all samples as equally dense instead.
    d = np.ones_like(kth_neighbor_distances, dtype=float)
  else:
    d = 1 - (kth_neighbor_distances / max_distance)

  return np.diag(d, k=0)

In [8]:
def fit_lssvm(X, y, C_, mu_):
  """Train a least-squares SVM with an RBF kernel.

  Labels are mapped to -1 for the first value returned by np.unique and to +1
  for every other value. The dual variables and bias are obtained from the
  bordered linear system

      [[0, y^T], [y, (y y^T) * K + I / C_]] [b; alpha] = [0; 1]

  using the pseudo-inverse of the system matrix.

  Args:
    X: 2-D array of training samples, one per row.
    y: training labels, 1-D or 2-D with one row per sample.
    C_: regularisation constant; its reciprocal is added to the diagonal.
    mu_: kernel width passed to rbf().

  Returns:
    Tuple (alpha_, b_): one dual coefficient per training sample, and the bias.
  """
  labels_2d = y if y.ndim != 1 else y.reshape(-1, 1)
  first_label = np.unique(labels_2d, axis=0)[0]
  is_first_label = (labels_2d == first_label).all(axis=1)
  signs = np.where(is_first_label, -1, +1)[:, np.newaxis]
  n_samples = len(signs)

  # Label-signed kernel matrix plus the ridge term on the diagonal.
  omega = np.multiply(signs * signs.T, rbf(X, X, mu_))
  ridge = C_**-1 * np.eye(n_samples)

  system = np.block([[0, signs.T], [signs, omega + ridge]])
  rhs = np.array([0] + [1] * n_samples)

  solution = np.dot(np.linalg.pinv(system), rhs)
  bias, dual_coefs = solution[0], solution[1:]

  return (dual_coefs, bias)

In [9]:
def predict_lssvm(sv_X, X, y, alpha_, b_, mu_):
  """Predict class labels with a trained least-squares SVM.

  The training labels are re-encoded exactly as during fitting (-1 for the
  first value returned by np.unique, +1 otherwise). The sign of
  sum_i alpha_i * y_i * K(sv_X[i], x) + b_ is then mapped back to the original
  label values; only a sign of exactly -1 yields the first label.

  Args:
    sv_X: training samples used to fit the model, one per row.
    X: samples to classify; a 1-D array is treated as a single sample.
    y: training labels, 1-D or 2-D with one row per sample.
    alpha_: dual coefficients returned by the fit function.
    b_: bias returned by the fit function.
    mu_: kernel width passed to rbf().

  Returns:
    Array with one predicted label per row of X.
  """
  queries = X if X.ndim != 1 else X.reshape(1, -1)
  train_labels = y if y.ndim != 1 else y.reshape(-1, 1)
  y_labels = np.unique(train_labels, axis=0)
  signs = np.where((train_labels == y_labels[0]).all(axis=1), -1, +1)

  signed_coefs = np.multiply(alpha_, signs)
  decision = np.dot(signed_coefs, rbf(sv_X, queries, mu_)) + b_

  return np.where(np.sign(decision) == -1, y_labels[0], y_labels[1])

In [10]:
def fit_idlssvmcil(X, y, D, C_, mu_, E_):
  """Train the density-weighted least-squares SVM (IDLSSVM-CIL).

  Same bordered linear system as the plain LS-SVM, except that the diagonal
  ridge term I / C_ is replaced by pinv(D^T D) / C_ + E_ * I, where D is the
  density-weight matrix.

  Labels are mapped to -1 for the first value returned by np.unique and to +1
  for every other value.

  NOTE(review): np.linalg.pinv maps a zero density weight to 0 rather than to
  a very large value, so such a sample ends up with only E_ on its diagonal
  entry. Confirm this is the intended treatment of zero-weight samples.

  Args:
    X: 2-D array of training samples, one per row.
    y: training labels, 1-D or 2-D with one row per sample.
    D: square density-weight matrix with one row/column per training sample
      (compute_density_weight returns a diagonal one).
    C_: regularisation constant.
    mu_: kernel width passed to rbf().
    E_: constant added to every diagonal entry of the regularisation term.

  Returns:
    Tuple (alpha_, b_): one dual coefficient per training sample, and the bias.
  """
  y = y.reshape(-1,1) if y.ndim==1 else y
  y_labels = np.unique(y, axis=0)
  y = np.where((y == y_labels[0]).all(axis=1),-1,+1)[:,np.newaxis]

  K = rbf(X,X,mu_)
  # Matrix product D^T D. The previous elementwise `D.T*D` only equals this
  # when D is diagonal; `@` gives the same result in that case and stays
  # correct for a general weight matrix.
  DTD = D.T @ D
  regulariser = (C_**-1)*np.linalg.pinv(DTD) + (E_)*np.eye(len(y))
  sigma = np.multiply(y*y.T, K)
  A = np.block([[0, y.T], [y, sigma + regulariser]])
  B = np.array([0]+[1]*len(y))

  A_inv = np.linalg.pinv(A)
  solution = np.dot(A_inv, B)
  b_ = solution[0]
  alpha_ = solution[1:]

  return (alpha_, b_)

In [11]:
def predict_idlssvmcil(sv_X, X, y, alpha_, b_, mu_):
  """Predict class labels with a trained IDLSSVM-CIL model.

  The decision rule is the same computation as predict_lssvm (the density
  weights only influence fitting), so this forwards to it instead of keeping
  a second copy of the code.

  Args:
    sv_X: training samples used to fit the model, one per row.
    X: samples to classify; a 1-D array is treated as a single sample.
    y: training labels, 1-D or 2-D with one row per sample.
    alpha_: dual coefficients returned by fit_idlssvmcil.
    b_: bias returned by fit_idlssvmcil.
    mu_: kernel width passed to rbf().

  Returns:
    Array with one predicted label per row of X.
  """
  return predict_lssvm(sv_X, X, y, alpha_, b_, mu_)

In [12]:
# Experiment sweep: every model x dataset x resampling strategy (x target ratio).
# Each run writes one row segment of the `results` sheet.

RANDOM_STATE = 42                   # same seed as the train/test split; makes the stochastic samplers repeatable
SAMPLING_RATIOS = [0.4, 0.6, 1.0]   # values passed as `sampling_strategy` to the ratio-based samplers
SINGLE_RUN_METHODS = (0, 6, 7)      # 'None', 'Condensed_NN', 'TomekLinks': run once, no target ratio


def _make_sampler(over_or_under_no, desired_ratio=None):
  """Return the imbalanced-learn sampler for a position in `over_or_under`."""
  if over_or_under_no == 1:
    return SMOTE(sampling_strategy=desired_ratio, k_neighbors=5, random_state=RANDOM_STATE)
  if over_or_under_no == 2:
    return BorderlineSMOTE(sampling_strategy=desired_ratio, k_neighbors=5, random_state=RANDOM_STATE)
  if over_or_under_no == 3:
    return KMeansSMOTE(cluster_balance_threshold=0.2, sampling_strategy=desired_ratio, k_neighbors=2,
                       kmeans_estimator=MiniBatchKMeans(random_state=RANDOM_STATE),
                       random_state=RANDOM_STATE)
  if over_or_under_no == 4:
    return ADASYN(sampling_strategy=desired_ratio, random_state=RANDOM_STATE)
  if over_or_under_no == 5:
    return NearMiss(sampling_strategy=desired_ratio)
  if over_or_under_no == 6:
    return CondensedNearestNeighbour(random_state=RANDOM_STATE)
  if over_or_under_no == 7:
    return TomekLinks()
  if over_or_under_no == 8:
    return SMOTEENN(sampling_strategy=desired_ratio, random_state=RANDOM_STATE)
  if over_or_under_no == 9:
    return SMOTETomek(sampling_strategy=desired_ratio, random_state=RANDOM_STATE)
  raise ValueError("No sampler defined for over_or_under_no={}".format(over_or_under_no))


def _fit_and_predict(model_no, X_fit, y_fit, X_eval, C, mu):
  """Train the model at position `model_no` of `models` and predict X_eval.

  Returns (y_pred, seconds), where seconds covers both fitting and prediction.
  """
  start_time = time.perf_counter()

  if model_no == 0:
    # NOTE(review): `mu` is used here as scikit-learn's `gamma`, whereas rbf()
    # divides by mu**2. The two parametrisations differ; confirm the tuned
    # values in `optimized_params` were chosen with that in mind.
    SVM_model = svm.SVC(C=C, kernel='rbf', gamma=mu, class_weight='balanced')
    SVM_model.fit(X_fit, y_fit)
    y_pred = SVM_model.predict(X_eval)
  elif model_no == 1:
    alpha, b = fit_lssvm(X_fit, y_fit, C, mu)
    y_pred = predict_lssvm(X_fit, X_eval, y_fit, alpha, b, mu)
  elif model_no == 2:
    D = compute_density_weight(X_fit)
    alpha, b = fit_idlssvmcil(X_fit, y_fit, D, C, mu, E)
    y_pred = predict_idlssvmcil(X_fit, X_eval, y_fit, alpha, b, mu)
  else:
    raise ValueError("No model defined for model_no={}".format(model_no))

  return y_pred, time.perf_counter() - start_time


def _evaluate(y_true, y_pred):
  """Return (auc, g_mean, test_acc) computed from the 2x2 confusion matrix.

  Class 1 is treated as the positive class. `labels=[0, 1]` keeps the matrix
  2x2 even if one class is absent from both y_true and y_pred.
  """
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
  auc = ((1 + (tp/(tp + fn)) - (fp/(tn + fp)))/2)*100
  # NOTE(review): this is sqrt(precision * recall); G-mean is often defined as
  # sqrt(sensitivity * specificity) instead. Formula kept as it was - confirm.
  g_mean = ((tp/(tp + fp)) * (tp/(tp + fn)))**0.5
  test_acc = ((tp + tn)/(tp + tn + fp + fn))*100
  return auc, g_mean, test_acc


def _result_row(model_no, over_or_under_no, num=None):
  """Row of the `results` sheet for one run (24 rows are used per model)."""
  base = 24*model_no
  if over_or_under_no == 0:
    return 2 + base
  if over_or_under_no in (6, 7):
    return 12 + base + over_or_under_no
  if over_or_under_no in (8, 9):
    return (20 + num + (over_or_under_no - 8)*3) + base
  return (3 + num + (over_or_under_no - 1)*3) + base


def _record_result(row, dataset_no, train_IR, C, mu, computation_time, auc, g_mean, test_acc):
  """Write one run into the seven consecutive columns reserved for its dataset."""
  first_col = (7*dataset_no) + 3
  values = [
      '{:.2f}'.format(train_IR),
      C,
      mu,
      '{:.4f}'.format(computation_time),
      '{:.2f}'.format(auc),
      '{:.5f}'.format(g_mean),
      '{:.2f}'.format(test_acc),
  ]
  for offset, value in enumerate(values):
    results.loc[row, first_col + offset] = value


for model_no, model_name in enumerate(models):
  print("Model selected:", model_name)

  for dataset_no, dataset_name in enumerate(datasets):
    train_IR, C, mu = optimized_params[dataset_no][model_no]

    file_path = folder_path + "/keel_datasets/" + dataset_name
    df = pd.read_csv(file_path, header=None, names=['feature1', 'feature2', 'class'])
    # The first 7 rows are dropped before converting the remaining values to int.
    df = df.iloc[7:].reset_index(drop=True).astype(int)

    # Min-max scale both features to [0, 1]. Columns are addressed by name:
    # positional `series[0]` on a label-indexed Series is deprecated in pandas.
    for feature in ['feature1', 'feature2']:
      feature_min = int(df[feature].min())
      feature_max = int(df[feature].max())
      df[feature] = (df[feature] - feature_min) / (feature_max - feature_min)

    # Draw a training set of the requested size whose class-0 : class-1 ratio
    # is train_IR; everything not drawn becomes the test set.
    df_1 = df[df['class'] == 1]
    df_0 = df[df['class'] == 0]
    train_size = int(df.shape[0]*(1-split_ratio))
    num_train_0 = int((train_size*train_IR)/(1+train_IR))
    df_train_0 = df_0.sample(n=num_train_0, random_state=42)
    df_train_1 = df_1.sample(n=(train_size-num_train_0), random_state=42)
    df_test_0 = df_0.drop(df_train_0.index)
    df_test_1 = df_1.drop(df_train_1.index)
    df_train = pd.concat([df_train_0, df_train_1], ignore_index=True)
    df_test = pd.concat([df_test_0, df_test_1], ignore_index=True)

    X_train = df_train.iloc[:, :2].values
    y_train = df_train.iloc[:,2].values
    X_test = df_test.iloc[:, :2].values
    y_test = df_test.iloc[:,2].values

    for over_or_under_no in range(len(over_or_under)):
      if over_or_under_no in SINGLE_RUN_METHODS:
        runs = [(None, None)]
      else:
        runs = list(enumerate(SAMPLING_RATIOS))

      for num, desired_ratio in runs:
        if over_or_under_no == 0:
          # Baseline: train on the untouched split and report the configured ratio.
          X_fit, y_fit, fit_IR = X_train, y_train, train_IR
        else:
          # Always resample from the ORIGINAL training split. Previously the
          # resampled arrays overwrote X_train/y_train inside the ratio loop,
          # so the 0.6 and 1.0 runs started from already-resampled data.
          sampling = _make_sampler(over_or_under_no, desired_ratio)
          X_fit, y_fit = sampling.fit_resample(X_train, y_train)
          fit_IR = np.sum(y_fit == 0)/np.sum(y_fit == 1)

        y_pred, computation_time = _fit_and_predict(model_no, X_fit, y_fit, X_test, C, mu)
        auc, g_mean, test_acc = _evaluate(y_test, y_pred)
        _record_result(_result_row(model_no, over_or_under_no, num), dataset_no,
                       fit_IR, C, mu, computation_time, auc, g_mean, test_acc)

Model selected: SVM




Model selected: LSSVM




Model selected: IDLSSVM-CIL




In [13]:
# Write the completed sheet into the project folder. The path is derived from
# `folder_path` so that changing the folder in the configuration cell also
# moves the output (it used to be hard-coded separately).
# NOTE(review): `results` was read with header=None, so to_excel also writes
# the integer column labels as a header row - confirm that is wanted.
output_path = folder_path + '/project_results_complete.xlsx'
results.to_excel(output_path, index=False)