In [1]:
import pandas as pd
from sklearn.decomposition import NMF
from utils.process import prepare_df, data_arrays, data_tensors, LABELS
from utils.logging import log
from sklearn import metrics
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from ssnmf.ssnmf import SSNMF_T
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.cluster import KMeans
from torch.utils.data import random_split

In [2]:
df = prepare_df()
df = df[df['Label'] != 'BENIGN']
df = df.sample(1000)
df.groupby('Label')['Label'].count()

Label
Bot                           3
DDoS                        240
DoS GoldenEye                17
DoS Hulk                    396
DoS Slowhttptest              9
DoS slowloris                15
FTP-Patator                  16
PortScan                    293
SSH-Patator                   9
Web Attack   Brute Force      1
Web Attack   XSS              1
Name: Label, dtype: int64

In [3]:
from ssnmf import SSNMF
import random

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def get_Y_torch(X, y):
    y_unique = torch.unique(y)
    sample_size = X.shape[1]
    Y = torch.zeros(y_unique.shape[0], sample_size, device=device)
    for i in range(sample_size):
        j = ((y_unique == y[i]).nonzero(as_tuple=True)[0])
        Y[j, i] = 1
    return Y


def get_L_torch(Y, fraction_known=0.5):
    L = torch.zeros(Y.shape, device=device)
    m, n = L.shape
    num_samples = int(fraction_known * n)
    labeled_data = torch.randperm(n, dtype=torch.int32, device=device)[:num_samples]
    for i in labeled_data:
        L[:, i] = torch.tensor(1)
    return L


def split_L_idx(L, test_size=0.25):
    sample_size = L.shape[1]
    test_len = int(test_size * sample_size)
    train_idx, test_idx = torch.utils.data.random_split(range(sample_size), [sample_size - test_len, test_len])
    return train_idx, test_idx


def get_L_train(L, test_idx):
    L_train = torch.clone(L)
    L_train[:, test_idx] = torch.tensor(0, dtype=torch.float32, device=device)
    return L_train

#model = SSNMF(M_s,10, modelNum=1)

In [None]:
X, y = data_tensors(df)


Y = get_Y_torch(X.T, y)
for fraction_known in [0.1, 0.3, 0.5, 0.6, 0.8]:
    L = get_L_torch(Y, fraction_known=fraction_known)
    for k in [5, 10, 20, 30]:
        for lam in (0, 0.001, 0.5, 0.1, 0.5, 1, 10, 100, 500, 1000, 5000):
            print(f"fraction known {fraction_known}, k {k}, lam {lam}")
            for ifold in range(5):
                train_idx, test_idx = split_L_idx(L, test_size=0.25)
                L_train = get_L_train(L, test_idx)
                snmf = SSNMF(X.T, k, L=L_train, Y=Y, lam=lam * torch.linalg.norm(X), modelNum=5)
                snmf.mult(numiters=1000)
                y_pred = np.argmax(snmf.S, axis=0)
                kmeans = KMeans(n_clusters=k, random_state=0).fit(snmf.S.T)
                #rand_score = metrics.rand_score(y_train, kmeans.labels_)
                adujsted_rand_score = metrics.adjusted_rand_score(y[test_idx], kmeans.labels_[test_idx])
                print(adujsted_rand_score)
                #a_mutual_score = metrics.adjusted_mutual_info_score(y_train, kmeans.labels_)
                #print(f"Rand score {rand_score}")
                #print(f"Adjusted rand score {adujsted_rand_score}")
                #print(f"Adjusted mutual score {a_mutual_score}")
                results = [fraction_known, k, lam, adujsted_rand_score]
                log(results)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)


fraction known 0.1, k 5, lam 0
0.37504765024175124
0.32930948099976237
0.4585955841746441
0.5027264241376738
0.5049724592552858
fraction known 0.1, k 5, lam 0.001
0.5470166517621262
0.43076339298915545
0.5427788666213096
0.44537522694878096
0.438259151945882
fraction known 0.1, k 5, lam 0.5
0.3224050530504364
0.2611707695483946
0.5742802563241816
0.37514588107709407
0.18526791514099028
fraction known 0.1, k 5, lam 0.1
0.44502735711714997
0.4394476097123837
0.504654040437347
0.5140814326422978
0.5471384721204392
fraction known 0.1, k 5, lam 0.5
0.4760545249107202
0.35857632309809107
0.5720016027767437
0.49206893635283666
0.4910254528985639
fraction known 0.1, k 5, lam 1
0.35350730375643796
0.46709430989711775
0.15545733833886793
0.4700788147024995
0.4177142268683114
fraction known 0.1, k 5, lam 10
0.3748818076695888
0.4842773244192078
0.33769876123337184
0.47399271769956214
0.35832157084457084
fraction known 0.1, k 5, lam 100
0.4554511786791363
0.2926979373895293
0.44450843763207776
0.4