In [1]:

import torch
from sklearn import metrics
from sklearn.cluster import KMeans
from ssnmf.ssnmf import SSNMF_T
from torch.utils.data import random_split

from utils.logging import log
from utils.process import prepare_df, data_tensors

In [2]:
df = prepare_df()
df = df[df['Label'] != 'BENIGN']
df = df.sample(1000)
df.groupby('Label')['Label'].count()

Label
Bot                           2
DDoS                        230
DoS GoldenEye                20
DoS Hulk                    409
DoS Slowhttptest             11
DoS slowloris                 5
FTP-Patator                  18
PortScan                    295
SSH-Patator                   4
Web Attack   Brute Force      3
Web Attack   XSS              3
Name: Label, dtype: int64

In [None]:


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def get_Y_torch(X, y):
    y_unique = torch.unique(y)
    sample_size = X.shape[1]
    Y = torch.zeros(y_unique.shape[0], sample_size, device=device)
    for i in range(sample_size):
        j = ((y_unique == y[i]).nonzero(as_tuple=True)[0])
        Y[j, i] = 1
    return Y


def get_L_torch(Y, fraction_known=0.5):
    L = torch.zeros(Y.shape, device=device)
    m, n = L.shape
    num_samples = int(fraction_known * n)
    labeled_data = torch.randperm(n, dtype=torch.int32, device=device)[:num_samples]
    for i in labeled_data:
        L[:, i] = torch.tensor(1)
    return L


def split_L_idx(L, test_size=0.25):
    sample_size = L.shape[1]
    test_len = int(test_size * sample_size)
    train_idx, test_idx = torch.utils.data.random_split(range(sample_size), [sample_size - test_len, test_len])
    return train_idx, test_idx


def get_L_train(L, test_idx):
    L_train = torch.clone(L)
    L_train[:, test_idx] = torch.tensor(0, dtype=torch.float32, device=device)
    return L_train

#model = SSNMF(M_s,10, modelNum=1)

In [24]:
X, y = data_tensors(df)


Y = get_Y_torch(X.T, y)
for fraction_known in [0.1, 0.3, 0.5, 0.6, 0.8]:
    L = get_L_torch(Y, fraction_known=fraction_known)
    for k in [5, 10, 20, 30]:
        for lam in (0, 0.001, 0.5, 0.1, 0.5, 1, 10, 100, 500, 1000, 5000):
            print(f"fraction known {fraction_known}, k {k}, lam {lam}")
            for ifold in range(5):
                train_idx, test_idx = split_L_idx(L, test_size=0.25)
                L_train = get_L_train(L, test_idx)
                snmf = SSNMF_T(X.T, k, L=L_train, Y=Y, lam=lam * torch.linalg.norm(X), modelNum=5)
                snmf.mult(numiters=1000)
                y_pred = torch.argmax(snmf.S, axis=0)
                kmeans = KMeans(n_clusters=k, random_state=0).fit(snmf.S.T)
                rand_score = metrics.rand_score(y, kmeans.labels_)
                adujsted_rand_score = metrics.adjusted_rand_score(y[test_idx], kmeans.labels_[test_idx])
                print(rand_score)
                #a_mutual_score = metrics.adjusted_mutual_info_score(y_train, kmeans.labels_)
                #print(f"Rand score {rand_score}")
                #print(f"Adjusted rand score {adujsted_rand_score}")
                #print(f"Adjusted mutual score {a_mutual_score}")
                results = [fraction_known, k, lam, adujsted_rand_score]
                log(results)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)


fraction known 0.1, k 5, lam 0
0.7461138435386481
0.7878877596645357
0.7874345929354328
0.7482185512360374
0.8020124630920875
fraction known 0.1, k 5, lam 0.001
0.799506954598736
0.7551006432953479
0.7878877596645357
0.7874627899763548
0.7427060297357937
fraction known 0.1, k 5, lam 0.5
0.8147494692914083
0.7940367286598752
0.7839824694968439
0.7547320676890108
0.7351834620326844
fraction known 0.1, k 5, lam 0.1
0.7538821283126488
0.7453968330694896
0.7949350058206749
0.7682807458520139
0.748907364664274
fraction known 0.1, k 5, lam 0.5
0.7728829057453485
0.8097142834124864
0.7786935102496244
0.7527401481553093
0.7766975625672198
fraction known 0.1, k 5, lam 1
0.7829190382392156
0.7545125335846898
0.762441944306816
0.7416043310654856
0.7485951831397808
fraction known 0.1, k 5, lam 10
0.7556021478088885
0.7641035556468603
0.7417553866418533
0.7252399769589895
0.7779241338473251
fraction known 0.1, k 5, lam 100
0.7920448091261737
0.7897286236218696
0.7862583735141166
0.7198180888045663
0

In [3]:
import pandas as pd
results_df = pd.read_csv('OUTPUT/output_cv', names=['fraction_known', 'k', 'lam', 'adjusted rand score'])

for fraction_known in [0.1, 0.3, 0.5, 0.6, 0.8]:
    results_df_f = results_df[(results_df['fraction_known'] == fraction_known)]
    df_grouped = results_df_f.groupby(['k', 'lam'])['adjusted rand score'].mean().reset_index()
    idx = df_grouped['adjusted rand score'] == df_grouped['adjusted rand score'].max()
    print(f'results for fraction {fraction_known}')
    print(df_grouped[idx])


results for fraction 0.1
     k    lam  adjusted rand score
10  10  0.001             0.452656
results for fraction 0.3
   k    lam  adjusted rand score
1  5  0.001             0.465278
results for fraction 0.5
   k  lam  adjusted rand score
4  5  1.0             0.497147
results for fraction 0.6
     k  lam  adjusted rand score
31  30  1.0             0.518191
results for fraction 0.8
   k  lam  adjusted rand score
4  5  1.0             0.630768
