In [1]:
import pandas as pd
from sklearn.decomposition import NMF
from utils.process import prepare_df, data_arrays
from sklearn import metrics
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch

In [2]:
# Load the prepared frame and keep only attack traffic: rows labelled
# 'BENIGN' are dropped, then a fixed-size random sub-sample is drawn.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42  # fixed seed so the sub-sample (and all downstream scores) is reproducible

df = prepare_df()
df = df[df['Label'] != 'BENIGN']
# .copy() detaches the sample from the filtered frame it was derived from.
# NOTE(review): the SettingWithCopyWarning captured further down appears to come
# from an in-place `replace` applied to this frame downstream — confirm.
df = df.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED).copy()
# Per-class row counts of the sample (displayed as the cell output).
df.groupby('Label')['Label'].count()

Label
Bot                           33
DDoS                        2296
DoS GoldenEye                185
DoS Hulk                    4128
DoS Slowhttptest             100
DoS slowloris                103
FTP-Patator                  130
Heartbleed                     1
Infiltration                   3
PortScan                    2857
SSH-Patator                  116
Web Attack   Brute Force      28
Web Attack   XSS              20
Name: Label, dtype: int64

In [3]:
class SSNM_wrapper:
    """Small estimator-style adapter around ``SSNMF``.

    The fitted ``SSNMF`` object is stored on ``self.model``; predictions are
    read from its ``S`` attribute.
    """

    def fit_transform(self, X, **kwargs):
        """Create an ``SSNMF`` model for ``X`` and call ``mult(1000)`` on it.

        Any extra keyword arguments are forwarded unchanged to the ``SSNMF``
        constructor. Returns the wrapper itself, not a transformed array.
        """
        factorisation = SSNMF(X, **kwargs)
        self.model = factorisation
        factorisation.mult(1000)
        return self

    def fit(self, X, **kwargs):
        """Delegate to :meth:`fit_transform` and return its result."""
        fitted = self.fit_transform(X, **kwargs)
        return fitted

    def score(self, X, y):
        """Return ``metrics.rand_score`` between ``y`` and the fitted assignments.

        ``X`` is accepted but not read: the assignments always come from
        :meth:`predict`, i.e. from the data the model was fitted on.
        """
        assignments = self.predict()
        return metrics.rand_score(y, assignments)

    def predict(self):
        """Return, for each column of ``model.S``, the row index of its largest entry."""
        coefficients = self.model.S
        return np.argmax(coefficients, axis=0)



In [4]:
from ssnmf import SSNMF
import random

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_Y(X, y):
    """Build the one-hot label matrix for the samples stored as columns of ``X``.

    Args:
        X: 2-D array; only ``X.shape[1]`` (the number of samples) is read.
        y: 1-D sequence of labels, one per column of ``X``. Labels are taken
            by position, so a pandas Series with any index works.

    Returns:
        A float NumPy array of shape ``(n_classes, n_samples)``. Row ``j``
        corresponds to the ``j``-th value of ``np.unique(y)``; entry
        ``[j, i]`` is 1 when sample ``i`` carries that label and 0 otherwise.
    """
    sample_size = X.shape[1]
    # return_inverse yields, for every label, its row in the sorted unique
    # values, which replaces a Python-level lookup per sample.
    y_unique, class_idx = np.unique(y, return_inverse=True)
    Y = np.zeros((len(y_unique), sample_size))
    # Only the first `sample_size` labels are placed, one per column.
    Y[class_idx[:sample_size], np.arange(sample_size)] = 1
    return Y


def get_Y_torch(X, y):
    """Torch counterpart of ``get_Y``: one-hot label matrix on ``device``.

    Args:
        X: 2-D tensor; only ``X.shape[1]`` (the number of samples) is read.
        y: 1-D tensor of labels, one per column of ``X``.

    Returns:
        A tensor of shape ``(n_classes, n_samples)`` allocated on the
        module-level ``device``. Row ``j`` corresponds to the ``j``-th value
        returned by ``torch.unique(y)``; entry ``[j, i]`` is 1 when sample
        ``i`` carries that label and 0 otherwise.
    """
    # The previous version iterated over the integer `sample_size` itself,
    # which raises TypeError; the assignment is now done in one indexed write.
    y_unique, class_idx = torch.unique(y, return_inverse=True)
    sample_size = X.shape[1]
    Y = torch.zeros(y_unique.shape[0], sample_size, device=device)
    columns = torch.arange(sample_size, device=device)
    # Move the row indices next to Y so the write works when y lives elsewhere.
    Y[class_idx[:sample_size].to(device), columns] = 1
    return Y


def get_L(Y, fraction_known=0.5):
    """Build a 0/1 mask with the shape of ``Y`` marking the labelled samples.

    A random subset of ``int(fraction_known * n)`` columns (samples) is drawn
    with ``random.sample``; those columns are set to 1 in every row and all
    other entries stay 0.

    Args:
        Y: 2-D array of shape ``(m, n)``; only its shape is read.
        fraction_known: Share of the ``n`` columns to mark as labelled.

    Returns:
        A float NumPy array of shape ``(m, n)``.

    Raises:
        ValueError: from ``random.sample`` when the requested number of
            columns is negative or larger than ``n``.
    """
    L = np.zeros(Y.shape)
    _, n = L.shape
    labeled_data = random.sample(range(n), int(fraction_known * n))
    # Mark whole columns. The previous `L[:i] = 1` sliced rows 0..i-1 instead,
    # which filled (almost) the entire mask regardless of `fraction_known`.
    L[:, np.asarray(labeled_data, dtype=int)] = 1
    return L


def get_L_torch(Y, fraction_known=0.5):
    """Torch counterpart of ``get_L``: 0/1 mask of labelled samples on ``device``.

    The first ``int(fraction_known * n)`` entries of a random permutation of
    the column indices are chosen; those columns are set to 1 in every row
    and all other entries stay 0.

    Args:
        Y: 2-D tensor of shape ``(m, n)``; only its shape is read.
        fraction_known: Share of the ``n`` columns to mark as labelled.

    Returns:
        A tensor of shape ``(m, n)`` allocated on the module-level ``device``.
    """
    L = torch.zeros(Y.shape, device=device)
    _, n = L.shape
    num_samples = int(fraction_known * n)
    labeled_data = torch.randperm(n, dtype=torch.int32, device=device)[:num_samples]
    # Mark whole columns. The previous `L[:i] = 1` sliced rows 0..i-1 instead,
    # which filled (almost) the entire mask regardless of `fraction_known`.
    # Indices are cast to long, the integer type accepted for tensor indexing.
    L[:, labeled_data.long()] = 1
    return L

#model = SSNMF(M_s,10, modelNum=1)

In [6]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from ssnmf import SSNMF

# Scan the SSNMF weight `lam` over a grid, refitting on each stratified fold.
skf = StratifiedKFold(n_splits=5)

X, y = data_arrays(df)

k = 12                # second positional argument to SSNMF (presumably the factorisation rank — confirm)
fraction_known = 0.1  # share of training samples whose labels are revealed through get_L
NUM_ITERS = 1000      # iterations passed to SSNMF.mult for every fit

# Weights to scan. The earlier grid listed 0.5 twice, which only repeated an
# identical (and slow) configuration; the duplicate is dropped.
# NOTE(review): the duplicate may have been a typo for 0.01 — confirm.
LAMBDAS = (0, 0.001, 0.1, 0.5, 1, 10, 100, 500, 1000, 5000)

for lam in LAMBDAS:
    for train_index, test_index in skf.split(X, y):
        # data_arrays returns samples as rows; SSNMF is given samples as columns.
        X_train_T, X_test_T = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Y_train = get_Y(X_train_T.T, y_train)
        L_train = get_L(Y_train, fraction_known=fraction_known)

        # The weight is scaled by the norm of the training data.
        x_norm = np.linalg.norm(X_train_T)
        snmf = SSNMF(X_train_T.T, k, L=L_train, Y=Y_train, lam=lam * x_norm, modelNum=3)
        snmf.mult(numiters=NUM_ITERS)

        # NOTE(review): the score is computed on the training fold itself;
        # X_test_T / y_test are never evaluated here.
        y_pred = np.argmax(snmf.S, axis=0)
        score = metrics.rand_score(y_train, y_pred)
        print(f'lambda {lam} score {score}')


# #

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)


lambda 0 score 0.7708653364995371
lambda 0 score 0.773759056857417
lambda 0 score 0.7722138877275334


KeyboardInterrupt: 

In [None]:
# NOTE(review): `Labels` and `model` are not defined in the cells visible here —
# confirm an earlier cell creates them before running this on a fresh kernel.
# Index of the largest entry in each column of model.S, computed once for both metrics.
cluster_assignments = np.argmax(model.S, axis=0)
completeness = metrics.completeness_score(Labels, cluster_assignments)
print(completeness)
homogeneity = metrics.homogeneity_score(Labels, cluster_assignments)
print(homogeneity)

array([[83868909.0, 3675920.0, 986.0, ..., 95502143.0, 166023.0,
        102585716.0],
       [8.0, 4.0, 7.0, ..., 8.0, 4.0, 13.0],
       [7.0, 0.0, 0.0, ..., 4.0, 0.0, 6.0],
       ...,
       [50000000.0, 0.0, 0.0, ..., 59600000.0, 0.0, 0.0],
       [76400000.0, 0.0, 0.0, ..., 89800000.0, 0.0, 99200000.0],
       [5738681.0, 0.0, 0.0, ..., 5557884.0, 0.0, 99200000.0]],
      dtype=object)