In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
data_original = pd.read_csv('kddcup.data')
#data_original

In [3]:
def statistics(data, centroids):
    
    entropy = 0
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    
    distances = np.linalg.norm(data.values[:,:,None]-centroids[None,:,:].transpose(0,2,1), axis=1)
    
    for i in range(centroids.shape[0]):
        
        cluster = np.argmin(distances, axis=1)==i
        
        normal = data_original['normal.'].values[cluster] == 'normal.'
        
        normal_sum = normal.sum()
        
        p_normal = normal_sum/normal.shape[0]
        p_unnormal = (normal.shape[0] - normal_sum)/normal.shape[0]
        
        # Calculate the entropy
        if p_normal == 0 or p_unnormal == 0:
            pass
        else:
            entropy += -(normal.shape[0]/data.shape[0])*(p_normal * np.log(p_normal) + p_unnormal * np.log(p_unnormal))
        
        # Calculate TP, FP, TN, FN
        if p_normal >= p_unnormal:
            TN += normal_sum
            FN += normal.shape[0]-normal_sum
        else:
            TP += normal.shape[0]-normal_sum
            FP += normal_sum
            
        
    return [entropy, TP, FP, TN, FN]

In [5]:
def normalize(data):
    data = data_original.drop('normal.', axis=1)
    def categorical_as_multicolumns(pd_data):
        for column_name in pd_data.columns:
            if pd_data[column_name].dtype.name == 'object':
                for category in np.unique(pd_data[column_name]):
                    pd_data[column_name + '-' + category] = pd_data[column_name] == category
                pd_data.drop(column_name, axis=1, inplace=True)
        return pd_data
    
    def normalize(pd_data):
        for column in pd_data.columns:
            std = pd_data[column].std()
            if np.allclose(std, 0):
                print(f"For column {column} too low std detected: {std}\nDropping column")
                pd_data.drop(column, axis=1, inplace=True)
            else:
                pd_data[column] = (pd_data[column] - pd_data[column].mean())/pd_data[column].std()
        return pd_data

    data = categorical_as_multicolumns(data)
    data = normalize(data)
    return data


stats = pd.DataFrame(index=np.arange(2), columns=['K', 'weighted-entropy', 'TP', 'FP', 'TN', 'FN'])

def find(data, K=[11, 116]):
    #np.random.seed(0)
    global U
    global S
    global V
    U, S, V = np.linalg.svd(data.values.T, full_matrices = False)
    print(f'Calculated SVD. Mean of SVs: {S.mean()}')
    
    DIM_SUBSPACE = 20
    pcs = U[:,:DIM_SUBSPACE] @ np.diag(S[:DIM_SUBSPACE])
    pdata = data @ pcs
    return pdata

    for i, k in enumerate(K):
        centroids = KMeans(k, random_state=0).fit(pdata).cluster_centers_
        #centroids = np.loadtxt(f"./centroids/{k}")
        distances = np.linalg.norm(pdata.values[:,:,None]-centroids[None,:,:].transpose(0,2,1), axis=1)
        closest = np.argmin(distances, axis=1)
        distance = np.min(distances, axis=1)
        print(np.unique(closest))
        data_original[f'K{k}-closest'] = closest
        data_original[f'K{k}-distance'] = distance
        
        
        #centroids = np.loadtxt(f"./centroids/{k}")
        stats.iloc[i] = [k] + statistics(pdata, centroids)
        

In [6]:
d = normalize(data_original.copy())
d

For column 0.13 too low std detected: 0.0
Dropping column


Unnamed: 0,0,215,45076,0.1,0.2,0.3,0.4,0.5,1,0.6,...,SF-REJ,SF-RSTO,SF-RSTOS0,SF-RSTR,SF-S0,SF-S1,SF-S2,SF-S3,SF-SF,SF-SH
0,-0.066833,-0.001777,0.005325,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
1,-0.066833,-0.001698,0.000208,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
2,-0.066833,-0.001701,0.001455,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
3,-0.066833,-0.001695,-0.000942,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
4,-0.066833,-0.001696,0.000292,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4898425,-0.066833,-0.001724,0.001852,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
4898426,-0.066833,-0.001716,-0.001330,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
4898427,-0.066833,-0.001717,0.003901,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573
4898428,-0.066833,-0.001716,0.000218,-0.002391,-0.015139,-0.001103,-0.026521,-0.004391,2.442793,-0.002097,...,-0.240993,-0.033048,-0.004991,-0.040683,-0.464665,-0.010422,-0.005733,-0.003195,0.555182,-0.014573


In [8]:
d.shape

(4898430, 121)

In [6]:
find(d, K=[11, 116])

Calculated SVD. Mean of SVs: 1928.633676608329
[ 0  1  2  3  4  5  6  7  8  9 10]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115]


In [6]:
stats

Unnamed: 0,K,weighted-entropy,TP,FP,TN,FN
0,11,0.0359559,3918980,54252,918528,6670


In [7]:
def precision(TP, FP):
    return TP / (TP + FP)

def recall(TP, FN):
    return TP /(TP + FN)

def F1(precision, recall):
    return 2*precision*recall/(precision + recall)

F1(precision(stats['TP'].values, stats['FP'].values), recall(stats['TP'].values, stats['FN'].values))

array([0.9922872629316403], dtype=object)

In [10]:
stats_original = pd.read_csv('./centroids/statistics.csv')
F1(precision(stats_original['TP'].values, stats_original['FP'].values), recall(stats_original['TP'].values, stats_original['FN'].values))

array([0.88975848, 0.88975848, 0.99700044, 0.99019595, 0.99021123,
       0.99021186, 0.99021199, 0.98989867, 0.9898988 , 0.99104026,
       0.99769201, 0.99768971, 0.99777644, 0.99794609, 0.99799483,
       0.99801746, 0.997773  , 0.99797718, 0.99799787, 0.99779739,
       0.99798925, 0.99757602, 0.99796416, 0.99799535, 0.99758253,
       0.99783856, 0.99764506, 0.99784593, 0.9976429 , 0.99764547,
       0.99784741, 0.99784728, 0.99764698, 0.9976443 , 0.99810112,
       0.99764928, 0.99764915, 0.99810087, 0.99773784, 0.99777366,
       0.99810623, 0.99810649, 0.99810598, 0.99810687, 0.99832611,
       0.9978072 , 0.99810509, 0.99810522, 0.99830939, 0.99853021,
       0.99832472, 0.99832484, 0.9985283 , 0.99830838, 0.99852855,
       0.99832522, 0.99852893, 0.9986022 , 0.99852893, 0.99852969,
       0.99866572, 0.9985274 , 0.99852817, 0.99860501, 0.99866645,
       0.99860742, 0.99864278, 0.9985657 , 0.99874488, 0.99864177,
       0.99881161, 0.99874604, 0.99879976, 0.99874769, 0.99874

In [6]:
stats = pd.DataFrame(index=np.arange(1), columns=['K', 'weighted-entropy', 'TP', 'FP', 'TN', 'FN'])
centroids = np.loadtxt(f"./centroids/{11}")
stats.iloc[0] = [11] + statistics(pdata, centroids)

NameError: name 'pdata' is not defined

In [7]:
#data_original
data_original.to_csv('cluster_labels/train_cluster_labeled', index=False)

In [11]:
a = pd.read_csv('cluster_labels/train_cluster_labeled')
a['K116-closest'].unique().shape


(116,)

In [12]:
for i in range(11):
    print((a['K116-closest']==i).shape)
    

(4898430,)
(4898430,)
(4898430,)
(4898430,)
(4898430,)
(4898430,)
(4898430,)
(4898430,)
(4898430,)
(4898430,)
(4898430,)


In [13]:
data_test = pd.read_csv('kddcup.testdata.unlabeled')
data_test

Unnamed: 0,0,udp,private,SF,105,146,0.1,0.2,0.3,0.4,...,1.2,1.3,1.00.1,0.00.6,1.00.2,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11
0,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0
1,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0
2,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0
3,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
4,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2984148,0,udp,private,SF,105,147,0,0,0,0,...,255,255,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0
2984149,0,udp,private,SF,105,105,0,0,0,0,...,255,255,1.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0
2984150,0,udp,private,SF,105,147,0,0,0,0,...,255,255,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0
2984151,0,udp,private,SF,105,105,0,0,0,0,...,255,255,1.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0


In [None]:
data_original.columns

In [None]:
data_test.columns

In [14]:
corrected = pd.read_csv('corrected')
corrected

Unnamed: 0,0,udp,private,SF,105,146,0.1,0.2,0.3,0.4,...,254,1.00.1,0.01,0.00.6,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
0,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0,normal.
1,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0,normal.
2,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
3,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,0,udp,private,SF,105,146,0,0,0,0,...,255,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311023,0,udp,private,SF,105,147,0,0,0,0,...,255,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0,normal.
311024,0,udp,private,SF,105,147,0,0,0,0,...,255,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0,normal.
311025,0,udp,private,SF,105,147,0,0,0,0,...,255,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0,normal.
311026,0,udp,private,SF,105,147,0,0,0,0,...,255,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0,normal.


In [None]:
corrected.columns