In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import svd
import math
import copy
from collections import Counter
from sklearn.mixture import GMM

In [10]:
# Load the complete intrusion-detection table from the local Datasets folder.
intrusion_data = pd.read_csv('./Datasets/intrusion_detection_data.csv')

# Reproducible 80/20 split: sample 80% of the rows with a fixed seed for
# training, and keep every row whose index label was not sampled for validation.
intrusion_train = intrusion_data.sample(random_state=200, frac=0.8)
intrusion_validation = intrusion_data.loc[
    ~intrusion_data.index.isin(intrusion_train.index)
]

# Last expression of the cell: display the loaded frame.
intrusion_data

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,xAttack
0,0,20,491,0,0,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,45,146,0,0,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,50,0,0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,dos
3,0,25,232,8153,0,0,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,25,199,420,0,0,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
5,0,50,0,0,0,0,0,0,0,0,...,19,0.07,0.07,0.00,0.00,0.00,0.00,1.00,1.00,dos
6,0,50,0,0,0,0,0,0,0,0,...,9,0.04,0.05,0.00,0.00,1.00,1.00,0.00,0.00,dos
7,0,50,0,0,0,0,0,0,0,0,...,15,0.06,0.07,0.00,0.00,1.00,1.00,0.00,0.00,dos
8,0,52,0,0,0,0,0,0,0,0,...,23,0.09,0.05,0.00,0.00,1.00,1.00,0.00,0.00,dos
9,0,50,0,0,0,0,0,0,0,0,...,13,0.05,0.06,0.00,0.00,1.00,1.00,0.00,0.00,dos


In [63]:
def make_cov_mat(data):
    """Return the second-moment matrix ``X.T @ X / m`` of ``data``.

    ``X`` is ``data`` viewed as an ``(m, n)`` float matrix with one sample per
    row. No mean is subtracted here, so the result is the covariance matrix
    only when the columns of ``data`` are already mean-centred.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        Numeric samples, one per row.

    Returns
    -------
    numpy.ndarray
        The ``(n, n)`` matrix ``sum_i(x_i x_i^T) / m``.
    """
    samples = np.asarray(data, dtype=float)
    # One matrix product equals the sum of the per-row outer products that a
    # Python-level loop over the rows would accumulate, without the per-row cost.
    return (samples.T @ samples) / len(data)

In [127]:
def find_k_value(s, variance_threshold=0.90):
    """Return the smallest ``k`` whose leading values reach the threshold.

    Walks ``s`` from the front and stops at the first ``k`` for which
    ``sum(s[:k]) / sum(s) >= variance_threshold``. If the ratio never reaches
    the threshold, ``len(s)`` is returned.

    Parameters
    ----------
    s : sequence of numbers
        Singular values (or any non-negative spectrum), typically sorted in
        descending order as returned by ``svd``.
    variance_threshold : float, optional
        Fraction of the total that the leading values must retain.
        Defaults to ``0.90``.

    Returns
    -------
    int
        Number of leading components to keep (``1 <= k <= len(s)``).

    Raises
    ------
    ValueError
        If ``s`` is empty.
    """
    n_values = len(s)
    if n_values == 0:
        raise ValueError('find_k_value requires at least one singular value')

    total = sum(s)
    retained = 0
    # Iterate over the actual length of ``s`` rather than a fixed count, so
    # the function works for any number of features.
    for k in range(n_values):
        retained += s[k]
        if retained / total >= variance_threshold:
            break
    return k + 1

In [324]:
def dimension_reduction(data):
    """Project ``data`` onto its leading principal directions.

    Builds the second-moment matrix with ``make_cov_mat``, decomposes it with
    ``svd``, asks ``find_k_value`` how many leading components to keep, and
    projects every sample onto those components. The singular values are
    printed as a diagnostic.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric samples, one per row. ``make_cov_mat`` does no centring, so
        the caller is expected to pass mean-centred (standardised) data.

    Returns
    -------
    z_mat : numpy.ndarray
        The ``(m, k)`` reduced representation, one row per input sample.
    u_reduced : numpy.ndarray
        The ``(n, k)`` matrix whose columns are the retained directions.
    """
    cov_mat = make_cov_mat(data)
    u, s, _ = svd(cov_mat)
    print(s)
    k = find_k_value(s)
    u_reduced = u[:, :k]
    # Convert to a plain ndarray before projecting. Callers index the result
    # by row position (``z[i]``), which only works on an ndarray; the type of
    # ``ndarray @ DataFrame`` is not something to rely on across pandas
    # releases. ``X @ U`` is the same product as ``(U.T @ X.T).T``.
    samples = np.asarray(data, dtype=float)
    z_mat = samples @ u_reduced
    return z_mat, u_reduced

# Everything except the last column (the class label) is treated as a feature.
# NOTE(review): this keeps the leading `Unnamed: 0` column, which looks like a
# saved row index rather than a real feature -- confirm whether it should be dropped.
data = intrusion_data.iloc[:, :-1]
# Standardise each column to zero mean and unit standard deviation.
data = data.sub(data.mean(), axis='columns').div(data.std(), axis='columns')
# Reduced representation `z` and the retained directions `u_r`.
z, u_r = dimension_reduction(data)

[6.74131682e+00 4.84997929e+00 2.18496558e+00 1.88895798e+00
 1.47819881e+00 1.34407439e+00 1.13798223e+00 1.02873495e+00
 1.01503603e+00 1.00014789e+00 9.96052923e-01 9.40898304e-01
 7.91107080e-01 7.20715883e-01 6.50767922e-01 5.00647061e-01
 4.49465645e-01 4.06948620e-01 3.47987022e-01 2.12194982e-01
 9.89795611e-02 6.67638496e-02 5.12506468e-02 3.98868645e-02
 2.64486337e-02 1.63126880e-02 9.08728874e-03 4.14368391e-03
 7.17150204e-04]


In [325]:
def check_convergence(x, y):
    """Return how far apart ``x`` and ``y`` are.

    The value is ``np.linalg.norm`` with default arguments applied to the
    element-wise difference ``x - y``; it is zero exactly when every entry of
    the difference is zero.
    """
    displacement = x - y
    return np.linalg.norm(displacement)

In [326]:
def eucledian_dist(row, centroids):
    """Return the position in ``centroids`` of the centroid closest to ``row``.

    Closeness is the Euclidean distance over the first ``len(row)``
    coordinates. When several centroids are equally close the earliest one
    wins, and ``0`` is returned when ``centroids`` is empty.
    """
    nearest = 0
    shortest = float('Inf')
    n_dims = len(row)
    for position, centroid in enumerate(centroids):
        squared_sum = 0
        for axis in range(n_dims):
            squared_sum += (row[axis] - centroid[axis]) ** 2
        length = math.sqrt(squared_sum)
        # Strict comparison keeps the first of any equally distant centroids.
        if length < shortest:
            shortest, nearest = length, position
    return nearest

In [327]:
def shift_centroid_to_mean(points):
    """Return the coordinate-wise mean of ``points``.

    Parameters
    ----------
    points : sequence of equal-length sequences
        The samples currently assigned to one cluster.

    Returns
    -------
    list or None
        One mean per coordinate, or ``None`` when ``points`` is empty so the
        caller can leave that centroid where it is.
    """
    count = len(points)
    if count == 0:
        return None

    # Take the dimensionality from the points themselves instead of assuming
    # a fixed number of coordinates.
    n_dims = len(points[0])
    avg = []
    for axis in range(n_dims):
        total = 0
        for point in points:
            total += point[axis]
        avg.append(total / count)
    return avg

In [334]:
def calc_purity(clusters, data, k, labels=None):
    """Print the purity of each of the ``k`` clusters.

    A cluster's purity is the share of its members that carry the cluster's
    most common label.

    Parameters
    ----------
    clusters : sequence
        Cluster id of every sample; ``clusters[i]`` belongs to sample ``i``.
    data : sized
        The clustered samples; only ``len(data)`` is used.
    k : int
        Number of clusters; ids ``0 .. k-1`` are reported.
    labels : sequence, optional
        True label of every sample, in the same order as ``clusters``.
        Defaults to the last column of the notebook-level ``intrusion_data``.

    Returns
    -------
    None
        Results are printed, one line per cluster.
    """
    if labels is None:
        labels = intrusion_data.iloc[:, -1]
    # Materialise once so labels are looked up by position, matching how
    # ``clusters`` is indexed.
    labels = list(labels)

    for c in range(k):
        members = [labels[i] for i in range(len(data)) if clusters[i] == c]
        if not members:
            # An empty cluster has no majority label; report it instead of
            # failing on ``most_common(1)[0]``.
            print('Purity of cluster - ' + str(c) + ' is undefined (cluster is empty)')
            continue
        cluster_label, num_of_occurences = Counter(members).most_common(1)[0]
        purity = num_of_occurences / len(members)
        print('Purity of cluster - ' + str(c) + ' i.e. - ' + str(cluster_label) + ' is: ' + str(purity))

In [338]:
def kmeans(data, k, max_iter=None, random_state=None):
    """Cluster ``data`` into ``k`` groups with Lloyd's k-means and print purity.

    Centroids start at ``k`` distinct, randomly chosen rows of ``data``. Each
    pass assigns every row to its nearest centroid (``eucledian_dist``) and
    then moves each centroid to the mean of its rows
    (``shift_centroid_to_mean``); a centroid with no rows stays where it is.
    Iteration stops when the centroids no longer move. Progress (centroid
    movement and the set of cluster ids in use) is printed on every pass, and
    the per-cluster purity is printed at the end via ``calc_purity``.

    Parameters
    ----------
    data : 2-D array-like
        Samples, one per row.
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(data)``.
    max_iter : int, optional
        Upper bound on the number of passes. ``None`` (default) runs until
        the centroids stop moving.
    random_state : int, optional
        Seed for the centroid initialisation. ``None`` (default) draws from
        NumPy's global random state.

    Raises
    ------
    ValueError
        If ``k`` is not between 1 and the number of rows.
    """
    data = np.asarray(data)
    n_points = len(data)
    if not 1 <= k <= n_points:
        raise ValueError('k must be between 1 and the number of rows in data')

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Pick k distinct ROW indices. The bound is the row count, not the largest
    # value in the data, and sampling without replacement avoids starting two
    # centroids on the same row.
    seed_rows = rng.choice(n_points, size=k, replace=False)
    centroids = np.array(data[seed_rows], dtype=np.float32)
    centroids_old = np.zeros(centroids.shape)
    clusters = np.zeros(n_points)

    convergence = check_convergence(centroids, centroids_old)
    print(convergence)
    iteration = 0
    while convergence != 0 and (max_iter is None or iteration < max_iter):
        # Assignment step: nearest centroid for every row.
        for i in range(n_points):
            clusters[i] = eucledian_dist(data[i], centroids)
        print(np.unique(clusters))

        # Update step: move each non-empty cluster's centroid to its mean.
        centroids_old = centroids.copy()
        for i in range(k):
            new = shift_centroid_to_mean(data[clusters == i])
            if new is not None:
                centroids[i] = new

        convergence = check_convergence(centroids, centroids_old)
        print(convergence)
        iteration += 1

    calc_purity(clusters, data, k)

# Cluster the reduced data `z` into 5 groups; kmeans prints the centroid
# movement per pass and, at the end, the purity of each cluster.
kmeans(z, 5)

7.459925443669484
[0. 1. 2. 3. 4.]
4.143994
[0. 1. 2. 3. 4.]
1.665245
[0. 1. 2. 3. 4.]
1.0091935
[0. 1. 2. 3. 4.]
0.59315103
[0. 1. 2. 3. 4.]
0.13321522
[0. 1. 2. 3. 4.]
0.017708987
[0. 1. 2. 3. 4.]
0.0027795706
[0. 1. 2. 3. 4.]
0.0005352232
[0. 1. 2. 3. 4.]
0.00016393905
[0. 1. 2. 3. 4.]
0.0
Purity of cluster - 0 i.e. - dos is: 0.9783475267258561
Purity of cluster - 1 i.e. - normal is: 0.8621935560930455
Purity of cluster - 2 i.e. - normal is: 0.7341520079654829
Purity of cluster - 3 i.e. - dos is: 0.45787132328530644
Purity of cluster - 4 i.e. - dos is: 0.9939152820032764


In [None]:
# Fit the mixture on the feature columns only: the last column (`xAttack`)
# holds string class labels and cannot be fed to a numeric model.
# NOTE(review): `sklearn.mixture.GMM` was deprecated in scikit-learn 0.18 and
# removed in 0.20; on newer releases use `sklearn.mixture.GaussianMixture`.
gmm_features = intrusion_data.iloc[:, :-1]
gmm = GMM(n_components=4).fit(gmm_features)
labels = gmm.predict(gmm_features)