In [28]:
import pandas as pd
# Reading data
all_data_matrix = pd.read_csv("TimeBasedFeatures-10s-Layer2.csv")
all_data_matrix

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,10.0.2.15,57188,82.161.239.177,110,6,7248168,2.112603e+04,29.110804,34515.085714,2.738693e+05,...,7,0,0,0,0,0,0,0,0,AUDIO
1,10.0.2.15,57188,82.161.239.177,110,6,5157723,1.052790e+03,3.683796,286540.166667,8.788385e+05,...,451,0,0,0,0,0,0,0,0,AUDIO
2,10.0.2.15,57188,82.161.239.177,110,6,41,1.324390e+07,48780.487805,41.000000,0.000000e+00,...,0,0,0,0,0,0,0,0,0,AUDIO
3,10.0.2.15,57188,82.161.239.177,110,6,9543565,6.373855e+04,83.930900,11929.456250,9.007219e+04,...,5,0,0,0,0,0,0,0,0,AUDIO
4,10.0.2.15,57188,82.161.239.177,110,6,9987537,1.996166e+05,245.806348,4069.900978,2.036573e+04,...,5,0,0,0,0,0,0,0,0,AUDIO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8039,82.161.239.177,110,10.0.2.15,47257,6,7413083,2.991468e+03,8.633385,117667.984127,6.261953e+05,...,217,0,0,0,0,0,0,0,0,CHAT
8040,82.161.239.177,110,10.0.2.15,47257,6,9807892,6.089994e+02,2.039174,516204.842105,1.971833e+06,...,1134,1420,0,1420,1420,8646908,0,8646908,8646908,CHAT
8041,82.161.239.177,110,10.0.2.15,47257,6,9979796,9.793787e+02,3.306681,311868.625000,1.289556e+06,...,371,2241210,0,2241210,2241210,7341693,0,7341693,7341693,CHAT
8042,82.161.239.177,110,10.0.2.15,47257,6,1327,8.183873e+05,2260.738508,663.500000,5.861915e+02,...,0,0,0,0,0,0,0,0,0,CHAT


In [34]:
import nmslib 
import numpy 
import sys 
import time 
import math
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


#Data
all_data_matrix = pd.read_csv("TimeBasedFeatures-10s-Layer2.csv")

df = all_data_matrix.drop('Source IP', axis=1)
label = df[['label']]
df = df.drop(' Destination IP', axis=1)
print("all data matrix shape after")
print(all_data_matrix.shape)
# Encode label to numbers
le = preprocessing.LabelEncoder()
le.fit(df.label)
df['label'] = le.transform(df.label)
all_data_matrix = df.to_numpy()


(data_matrix_train, query_matrix) = train_test_split(all_data_matrix, test_size = 0.2)
print("train split is:")
print(data_matrix_train.shape)
print(query_matrix.shape)

# We are setting index parameters.
NN = 50 # neighbours 
efC = 100 #the depth of the search 
num_threads = 4
index_time_params = {'NN': NN, 'indexThreadQty': num_threads, 'efConstruction': efC}

#Initialising number of neighbours
K=100

# Space name should correspond to the space name
space_name='kldivgenfast'

# Intitialize the library, specify the space, the type of the vector and add data points
index = nmslib.init(method='sw-graph', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)
index.addDataPointBatch(data_matrix_train)


# Create an index
start = time.time()
index.createIndex(index_time_params)
end = time.time()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

# We are setting query-time parameters
efS = 1500
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

# Querying
query_qty = query_matrix.shape[0]
start = time.time()
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time()
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %(end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

# Computing gold-standard data
print('Computing gold-standard data')
start = time.time()
gs = []

query_qty = query_matrix.shape[0]
data_qty = data_matrix_train.shape[0]

for i in range(query_qty):
    q = query_matrix[i, :]
    d = numpy.log(data_matrix_train * (1.0 / q))
    dist_vals = numpy.sum(data_matrix_train * d, axis=1)
    tmp = [(dist_vals[i], i) for i in range(data_qty)]
    tmp.sort()
    gs.append([tmp[i][1] for i in range(K)])

end = time.time()

print('brute-force kNN time total=%f (sec), per query=%f (sec)' %(end - start, float(end - start) / query_qty))


# Finally computing recall
recall=0.0
for i in range(0, query_qty):
  correct_set = set(gs[i])
  ret_set = set(nbrs[i][0])
  recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)
recall = recall / query_qty
print('kNN recall %f' % recall)

# Save a meta index and the data
index.saveIndex('dense_index_kldiv.txt', save_data=True)

# Re-intitialize the library, specify the space, the type of the vector.
newIndex = nmslib.init(method='sw-graph', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)

# Re-load the index and the data
newIndex.loadIndex('dense_index_kldiv.txt', load_data=True)

# Setting query-time parameters and querying
print('Setting query-time parameters', query_time_params)
newIndex.setQueryTimeParams(query_time_params)

K = 500000000
query_qty = query_matrix.shape[0]
start = time.time()
new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = 1)
end = time.time()
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' 
      %(end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

# Finally computing recall
recall=0.0
for i in range(0, query_qty):
    correct_set = set(gs[i])
    ret_set = set(new_nbrs[i][0])
    recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)
recall = recall / query_qty
print()
print('Final Value of Recall : %f' % recall)
print("DONE")

all data matrix shape after
(8044, 29)
train split is:
(6435, 27)
(1609, 27)
index 1
<nmslib.FloatIndex method='sw-graph' space='kldivgenfast' at 000002467CB1E6D0>
Index-time parameters {'NN': 50, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 0.280799
index
<nmslib.FloatIndex method='sw-graph' space='kldivgenfast' at 000002467CB1E6D0>
Setting query-time parameters {'efSearch': 1500}
kNN time total=0.684352 (sec), per query=0.000425 (sec), per query adjusted for thread number=0.001701 (sec)
Computing gold-standard data




brute-force kNN time total=7.853710 (sec), per query=0.004881 (sec)
kNN recall 0.018073
Setting query-time parameters {'efSearch': 1500}
kNN time total=2.648669 (sec), per query=0.001646 (sec), per query adjusted for thread number=0.006585 (sec)

Final Value of Recall : 0.897116
DONE


TypeError: list indices must be integers or slices, not tuple