In [1]:
try:
    import pandas as pd
    from sklearn.neighbors import NearestNeighbors
    import numpy as np
    import math
    import matplotlib.pyplot as plt
    from kneed import KneeLocator
    import os
    import re
    from datetime import datetime
    import time
    from scipy.spatial.distance import euclidean
    from rtree import index
    from scipy.spatial import distance
except:
    !pip3 install pandas numpy matplotlib kneed rtree

In [2]:
directory = 'real_world_data/NorthAmerica'
files = os.listdir(directory)
csv_files = [file for file in files if file.endswith('.csv')]
dataframes = []
for csv_file in csv_files:  # read each file
    file_path = os.path.join(directory, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df = combined_df.sort_values(by='featureType', ignore_index=True)
combined_df

Unnamed: 0,featureType,latitude,longitude,xCoordinate,yCoordinate
0,0,34.607389,-86.979767,-9671.708811,3848.166081
1,0,42.880273,-87.900801,-9774.123119,4768.068811
2,0,47.462201,-122.254239,-13594.051140,5277.555959
3,0,47.242603,-122.454338,-13616.301130,5253.137775
4,0,47.658780,-117.426047,-13057.180680,5299.414546
...,...,...,...,...,...
3547,8,25.774591,-80.214195,-8919.411529,2866.003756
3548,8,25.774591,-80.214195,-8919.411529,2866.003756
3549,8,27.946530,-82.459269,-9169.052368,3107.512353
3550,8,14.540831,-92.220917,-10254.498100,1616.866636


In [3]:
# x and y values of the original data
X = np.array(df[['xCoordinate', 'yCoordinate']].values)
# attack types of the original data
attack_types = np.array(df['featureType'])
data_to_append = []
num_pts = len(X)
averages = []

In [4]:
# implementing RTree indexing
start_time = time.time()
max_k = round(math.sqrt(len(X))) + 1
all_distances = np.empty((0, max_k))
dp_table = np.zeros((len(X), max_k), dtype=float) #dynamic programming table

for val_to_exclude in np.unique(attack_types):
    mask1 = attack_types == val_to_exclude
    mask2 = attack_types != val_to_exclude
    current_attack_X = X[mask1]  # X of the current attack type
    dataset = X[mask2] # data set not including the current attack type

    idx = index.Index()   # create a spatial index
    for i, point in enumerate(dataset):
        x_coord, y_coord = point[0], point[1]
        idx.insert(i, (x_coord, y_coord, x_coord, y_coord))  # insert each point as a bounding box

    for point in current_attack_X:
        x_coord, y_coord = point[0], point[1]
        nearest_ids = list(idx.nearest((x_coord, y_coord, x_coord, y_coord), max_k))
        if len(nearest_ids) > 0:
            dists = [distance.euclidean(np.array([x_coord, y_coord]), dataset[nid][:2]) for nid in nearest_ids]
            sorted_dists = np.sort(dists)[:max_k] 
            all_distances = np.concatenate((all_distances, [sorted_dists]), axis=0)

# summing distances for each k from 1 to 3
dp_table[:, 2] = np.sum(all_distances[:, :3], axis=1)

# calculating cumulative sums for each k from 3 to max_k
for i in range(len(X)):
    for k in range(3, max_k):
        dp_table[i, k] = dp_table[i, k - 1] + all_distances[i][k]

column_sums = np.sum(dp_table, axis=0)
averages = []
for k in range(2, max_k):
    average = column_sums[k] / (len(X) * (k + 1))
    averages.append(average)

end_time = time.time()
data_to_append.append([start_time, end_time, abs(start_time - end_time), num_pts])

In [5]:
k_value = 0    
x = range(3, len(averages) + 3)
y = averages    
kn = KneeLocator(x, y, curve='concave', direction='increasing')
averages = np.array(averages)
if kn.knee == 3 or kn.knee == len(averages) or kn.knee == None:
    # calculate differences between neighboring points
    differences = averages[1:] - averages[:-1]

    for i in range(len(differences)):
        if i > 0:
            if temp < differences[i]:
                k_value = i
                distance_threshold = averages[i - 1]
                break
        temp = differences[i]     
else:
    k_value = kn.knee
    distance_threshold = averages[k_value - 3]

In [6]:
print(distance_threshold)

with open('required_files/distance_threshold_parameter.txt', 'w') as file:
    file.write(str(distance_threshold))
    print("File written successfully.")

76.56457118073217
File written successfully.
