In [1]:
try:
    import pandas as pd
    from sklearn.neighbors import NearestNeighbors
    import numpy as np
    import math
    import matplotlib.pyplot as plt
    from kneed import KneeLocator
    import os
    import re
    from datetime import datetime
    import time
    from scipy.spatial.distance import euclidean
    from rtree import index
except:
    !pip3 install pandas numpy matplotlib kneed rtree

In [2]:
directory = 'data1'
files = os.listdir(directory)
csv_files = [file for file in files if file.endswith('.csv')]
dataframes = []
for csv_file in csv_files:
    file_path = os.path.join(directory, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df = combined_df.sort_values(by='featureType', ignore_index=True)
combined_df

Unnamed: 0,featureType,latitude,longitude,xCoordinate,yCoordinate
0,0,26.587351,49.984146,5557.983448,2956.378544
1,0,14.782391,42.973093,4778.389924,1643.726883
2,0,12.825748,44.794380,4980.907798,1426.158108
3,0,12.828622,44.796322,4981.123739,1426.477682
4,0,15.554965,48.172800,5356.570962,1729.633192
...,...,...,...,...,...
7183,8,15.325444,44.211895,4916.138421,1704.111621
7184,8,15.325444,44.211895,4916.138421,1704.111621
7185,8,15.325444,44.211895,4916.138421,1704.111621
7186,8,15.325444,44.211895,4916.138421,1704.111621


In [3]:
# x and y values of the original data
X = np.array(df[['xCoordinate', 'yCoordinate']].values)
# attack types of the original data
attack_types = np.array(df['featureType'])
data_to_append = []
num_pts = len(X)
averages = []

In [4]:
# implementing RTree indexing
from scipy.spatial import distance

start_time = time.time()
max_k = round(math.sqrt(len(X))) + 1
all_distances = np.empty((0, max_k))
dp_table = np.zeros((len(X), max_k), dtype=float) #dynamic programming table

for val_to_exclude in np.unique(attack_types):
    mask1 = attack_types == val_to_exclude
    mask2 = attack_types != val_to_exclude
    current_attack_X = X[mask1]  # X of the current attack type
    dataset = X[mask2] # Dataset not including the current attack type

    idx = index.Index()   # create a spatial index
    for i, point in enumerate(dataset):
        x_coord, y_coord = point[0], point[1]
        idx.insert(i, (x_coord, y_coord, x_coord, y_coord))  # Insert each point as a bounding box

    for point in current_attack_X:
        x_coord, y_coord = point[0], point[1]
        nearest_ids = list(idx.nearest((x_coord, y_coord, x_coord, y_coord), max_k))
        if len(nearest_ids) > 0:
            dists = [distance.euclidean(np.array([x_coord, y_coord]), dataset[nid][:2]) for nid in nearest_ids]
            sorted_dists = np.sort(dists)[:max_k] 
            all_distances = np.concatenate((all_distances, [sorted_dists]), axis=0)

# Summing distances for each k from 1 to 3
dp_table[:, 2] = np.sum(all_distances[:, :3], axis=1)

# Calculating cumulative sums for each k from 3 to max_k
for i in range(len(X)):
    for k in range(3, max_k):
        dp_table[i, k] = dp_table[i, k - 1] + all_distances[i][k]

column_sums = np.sum(dp_table, axis=0)
averages = []
for k in range(2, max_k):
    average = column_sums[k] / (len(X) * (k + 1))
    averages.append(average)

end_time = time.time()
data_to_append.append([start_time, end_time, abs(start_time - end_time), num_pts])

In [5]:
k_value = 0    
x = range(3, len(averages) + 3)
y = averages    
kn = KneeLocator(x, y, curve='concave', direction='increasing')
averages = np.array(averages)
if kn.knee == 3 or kn.knee == len(averages) or kn.knee == None:
    # Calculate differences between neighboring points
    differences = averages[1:] - averages[:-1]

    for i in range(len(differences)):
        if i > 0:
            if temp < differences[i]:
                k_value = i
                distance_threshold = averages[i - 1]
                break
        temp = differences[i]     
else:
    k_value = kn.knee
    distance_threshold = averages[k_value]

In [6]:
# Check the value of distance_threshold
print(distance_threshold)

# Open a file in write mode
#with open('/home/amk7r/compression_code/distance_threshold_parameter.txt', 'w') as file:
with open('required_files/distance_threshold_parameter.txt', 'w') as file:
    # Write to the file
    file.write(str(distance_threshold))
    print("File written successfully.")

# Check if the file is created in the expected location
import os
print(os.listdir(os.getcwd()))

7.532365734109275
File written successfully.
['distance_threhold.ipynb', 'c_functions.so', 'regional_colocation.ipynb', 'data1', 'data2', 'c_functions.cpp', 'required_files', '.ipynb_checkpoints', 'ReadMe.md']
