In [1]:
try:
    import pandas as pd
    import numpy as np
    import math
    import itertools
    import time
    from itertools import chain, combinations
    import csv
    import matplotlib.pyplot as plt
    from rtree import index
    import bisect
    import ctypes
except:
    !pip install pandas numpy matplotlib scipy rtree

In [2]:
# Tunable parameters of the run:
#   distance_threshold   -- maximum distance at which two instances count as neighbors
#   prevalence_threshold -- value handed to the native colocation_main routine
distance_threshold, prevalence_threshold = 10, 0.5

In [3]:
# Load the test case (feature types are kept as strings), then order the rows
# by feature type and renumber the index from 0.
data = (
    pd.read_csv("synthetic_data/TestCase1_1.csv", dtype={'featureType': str})
    .sort_values(by='featureType', ignore_index=True)
)

In [4]:
# Show the sorted DataFrame as this cell's output.
data

Unnamed: 0,featureType,xCoordinate,yCoordinate
0,1,5367.2,8088.7
1,1,5024.5,2214.5
2,1,8699.6,5145.2
3,1,1900.2,6739.2
4,1,8719.2,2444.2
...,...,...,...
160210,9,8237.7,8301.6
160211,9,3849.7,636.4
160212,9,34.2,6671.2
160213,9,8338.6,952.1


In [5]:
# For each instance (row id): sorted row ids of the neighbors that have a
# different feature type and lie within distance_threshold.
star_neighbors = dict()

In [6]:
# Shared containers used by the notebook
# Per feature type: instance count, first row id and last row id
featureInfo = dict()
# Instance table of each candidate co-location pattern
# (may later be restricted to the prevalent patterns only)
instance_table = dict()
# Unique instance ids of each participating feature, used to calculate PI
hashmap = dict()
# Prevalent patterns found so far (not populated by the cells shown here)
finalPrevalantPatterns = list()

In [7]:
# Utility: range lookup on a sorted sequence
def findNeighborsInRange(arr, x, y):
    """Return the elements of ``arr`` whose values lie in the closed range [x, y].

    ``arr`` must already be sorted in ascending order, because both range
    boundaries are located by binary search (``bisect``).

    Args:
        arr: Sorted, sliceable sequence.
        x: Lower bound (inclusive).
        y: Upper bound (inclusive).

    Returns:
        The slice of ``arr`` between the two boundaries; empty when no element
        falls inside the range.
    """
    lower = bisect.bisect_left(arr, x)    # first position holding a value >= x
    upper = bisect.bisect_right(arr, y)   # first position holding a value > y
    return arr[lower:upper]

In [8]:
# Build featureInfo: for every feature type, the number of instances together
# with the first and last row id of its contiguous run of rows in `data`.
#
# `data` was sorted by featureType and re-indexed from 0 when it was loaded,
# so a row's position is also its index label and each feature type forms one
# contiguous run.
#
# Only the featureType column is needed, so it is walked as a plain list and
# split into runs with itertools.groupby. This avoids DataFrame.iterrows(),
# which constructs a full Series for every row.
start_row_id = 0
for feature, run in itertools.groupby(data['featureType'].tolist()):
    count = sum(1 for _ in run)  # length of the current run
    featureInfo[feature] = {
        'count': count,
        'start': start_row_id,
        'end': start_row_id + count - 1,  # inclusive; equals start for a single-row run
    }
    start_row_id += count

featureInfo

{'1': {'count': 15195, 'start': 0, 'end': 15194},
 '10': {'count': 15048, 'start': 15195, 'end': 30242},
 '11': {'count': 3000, 'start': 30243, 'end': 33242},
 '12': {'count': 3000, 'start': 33243, 'end': 36242},
 '13': {'count': 3000, 'start': 36243, 'end': 39242},
 '2': {'count': 15195, 'start': 39243, 'end': 54437},
 '3': {'count': 15195, 'start': 54438, 'end': 69632},
 '4': {'count': 15195, 'start': 69633, 'end': 84827},
 '5': {'count': 15195, 'start': 84828, 'end': 100022},
 '6': {'count': 15048, 'start': 100023, 'end': 115070},
 '7': {'count': 15048, 'start': 115071, 'end': 130118},
 '8': {'count': 15048, 'start': 130119, 'end': 145166},
 '9': {'count': 15048, 'start': 145167, 'end': 160214}}

In [9]:
# Persist featureInfo as CSV: a header line, then one line per feature type
with open('required_files/featureInfo.csv', 'w', newline='') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(['feature', 'count', 'start', 'end'])
    csv_out.writerows(
        [name, info['count'], info['start'], info['end']]
        for name, info in featureInfo.items()
    )

In [10]:
import math
def calc_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Args:
        point1: Object indexable by ``'xCoordinate'`` and ``'yCoordinate'``.
        point2: Object indexable by ``'xCoordinate'`` and ``'yCoordinate'``.

    Returns:
        The straight-line distance between the two points.
    """
    dx = point2['xCoordinate'] - point1['xCoordinate']
    dy = point2['yCoordinate'] - point1['yCoordinate']
    return math.sqrt(dx**2 + dy**2)

In [11]:
# Build the star neighborhood of every instance.
# Note: the R-tree query returns every point inside the axis-aligned square
# around an instance, not inside the circle of radius distance_threshold, so
# the candidates have to be filtered again by their actual distance.
from rtree import index
data['xCoordinate'] = data['xCoordinate'].astype(float)
data['yCoordinate'] = data['yCoordinate'].astype(float)

# Extract the needed columns once, keyed by row id (index label). Looking rows
# up with data.loc[...] inside the neighbor filter would build a new Series for
# every candidate pair, which dominates the runtime on large inputs.
points = data[['xCoordinate', 'yCoordinate']].to_dict('index')
feature_types = data['featureType'].to_dict()

# Create a spatial index
idx = index.Index()
# Insert each point into the spatial index (as a zero-area rectangle) with its
# row id as the unique identifier
for row_id, point in points.items():
    x_coord, y_coord = point['xCoordinate'], point['yCoordinate']
    idx.insert(row_id, (x_coord, y_coord, x_coord, y_coord))

# Collect the neighbors of each instance
for row_id, point in points.items():
    feature_type = feature_types[row_id]
    x_coord, y_coord = point['xCoordinate'], point['yCoordinate']

    # Query the spatial index for the candidates inside the bounding square
    nearby_points = idx.intersection((
        x_coord - distance_threshold,
        y_coord - distance_threshold,
        x_coord + distance_threshold,
        y_coord + distance_threshold,
    ))

    # Keep only candidates with a larger row id (so each pair is stored once,
    # under its smaller id, and an instance is never its own neighbor), a
    # different feature type, and a true distance within the threshold.
    # The cheap checks come first so the distance is computed only when needed.
    star_neighbors[row_id] = sorted(
        j for j in nearby_points
        if j > row_id
        and feature_types[j] != feature_type
        and calc_distance(points[j], point) <= distance_threshold
    )

In [12]:
# Persist star_neighbors as CSV: a header line, then one line per instance
# holding its row id and its neighbor ids joined by single spaces
with open('required_files/starNeighbors.csv', 'w', newline='') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(['feature', 'star neighbors'])
    csv_out.writerows(
        [row_id, ' '.join(str(neighbor_id) for neighbor_id in neighbor_ids)]
        for row_id, neighbor_ids in star_neighbors.items()
    )

In [13]:
# Load the compiled shared library and run its entry point.
# Change the path according to where the shared cpp library is created.
lib = ctypes.CDLL('c_functions.so')
# The prevalence threshold is passed explicitly as a C double. The value
# returned by the call is shown as this cell's output.
# NOTE(review): the routine presumably reads the CSV files written by the
# earlier cells -- confirm against the C++ source.
threshold_arg = ctypes.c_double(prevalence_threshold)
lib.colocation_main(threshold_arg)

Degree 2 Prevalent Patterns:
(1, 2)
(1, 3)
(1, 4)
(1, 5)
(10, 6)
(10, 7)
(10, 8)
(10, 9)
(2, 3)
(2, 4)
(2, 5)
(3, 4)
(3, 5)
(4, 5)
(6, 7)
(6, 8)
(6, 9)
(7, 8)
(7, 9)
(8, 9)
Degree 3 Prevalent Patterns:
(1, 2, 3)
(1, 2, 4)
(1, 2, 5)
(1, 3, 4)
(1, 3, 5)
(1, 4, 5)
(10, 6, 7)
(10, 6, 8)
(10, 6, 9)
(10, 7, 8)
(10, 7, 9)
(10, 8, 9)
(2, 3, 4)
(2, 3, 5)
(2, 4, 5)
(3, 4, 5)
(6, 7, 8)
(6, 7, 9)
(6, 8, 9)
(7, 8, 9)
Degree 4 Prevalent Patterns:
(1, 2, 3, 4)
(1, 2, 3, 5)
(1, 2, 4, 5)
(1, 3, 4, 5)
(10, 6, 7, 8)
(10, 6, 7, 9)
(10, 6, 8, 9)
(10, 7, 8, 9)
(2, 3, 4, 5)
(6, 7, 8, 9)
Degree 5 Prevalent Patterns:
(1, 2, 3, 4, 5)
(10, 6, 7, 8, 9)


0