In [1]:
# g++ -O3 -shared -o c_functions.so -fPIC c_functions.cpp
import pandas as pd
import numpy as np
import math
import csv
import ctypes
import os
import geopandas as gpd
from rtree import index
from itertools import combinations

In [2]:
# Parameters: These are the only things that need to be changed
# prevalence_threshold, shapefile_path, directory_path
with open('required_files/distance_threshold_parameter.txt', 'r') as file:   # file to read distance threshold path
    distance_threshold = float(file.read())
    
prevalence_threshold = 0.55    # set the prevalence threshold
# shapefile path to determine border points
shapefile_path = 'real_world_data/NorthAmerica/shapefile'
# path that holds the data (csv files of sub-regions)
directory_path = 'real_world_data/NorthAmerica'

In [3]:
# This class holds the information and functions pertaining to each subregion
class Subregion:
    def __init__(self, data):
        self.df = pd.DataFrame(data)
        self.df['featureType'] = self.df['featureType'].astype(str)
        self.featureInfo = {}  # Dictionary to store count, start row ID, and end row ID for each feature type
        self.star_neighbors = {}  # Dictionary to store neighbors of different types within distance_threshold for each instance

    # This function will calculate the feature ranges for each type of feature
    def calc_feature_info(self, offset):
        # Initialize variables to track count, start row ID, and end row ID
        count = 0
        start_row_id = 0
        prev_feature = None
        featureInfo = {}

        # Iterate through the DataFrame
        for i, row in self.df.iterrows():
            feature = row['featureType']

            # If feature type changes, update feature_info for the previous feature
            if feature != prev_feature:
                if prev_feature is not None:
                    featureInfo[prev_feature] = {'count': count, 'start': start_row_id, 'end': i - 1}
                count = 1
                start_row_id = i
                prev_feature = feature
            else:
                count += 1

        # Update feature_info for the last feature
        if prev_feature is not None:
            featureInfo[prev_feature] = {'count': count, 'start': start_row_id, 'end': len(self.df) - 1 + offset}

        # Check if the last feature only has one occurrence
        if prev_feature is not None and count == 1:
            featureInfo[prev_feature] = {'count': count, 'start': start_row_id, 'end': start_row_id}
                
        self.featureInfo = featureInfo
        
    # Calculates the distance between two points
    def calc_distance(self, point1, point2):
        x1, y1 = point1['xCoordinate'], point1['yCoordinate']
        x2, y2 = point2['xCoordinate'], point2['yCoordinate']
        return math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
    
    # This function will calculate the star neighbors for each instance
    def calc_star_neighbors(self, distance_threshold):
        star_neighbors = {}
        # Create a spatial index
        idx = index.Index()
        # Insert each point into the spatial index with its unique identifier
        for i, row in self.df.iterrows():
            x_coord = row['xCoordinate']
            y_coord = row['yCoordinate']
            idx.insert(i, (x_coord, y_coord, x_coord, y_coord))

        # Iterate over each row in the DataFrame
        for i, row in self.df.iterrows():
            row_id = i
            feature_type = row['featureType']
            x_coord = row['xCoordinate']
            y_coord = row['yCoordinate']

            # Query the spatial index to find nearby points within the distance threshold
            nearby_points = list(idx.intersection((x_coord - distance_threshold, 
                                                   y_coord - distance_threshold, 
                                                   x_coord + distance_threshold, 
                                                   y_coord + distance_threshold)))

            # Filter neighbors based on distance and feature type, and ensure they are greater than the key
            points_to_add = sorted([j for j in nearby_points if j != row_id and self.calc_distance(self.df.loc[j], self.df.loc[row_id]) <= distance_threshold and self.df.loc[j, 'featureType'] != feature_type and j > row_id])
            # Store the nearby points in the dictionary
            star_neighbors[row_id] = points_to_add
            
        self.star_neighbors = star_neighbors

In [4]:
# Read in the data and calculate offsets
subregions = []
dataframes = []
offsets = [0]
number_subregions = 0
for filename in os.listdir(directory_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory_path, filename)
        df = pd.read_csv(file_path)
        df.index = range(offsets[number_subregions], offsets[number_subregions] + len(df))  # Apply the offset to the indices of the DataFrame
        offsets.append(df.shape[number_subregions])  # Keeps track of offsets by storing in a list
        subregions.append(Subregion(df))  # Store instance of the subregion class in a list
        dataframes.append(df)  # Append the DataFrame to a list of DataFrames
        number_subregions += 1
        
offsets.pop()  # Get rid of last offset because it is not needed

5

In [5]:
# calculate the featureInfo, star_neighbors and write to files
for i in range(len(subregions)):
    subregions[i].calc_feature_info(offsets[i])
    subregions[i].calc_star_neighbors(distance_threshold)
    subregions[i].features = subregions[i].featureInfo.keys()
    
    # write the featureInfo to a csv file
    with open('required_files/featureInfo/featureInfo' + str(i) + '.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['feature', 'count', 'start', 'end'])
        for feature, values in subregions[i].featureInfo.items():
            writer.writerow([feature, values['count'], values['start'], values['end']])
            
    # write the star_neighbor to a csv file
    with open('required_files/starNeighbors/starNeighbors' + str(i) + '.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['feature', 'star neighbors'])
        for feature, neighbors in subregions[i].star_neighbors.items():
            values_str = ' '.join(map(str, neighbors))
            writer.writerow([feature, values_str])

In [6]:
# load cpp shared library
lib = ctypes.CDLL('./c_functions.so')

In [7]:
# calculate prevalent patterns for the sub-regions
lib.subregion_main(ctypes.c_int(number_subregions), ctypes.c_double(prevalence_threshold))

Degree 2 Prevalent Patterns for Sub-Region 0:
(0, 1)
(0, 2)
(0, 3)
(0, 7)
(1, 2)
(1, 4)
(1, 7)
(2, 3)
(2, 5)
(2, 7)
(3, 7)
Degree 3 Prevalent Patterns for Sub-Region 0:
(0, 1, 2)
(0, 2, 3)
(1, 2, 7)
Degree 2 Prevalent Patterns for Sub-Region 1:
(0, 1)
(0, 2)
(0, 6)
(0, 8)
(1, 6)
(2, 6)
(2, 8)
(3, 5)


0

In [8]:
# This class holds the information pertaining to the border region
class Border:
    def __init__(self):
        self.combined_df = pd.DataFrame()
        self.border_df = pd.DataFrame()
        self.featureInfo = {}
        self.star_neighbors = {}
        
    # This function will calculate the feature ranges for each type of feature
    def calc_feature_info(self):
        # Initialize variables to track count, start row ID, and end row ID
        count = 0
        start_row_id = 0
        prev_feature = None
        featureInfo = {}

        # Iterate through the DataFrame
        for i, row in self.border_df.iterrows():
            feature = row['featureType']

            # If feature type changes, update feature_info for the previous feature
            if feature != prev_feature:
                if prev_feature is not None:
                    featureInfo[prev_feature] = {'count': count, 'start': start_row_id, 'end': i - 1}
                count = 1
                start_row_id = i
                prev_feature = feature
            else:
                count += 1

        # Update feature_info for the last feature
        if prev_feature is not None:
            featureInfo[prev_feature] = {'count': count, 'start': start_row_id, 'end': len(self.border_df) - 1}

        # Check if the last feature only has one occurrence
        if prev_feature is not None and count == 1:
            featureInfo[prev_feature] = {'count': count, 'start': start_row_id, 'end': start_row_id}
                
        self.featureInfo = featureInfo
        
    def calc_distance(self, point1, point2):
        x1, y1 = point1['xCoordinate'], point1['yCoordinate']
        x2, y2 = point2['xCoordinate'], point2['yCoordinate']
        return math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
        
    # This function will calculate the star neighbors for each instance
    def calc_star_neighbors(self, distance_threshold):
        star_neighbors = {}
        # Create a spatial index
        idx = index.Index()
        # Insert each point into the spatial index with its unique identifier
        for i, row in self.border_df.iterrows():
            x_coord = row['xCoordinate']
            y_coord = row['yCoordinate']
            idx.insert(i, (x_coord, y_coord, x_coord, y_coord))

        # Iterate over each row in the DataFrame
        for i, row in self.border_df.iterrows():
            row_id = i
            feature_type = row['featureType']
            x_coord = row['xCoordinate']
            y_coord = row['yCoordinate']

            # Query the spatial index to find nearby points within the distance threshold
            nearby_points = list(idx.intersection((x_coord - distance_threshold, y_coord - distance_threshold, x_coord + distance_threshold, y_coord + distance_threshold)))

            # Filter neighbors based on distance and feature type, and ensure they are greater than the key
            points_to_add = sorted([j for j in nearby_points if j != row_id and self.calc_distance(self.border_df.loc[j], self.border_df.loc[row_id]) <= distance_threshold and 
                                    self.border_df.loc[j, 'featureType'] != feature_type and j > row_id])
            # Store the nearby points in the dictionary
            star_neighbors[row_id] = points_to_add
            
        self.star_neighbors = star_neighbors

In [9]:
border = Border()
border.combined_df = pd.concat(dataframes, ignore_index=True)  # combine the Dataframes
border.combined_df['ID'] = border.combined_df.index

# sort the df by featureType
border.combined_df = border.combined_df.sort_values(by='featureType', ignore_index=True)

In [10]:
# read the shapefile
shapefile = gpd.read_file(shapefile_path)
points = np.array(border.combined_df[['latitude', 'longitude']])
points_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(points[:, 1], points[:, 0]))
points_gdf = points_gdf.set_crs("EPSG:4326")
shapefile = shapefile.to_crs("EPSG:4326")

In [11]:
# find the border instances
featureType = []
x = []
y = []
ID = []
curr_index = 0
for point in points_gdf.geometry:
    # Convert radius from kilometers to degrees (assuming a spherical Earth)
    radius_deg = distance_threshold / 111.32  # Approximately 111.32 km per degree of latitude
    # Create a circle geometry
    circle = point.buffer(radius_deg)
    
    # Find the borders that the point intersects
    intersected_borders = [border for border in shapefile['geometry'] if circle.intersects(border)]
    
    if len(intersected_borders) >= 2:
        featureType.append(border.combined_df['featureType'].iloc[curr_index])
        x.append(border.combined_df['xCoordinate'].iloc[curr_index])
        y.append(border.combined_df['yCoordinate'].iloc[curr_index])
        ID.append(border.combined_df['ID'].iloc[curr_index])
    curr_index += 1

In [12]:
border.border_df = pd.DataFrame({
    'featureType': featureType,
    'xCoordinate': x,
    'yCoordinate': y,
    'ID': ID})

In [13]:
border.border_df

Unnamed: 0,featureType,xCoordinate,yCoordinate,ID
0,0,-13004.57470,3613.132698,2918
1,0,-13004.57470,3613.132698,2917
2,0,-13004.57470,3613.132698,2916
3,0,-13004.57503,3613.132809,2915
4,0,-13004.57503,3613.132809,2914
...,...,...,...,...
85,6,-10842.13160,2874.623141,3517
86,6,-13004.57470,3613.132698,3423
87,6,-12180.44382,3482.860836,3510
88,6,-13004.57470,3613.132698,3424


In [14]:
ids = border.border_df['ID'].to_list()  # store the IDs from the Dataframe in a list

In [15]:
# calculate featureInfo, star_neighbors for the border region and write to file
border.calc_feature_info()
# write the featureInfo to a csv file
with open('required_files/border_featureInfo/featureInfo.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['feature', 'count', 'start', 'end'])
    for feature, values in border.featureInfo.items():
        writer.writerow([feature, values['count'], values['start'], values['end']])
        
border.calc_star_neighbors(distance_threshold)
# write the star_neighbor to a csv file
with open('required_files/border_starNeighbors/starNeighbors.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['feature', 'star neighbors'])
    for feature, neighbors in border.star_neighbors.items():
        values_str = ' '.join(map(str, neighbors))
        writer.writerow([feature, values_str])

In [16]:
# calculate the degree 2 prevalent patterns for border region
number_borders = 1
lib.border_main(ctypes.c_int(number_borders), ctypes.c_double(prevalence_threshold))

Degree 2 Prevalent Patterns for Border 0:
(0, 1)
(0, 2)
(0, 3)
(0, 6)
(1, 2)
(1, 3)
(1, 6)
(2, 3)
(2, 6)
(3, 6)


0

In [17]:
# update star_neighbors and featureInfo so that the original indices are used
arr_len = len(ids)
arr_type = ctypes.c_int * arr_len
arr_c = arr_type(*ids)
border_number = 0
lib.update_border_info.argtypes = (ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int)
lib.update_border_info(arr_c, arr_len, border_number)

0

In [18]:
# combine the hashmaps for the subregions and border region
lib.combine_hashmaps.argtypes = (ctypes.c_int, ctypes.c_int)
lib.combine_hashmaps(number_subregions, number_borders)

0

In [19]:
# combine the instance tables for the subregions and the border region
lib.combine_instance_tables.argtypes = (ctypes.c_int, ctypes.c_int)
lib.combine_instance_tables(number_subregions, number_borders)

0

In [20]:
# calculate prevalent patterns for entire region
features = list(border.combined_df['featureType'].unique())
string_ptrs = (ctypes.c_char_p * len(features))()
string_ptrs[:] = [s.encode() for s in features]
lib.region_main.argtypes = (ctypes.c_int, ctypes.c_double, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int)
lib.region_main(number_subregions, prevalence_threshold, string_ptrs, len(features))

Degree 2 Prevalent Patterns for Entire Region:
(0, 1)
(0, 2)
(0, 3)
(1, 2)
(1, 6)
(2, 3)
(2, 5)
(2, 7)
Degree 3 Prevalent Patterns for Entire Region:
(0, 1, 2)
(0, 2, 3)


0