In [1]:
# g++ -O3 -shared -o c_functions.so -fPIC c_functions.cpp
try:
    import pandas as pd
    import numpy as np
    import math
    import csv
    import ctypes
    import os
    import geopandas as gpd
    from rtree import index
    from itertools import combinations
    import time
except:
    !pip install geopandas

In [2]:
# Delete any gmon.out in the working directory (-f: no error if it is absent).
# Presumably stale profiler output from a previous run of the native library — TODO confirm.
!rm -f gmon.out

In [3]:
# Load the compiled shared library from the current working directory.
# NOTE(review): the build command in the first cell writes c_functions.so,
# but this loads ./functions.so — confirm which file name is intended.
library = ctypes.CDLL('./functions.so')

In [4]:
# Wall-clock start of the whole run; the final cell subtracts it to print TOTAL TIME.
start = time.time()

In [5]:
# Time the native sub-region step. What subregion_main() reads or writes is
# not visible from this notebook; it prints prevalent patterns per sub-region.
s = time.time()
library.subregion_main()
e = time.time()
print("SUBREGION PROCESSING:", e - s, "SEC")

Degree 2 Prevalent Patterns for Sub-Region 0:
(0, 1)
(0, 2)
(0, 3)
(0, 7)
(1, 2)
(1, 4)
(1, 7)
(2, 3)
(2, 5)
(2, 7)
(3, 7)
Degree 3 Prevalent Patterns for Sub-Region 0: 
(0, 1, 2)
(0, 2, 3)
(1, 2, 7)
Degree 2 Prevalent Patterns for Sub-Region 1:
(0, 1)
(0, 2)
(0, 6)
(1, 6)
(2, 6)
(2, 8)
(3, 5)
SUBREGION PROCESSING: 1.2268645763397217 SEC


In [6]:
s = time.time()
# Parameters: These are the only things that need to be changed
# parameters: distance_threshold, prevalence_threshold
# distance_threshold is treated as kilometres by the border-detection cell.
distance_threshold = 71.22
prevalence_threshold = 0.55    # set the prevalence threshold
# shapefile path to determine border points
shapefile_path = '/home/amk7r/colocation_mining/updated_regional/Data/Region1/shapefile'
directory_path = '/home/amk7r/colocation_mining/updated_regional/Data/Region1'
subregions = []
dataframes = []
# offsets[i] is the global row index of the first row of sub-region i.
offsets = [0]
number_subregions = 0
# NOTE(review): os.listdir() order is arbitrary; if sub-region numbering must
# match the native library's, confirm the order it uses.
for filename in os.listdir(directory_path):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(directory_path, filename)
    df = pd.read_csv(file_path)
    first_row = offsets[-1]
    row_count = len(df)
    df.index = range(first_row, first_row + row_count)  # apply the offset
    # Running total of rows seen so far. The previous code appended
    # df.shape[number_subregions], i.e. the row count for the first file, the
    # COLUMN count for the second, and an IndexError from the third file on.
    offsets.append(first_row + row_count)
    dataframes.append(df)
    number_subregions += 1

# Drop the trailing entry (the total row count) so that offsets holds exactly
# one start index per sub-region.
offsets.pop()
e = time.time()
print("PREPROCESS:", e - s, "SEC")

PREPROCESS: 0.007802486419677734 SEC


In [7]:
class Border:
    """Holds the information pertaining to the border region.

    Attributes:
        combined_df: DataFrame, empty on construction.
        border_df: DataFrame, empty on construction.
    """

    def __init__(self):
        # Both tables start empty and are assigned by the caller afterwards.
        self.combined_df, self.border_df = pd.DataFrame(), pd.DataFrame()

In [8]:
# Timer for the border stage; it is read again when BORDER PROCESSING is printed.
s = time.time()
border = Border()

# Stack every sub-region table under a fresh 0..n-1 index, record that index
# as the point's ID, then sort the rows by featureType (renumbering the index).
border.combined_df = (
    pd.concat(dataframes, ignore_index=True)
    .assign(ID=lambda frame: frame.index)
    .sort_values(by='featureType', ignore_index=True)
)

In [9]:
# Display the combined table of all sub-region rows, sorted by featureType.
border.combined_df

Unnamed: 0,featureType,latitude,longitude,xCoordinate,yCoordinate,ID
0,0,34.607389,-86.979767,-9671.708811,3848.166081,0
1,0,42.880273,-87.900801,-9774.123119,4768.068811,373
2,0,47.462201,-122.254239,-13594.051140,5277.555959,372
3,0,47.242603,-122.454338,-13616.301130,5253.137775,371
4,0,47.658780,-117.426047,-13057.180680,5299.414546,370
...,...,...,...,...,...,...
3547,8,25.774591,-80.214195,-8919.411529,2866.003756,2908
3548,8,25.774591,-80.214195,-8919.411529,2866.003756,2909
3549,8,27.946530,-82.459269,-9169.052368,3107.512353,2910
3550,8,14.540831,-92.220917,-10254.498100,1616.866636,3536


In [10]:
# Read the shapefile and build one point geometry per row of combined_df,
# with both layers expressed in EPSG:4326.
shapefile = gpd.read_file(shapefile_path)
points = np.array(border.combined_df[['latitude', 'longitude']])
latitudes, longitudes = points[:, 0], points[:, 1]
# points_from_xy takes x first, so longitude goes before latitude.
point_geometries = gpd.points_from_xy(longitudes, latitudes)
points_gdf = gpd.GeoDataFrame(geometry=point_geometries).set_crs("EPSG:4326")
shapefile = shapefile.to_crs("EPSG:4326")

In [11]:
# Collect every point whose distance-threshold circle touches at least two
# shapefile geometries. The four lists are parallel: entry i of each describes
# the same point.
featureType = []
x = []
y = []
ID = []

# Convert radius from kilometers to degrees (assuming a spherical Earth).
# Approximately 111.32 km per degree of latitude. The value does not depend on
# the point, so it is computed once instead of on every iteration.
radius_deg = distance_threshold / 111.32

# Hoisted lookups: the geometries to test against and the columns to copy from.
region_geometries = list(shapefile['geometry'])
feature_col = border.combined_df['featureType']
x_col = border.combined_df['xCoordinate']
y_col = border.combined_df['yCoordinate']
id_col = border.combined_df['ID']

# points_gdf was built row-for-row from border.combined_df, so the enumerate
# position is the positional row index into that frame.
for curr_index, point in enumerate(points_gdf.geometry):
    # Create a circle geometry around the point.
    circle = point.buffer(radius_deg)

    # Count the geometries the circle intersects. The loop variable is named
    # `region` so it no longer shadows the module-level `border` object.
    intersected_count = sum(1 for region in region_geometries if circle.intersects(region))

    # Touching two or more geometries marks the point as a border point
    # (presumably each geometry is one sub-region — TODO confirm).
    if intersected_count >= 2:
        featureType.append(feature_col.iloc[curr_index])
        x.append(x_col.iloc[curr_index])
        y.append(y_col.iloc[curr_index])
        ID.append(id_col.iloc[curr_index])

In [12]:
# Turn the four parallel lists of border points into a table, one column each.
border_columns = dict(
    zip(
        ('featureType', 'xCoordinate', 'yCoordinate', 'ID'),
        (featureType, x, y, ID),
    )
)
border.border_df = pd.DataFrame(border_columns)

In [13]:
# Display the table of points selected as border points.
border.border_df

Unnamed: 0,featureType,xCoordinate,yCoordinate,ID
0,0,-13004.57470,3613.132698,2918
1,0,-13004.57470,3613.132698,2917
2,0,-13004.57470,3613.132698,2916
3,0,-13004.57503,3613.132809,2915
4,0,-13004.57503,3613.132809,2914
...,...,...,...,...
85,6,-10842.13160,2874.623141,3517
86,6,-13004.57470,3613.132698,3423
87,6,-12180.44382,3482.860836,3510
88,6,-13004.57470,3613.132698,3424


In [14]:
# IDs of the border points as a plain Python list; passed to the native
# library further down as a C int array.
ids = border.border_df['ID'].to_list()

In [15]:
# Write the border points to disk without the DataFrame index column.
# NOTE(review): absolute, user-specific path; presumably the native library
# reads this file — confirm before changing the location.
border.border_df.to_csv("/home/amk7r/colocation_mining/updated_regional/IntermediateData/border.csv", index=False)

In [16]:
# Number of border regions; also passed to combine_hashmaps and
# combine_instance_tables in later cells.
number_borders = 1
# Run the native border step. Its inputs are not visible from this notebook
# (presumably the border.csv written above — TODO confirm). The call is the
# cell's last expression, so its integer return value is displayed.
library.border_main(ctypes.c_int(number_borders))

enter
Degree 2 Prevalent Patterns for Sub-Region 0:
(0, 1)
(0, 2)
(0, 3)
(0, 6)
(1, 3)
(1, 6)
(2, 3)
(2, 6)
(3, 6)


0

In [17]:
# Copy the border-point IDs into a C int array of matching length.
arr_len = len(ids)
arr_type = ctypes.c_int * arr_len
arr_c = arr_type(*ids)
border_number = 0
# Declare the C signature (int*, int, int) so ctypes converts the arguments.
library.update_border_info.argtypes = (ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int)
library.update_border_info(arr_c, arr_len, border_number)
e = time.time()
# `s` was set in the cell that created `border`, so this figure covers the
# whole border stage, not just this cell.
print("BORDER PROCESSING:", e - s, "SEC")

BORDER PROCESSING: 9.573695182800293 SEC


In [18]:
# Start the timer for the combine stage; it is stopped in the next cell.
s = time.time()
# Declare the C signature (int, int), then call with the sub-region and border counts.
library.combine_hashmaps.argtypes = (ctypes.c_int, ctypes.c_int)
library.combine_hashmaps(number_subregions, number_borders)

0

In [19]:
# Declare the C signature (int, int), then call with the sub-region and border counts.
library.combine_instance_tables.argtypes = (ctypes.c_int, ctypes.c_int)
library.combine_instance_tables(number_subregions, number_borders)
e = time.time()
# `s` was set in the previous cell, so this covers combine_hashmaps as well.
print("PROCESSING:", e - s, "SEC")

PROCESSING: 0.011550188064575195 SEC


In [20]:
# Time the native whole-region step; it prints the prevalent patterns for the entire region.
s = time.time()
library.region_main()
e = time.time()
print("REGION PROCESSING", e - s, "SEC")

Degree 2 Prevalent Patterns for Entire Region:
(0, 1)
(0, 2)
(0, 3)
(1, 2)
(1, 6)
(2, 3)
(2, 5)
(2, 7)
326 / 545
182 / 296
1036 / 1358
401 / 545
1188 / 1358
688 / 1007
Degree 3 Prevalent Patterns for Entire Region:2:
(0, 1, 2)
(0, 2, 3)
REGION PROCESSING 1.2671544551849365 SEC


In [21]:
# Total wall-clock seconds since `start` was set near the top of the notebook.
end = time.time()
print("TOTAL TIME: ", end - start)

TOTAL TIME:  12.124194145202637


In [22]:
# region1 total time: 12.124194145202637
# region10 total time: 
# region6 total time: