### Data Overview
Uses dummy/generated data; any dataset works **as long as every record has latitude and longitude attributes**. <br />

Goal: design the partitioning algorithm and its pivot points so that location searches in the database and on the server can be optimized.

In [None]:
import json

# Load the dummy dataset first.
#
# The JSON file must contain a list of objects: [{...}, {...}, ..., {...}]
# where each object has at least:
# - _id: int or str
# - latitude: float
# - longitude: float
fname = "data/generated.json"

# Stays None when the file is missing or cannot be decoded.
datafile = None

try:
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # platform locale and can mis-decode non-ASCII names or addresses.
    with open(fname, "r", encoding="utf-8") as file:
        datafile = json.load(file)

except json.JSONDecodeError as e:
    print("Failed to decode JSON:", e)

except FileNotFoundError:
    print("The file was not found.")


In [None]:
len(datafile)

In [None]:
type(datafile[0])

In [None]:
datafile[0].keys()

### Map data into Pandas Dataframe 

In [None]:
# Source keys read from every raw JSON record, listed in the same order as the
# output columns declared in `temp_data` below.
column_names = [
    # main attributes
    '_id',
    'index',
    'latitude',
    'longitude',

    # optional attributes
    'isActive',
    'address',
    'company',
    'name',
]

# Column-oriented container: one (initially empty) list per output column.
temp_data = {
    column: []
    for column in (
        'uuid', 'id_index', 'lat', 'long',
        # optional attributes
        'isActive', 'address', 'company', 'name',
    )
}

# Pair each output column with its source key by position and copy the value.
for d in datafile:
    for column, source_key in zip(temp_data, column_names):
        temp_data[column].append(d[source_key])

In [None]:
import pandas as pd

df = pd.DataFrame(temp_data)

In [None]:
df.head()

### Geopandas Data

In [None]:
import geopandas as gpd
from shapely.geometry import Point

# One point geometry per row: x comes from the "long" column, y from "lat".
point_geometry = gpd.points_from_xy(df["long"], df["lat"])

# Attach the points to the tabular data, declared as EPSG:4326 coordinates.
gdf = gpd.GeoDataFrame(df, geometry=point_geometry, crs="EPSG:4326")

gdf.head()

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Declare the frame's CRS as EPSG:4326 before plotting.
gdf = gdf.set_crs(epsg=4326)

# Figure size and marker styling for the scatter of data points.
point_style = {"figsize": (10, 6), "color": "red", "markersize": 20}
ax = gdf.plot(**point_style)

ax.set_title("Data Points Map")
plt.show()

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Plot every data point, then overlay a search circle around a reference
# centre so the radius coverage can be inspected visually.
gdf = gdf.set_crs(epsg=4326)

ax = gdf.plot(
    figsize=(10, 6),
    color="red",
    markersize=20,
    label="Data points"
)

center_lat = -6.2
center_lon = 106.8
center_point = gpd.GeoSeries(
    [Point(center_lon, center_lat)],
    crs="EPSG:4326"
)

radius_km = 5000
radius_m = radius_km * 1000

# Buffer in an azimuthal-equidistant projection centred on the point, where
# distances measured from the centre are true metres. Web Mercator (EPSG:3857)
# stretches distances with latitude, so a buffer built there is not a circle
# of the requested ground radius.
aeqd_crs = (
    f"+proj=aeqd +lat_0={center_lat} +lon_0={center_lon} "
    "+datum=WGS84 +units=m +no_defs"
)
circle = center_point.to_crs(aeqd_crs).buffer(radius_m)

# Back to EPSG:4326 so the overlay shares the axes' coordinate system.
# NOTE(review): a circle that reaches a pole or crosses the antimeridian would
# need extra handling; the current centre/radius stays clear of both.
circle_4326 = circle.to_crs(epsg=4326)
center_4326 = center_point  # already expressed in EPSG:4326

# Draw only the outline (as a line) so the label can appear in the legend.
circle_4326.boundary.plot(
    ax=ax,
    color="blue",
    linewidth=2,
    label=f"Radius {radius_km} km"  # derived from radius_km so it cannot drift
)

center_4326.plot(
    ax=ax,
    color="blue",
    markersize=50,
    marker="x",
    label="Center"
)

ax.set_title("Data Points Map with Radius")
# The labels above are only visible once a legend is drawn; a fixed location
# avoids the slow "best" placement search on large point sets.
ax.legend(loc="upper right")
plt.show()


### Working on Creating and Designing the Clustering Method

In [None]:
# sliding window the latitude
df['dist_next_lat'] = df['lat'].diff().abs()

# sliding window the latitude
df['dist_next_long'] = df['long'].diff().abs()

In [None]:
df.head()

In [None]:
df['dist_next_lat'].mean()

In [None]:
df['dist_next_long'].mean()

In [None]:
df['dist_next_long'].median()

In [None]:
df['dist_next_lat'].median()

#### Plot the data distribution and Subplot data `long and lat distance`

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

# Distribution of the absolute row-to-row latitude differences:
# boxplot on the left, density histogram with a fitted normal curve on the right.
data = df['dist_next_lat'].dropna()

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
box_ax, hist_ax = axs

# Left panel: spread and outliers.
box_ax.boxplot(data, vert=True, showfliers=True)
box_ax.set_title("Boxplot dist_next_lat")
box_ax.set_ylabel("Latitude Difference")
box_ax.grid(axis="y", alpha=0.3)

# Right panel: histogram normalised to a density.
hist_ax.hist(data, bins=30, density=True, alpha=0.6)

# Fit a normal distribution and evaluate its density over the observed range.
mu, std = norm.fit(data)
x = np.linspace(data.min(), data.max(), 100)
p = norm.pdf(x, loc=mu, scale=std)

hist_ax.plot(x, p, linewidth=2)
hist_ax.set_title(f"Gaussian Fit\nμ={mu:.5f}, σ={std:.5f}")
hist_ax.set_xlabel("Latitude Difference")
hist_ax.set_ylabel("Density")

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

# Distribution of the absolute row-to-row longitude differences:
# boxplot on the left, density histogram with a fitted normal curve on the right.
data = df['dist_next_long'].dropna()

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
box_ax, hist_ax = axs

# Left panel: spread and outliers.
box_ax.boxplot(data, vert=True, showfliers=True)
box_ax.set_title("Boxplot dist_next_long")
box_ax.set_ylabel("Longitude Difference")
box_ax.grid(axis="y", alpha=0.3)

# Right panel: histogram normalised to a density.
hist_ax.hist(data, bins=30, density=True, alpha=0.6)

# Fit a normal distribution and evaluate its density over the observed range.
mu, std = norm.fit(data)
x = np.linspace(data.min(), data.max(), 100)
p = norm.pdf(x, loc=mu, scale=std)

hist_ax.plot(x, p, linewidth=2)
hist_ax.set_title(f"Gaussian Fit\nμ={mu:.5f}, σ={std:.5f}")
hist_ax.set_xlabel("Longitude Difference")
hist_ax.set_ylabel("Density")

plt.tight_layout()
plt.show()

In [None]:
print(df['lat'].max(), df['lat'].min())
print(df['long'].max(), df['long'].min())

In [None]:
# Find how many latitude values will serve as centre points for the pivot
# circles: walk from the truncated minimum latitude towards the truncated
# maximum in steps of 60 (range() stops before the maximum itself).
print(f"min: {df['lat'].min()}")
for i in range(int(df['lat'].min()), int(df['lat'].max()), 60):
    print(i)
print(f"max: {df['lat'].max()}")

# Output recorded from an earlier run, kept as a string literal for reference.
'''
min: -89.153062
-89
-29
31
max: 89.255691
'''

# Chosen latitude centres: -89, -29, 31, plus 89 so the upper end is represented.

In [None]:
# Find how many longitude values will serve as centre points for the pivot
# circles: walk from the truncated minimum longitude towards the truncated
# maximum in steps of 60 (range() stops before the maximum itself).

print(f"min: {df['long'].min()}")
for i in range(int(df['long'].min()), int(df['long'].max()), 60):
    print(i)
print(f"max: {df['long'].max()}")

# Output recorded from an earlier run, kept as a string literal for reference.
'''
min: -177.386922
-177
-117
-57
3
63
123
max: 165.523263
'''

# Chosen longitude centres: -177, -57, 3, 63, 123.
# NOTE(review): -117 appears in the recorded output but not in this list,
# leaving a 120-wide gap between -177 and -57 — confirm this is intentional.

#### Check The Circle Radius Coverage **[FAILED]**
`Abandoned: extra work would be needed to guarantee that every point is covered by at least one circle.`

In [None]:
# Candidate circle-centre latitudes taken from the pivot scan earlier in the notebook.
lats = [-89, -29, 31, 89]
# Candidate circle-centre longitudes.
# NOTE(review): the longitude scan also printed -117, which is missing here and
# leaves a 120-wide gap between -177 and -57 — confirm this is intentional.
longs = [-177, -57, 3, 63, 123]

# NOTE(review): no code visible in this notebook reads this module-level list;
# the coverage functions build their own local `coverage_res`.
coverage_res = []

import numpy as np

def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points, using the haversine formula.

    Accepts scalars or NumPy arrays (evaluated element-wise).

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius_km: Radius of the sphere; defaults to 6371 (Earth radius in km).

    Returns:
        The distance along the sphere, in the same unit as ``radius_km``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt/arcsin into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return radius_km * c

def do_coverage(row, dist=5000):
    """List the candidate circle centres that lie within ``dist`` of a row's point.

    Args:
        row: Mapping-like record with ``'lat'`` and ``'long'`` entries.
        dist: Maximum haversine distance for a centre to count as covering
            the point (same unit as ``haversine`` returns).

    Returns:
        One dict per covering centre (``center_lat``, ``center_lon``,
        ``dist_km``), in latitude-major order of the module-level ``lats``
        and ``longs``. Empty when no centre is close enough.
    """
    point_lat, point_lon = row['lat'], row['long']

    # Distance from the point to every (lat, lon) centre combination.
    candidates = [
        (lat, lon, haversine(point_lat, point_lon, lat, lon))
        for lat in lats
        for lon in longs
    ]

    return [
        {"center_lat": lat, "center_lon": lon, "dist_km": d}
        for lat, lon, d in candidates
        if d <= dist
    ]
        

In [None]:
df['circle_coverage'] = df.apply(do_coverage, axis=1)

df

In [None]:
df[df['circle_coverage'].apply(len) == 0]

#### Check The Grid Area Coverage

In [None]:
# Two latitude bands and four longitude bands give eight rectangular areas.
lat_bins = [(-90, 0), (0, 90)]
lon_bins = [(-180, -90), (-90, 0), (0, 90), (90, 180)]

# Every (latitude band, longitude band) combination, latitude-major, so the
# areas are numbered west-to-east within each latitude band.
band_pairs = [
    (lat_band, lon_band)
    for lat_band in lat_bins
    for lon_band in lon_bins
]

grid_area_lists = [
    {
        "name": f"AREA {area_number}",
        "lat_min": lat_min,
        "lat_max": lat_max,
        "lon_min": lon_min,
        "lon_max": lon_max,
    }
    for area_number, ((lat_min, lat_max), (lon_min, lon_max))
    in enumerate(band_pairs, start=1)
]


In [None]:
len(grid_area_lists)

In [None]:
def under_the_area(point_lat, point_lon, area):
    """Test whether a point lies inside a grid area and measure its offset.

    Args:
        point_lat: Latitude of the point.
        point_lon: Longitude of the point.
        area: Dict with ``lat_min``, ``lat_max``, ``lon_min`` and ``lon_max``.

    Returns:
        A tuple ``(isUnder, dlat, dlon)``: whether the point is inside the
        area, and the point's latitude/longitude offsets from the area centre.
    """
    lat_min, lat_max = area["lat_min"], area["lat_max"]
    lon_min, lon_max = area["lon_min"], area["lon_max"]

    # Areas are half-open so a shared interior edge belongs to exactly one
    # area. The outermost edges (latitude 90, longitude 180) have no
    # neighbouring area, so they are closed; otherwise a point lying exactly
    # on them would match no area at all.
    in_lat = lat_min <= point_lat < lat_max or (point_lat == lat_max == 90)
    in_lon = lon_min <= point_lon < lon_max or (point_lon == lon_max == 180)
    isUnder = bool(in_lat and in_lon)

    center_lat = (lat_min + lat_max) / 2
    center_lon = (lon_min + lon_max) / 2

    dlat = point_lat - center_lat
    dlon = point_lon - center_lon

    return isUnder, dlat, dlon


In [None]:
def do_coverage(row):
    """Return the grid area(s) whose bounds contain the point stored in ``row``.

    NOTE: this reuses the name of the circle-based ``do_coverage`` defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        row: Mapping-like record with ``'lat'`` and ``'long'`` entries.

    Returns:
        One dict per matching entry of the module-level ``grid_area_lists``:
        the area name, its bounds, and the point's offsets from the area
        centre rounded to 4 decimals. Empty when no area matches.
    """
    point_lat = row['lat']
    point_lon = row['long']

    coverage_res = []
    for area in grid_area_lists:
        isUnder, dlat, dlong = under_the_area(point_lat, point_lon, area)

        if isUnder:
            coverage_res.append({
                "area": area["name"],
                "lat_min": area["lat_min"],
                "lat_max": area["lat_max"],
                "lon_min": area["lon_min"],
                "lon_max": area["lon_max"],
                "delta_lat": round(dlat, 4),
                "delta_long": round(dlong, 4),
            })

    # The per-row debug print was removed: under DataFrame.apply it emitted one
    # line per row, and the same data is returned to the caller anyway.
    return coverage_res


In [None]:
df['grid_coverage'] = df.apply(do_coverage, axis=1)

df

### Create The Data Splitter

In [None]:
from collections import defaultdict

def build_grid_index(lat_bins, lon_bins):
    """Build the list of grid cells from latitude and longitude bands.

    Args:
        lat_bins: Iterable of ``(lat_min, lat_max)`` pairs.
        lon_bins: Iterable of ``(lon_min, lon_max)`` pairs.

    Returns:
        A list of dicts, one per (latitude band, longitude band) combination
        in latitude-major order, each labelled ``Area_<n>`` counting from 1.
    """
    cells = [
        (lat_range, lon_range)
        for lat_range in lat_bins
        for lon_range in lon_bins
    ]

    return [
        {
            "index": f"Area_{number}",
            "minimum-latitude": lat_min,
            "maximum-latitude": lat_max,
            "minimum-longitude": lon_min,
            "maximum-longitude": lon_max,
        }
        for number, ((lat_min, lat_max), (lon_min, lon_max))
        in enumerate(cells, start=1)
    ]


def find_grid(lat, lon, grid):
    """Return the index of the grid cell containing a point.

    Cells are half-open (``min <= value < max``) so a shared interior edge
    belongs to exactly one cell. The grid's outermost upper edges are treated
    as closed; otherwise a point lying exactly on them (for example latitude
    90 or longitude 180 on a whole-globe grid) would match no cell.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.
        grid: Iterable of cell dicts as produced by ``build_grid_index``.

    Returns:
        The matching cell's ``"index"`` value, or ``None`` when no cell
        contains the point.
    """
    areas = list(grid)

    # Largest upper bounds in the grid: only cells touching these edges get a
    # closed upper bound.
    top_lat = max((area["maximum-latitude"] for area in areas), default=None)
    top_lon = max((area["maximum-longitude"] for area in areas), default=None)

    for area in areas:
        lat_max = area["maximum-latitude"]
        lon_max = area["maximum-longitude"]

        lat_ok = (
            area["minimum-latitude"] <= lat < lat_max
            or lat == lat_max == top_lat
        )
        lon_ok = (
            area["minimum-longitude"] <= lon < lon_max
            or lon == lon_max == top_lon
        )

        if lat_ok and lon_ok:
            return area["index"]

    return None


def data_splitter(data_list, grid):
    """Group records by the grid cell that contains their coordinates.

    Args:
        data_list: Iterable of dicts with ``"latitude"`` and ``"longitude"``.
        grid: Grid cells passed through to ``find_grid``.

    Returns:
        A plain dict mapping each cell index to its list of records. Records
        for which ``find_grid`` finds no cell are collected under ``"OOB"``.
    """
    buckets = defaultdict(list)

    for record in data_list:
        cell_index = find_grid(record["latitude"], record["longitude"], grid)
        # A falsy result means no cell matched the point.
        bucket_key = cell_index if cell_index else "OOB"
        buckets[bucket_key].append(record)

    return dict(buckets)

In [None]:
# latitude and longitude boundaries 

lat_bins = [(-90, -30), (-30, 30), (30, 90)]
lon_bins = [
    (-180, -120), (-120, -60), (-60, 0),
    (0, 60), (60, 120), (120, 180)
]

GRID = build_grid_index(lat_bins, lon_bins)

In [None]:
len(GRID)

In [None]:
data_splited = data_splitter(datafile, GRID)

In [None]:
data_splited.keys()

### Convert the Split Data Into JSON Files So They Can Be Inserted Directly

In [None]:

import json

def convert(data, name):
    """Write ``data`` to the file ``name`` as JSON indented by 4 spaces.

    Failures are not raised: any exception is caught and reported with
    ``print``, and the function returns ``None`` either way.

    Args:
        data: JSON-serialisable object to store.
        name: Path of the output file (opened for writing as UTF-8).
    """
    try:
        with open(name, mode='w', encoding='utf-8') as out_file:
            json.dump(data, out_file, indent=4)
            # Reported while the file is still open, right after the dump.
            print(f"JSON in FILE {name} created!!")
    except Exception as err:
        print(f"Error happened {err!s}")

def convert_all_data_container(data_splited):
    """Write every bucket of ``data_splited`` to its own JSON file.

    Args:
        data_splited: Dict mapping a bucket key to its list of records.

    Returns:
        The list of target paths (``./data/<key>.json``), in the dict's
        iteration order. A path is listed whether or not ``convert`` managed
        to write it, because ``convert`` only prints on failure.
    """
    targets = [
        (f"./data/{key}.json", records)
        for key, records in data_splited.items()
    ]

    for path, records in targets:
        convert(records, name=path)

    return [path for path, _ in targets]

In [None]:
convert_all_data_container(data_splited)

## Implementation
`How to use the DataSplitter class to cluster the data`

In [None]:
# class from the ./version/module.py
import json
from collections import defaultdict
import pandas as pd
import pymongo

class DataSplitter:
    """Split geo-tagged records into grid areas, export them and upload them.

    Typical use: ``runSplit(records)`` buckets the records by grid cell and
    writes one JSON file per bucket, then ``UploadData(uri, db_name)`` stores
    each bucket in a MongoDB collection named after its cell.
    """

    def __init__(
        self,
    ):
        # Buckets produced by the last runSplit() call (None until then).
        self.dataSplited = None
        # Paths of the JSON files targeted by the last runSplit() call.
        self.result_files = None

        self.LAT_BINS = [(-90, -30), (-30, 30), (30, 90)]
        self.LON_BINS = [
            (-180, -120), (-120, -60), (-60, 0),
            (0, 60), (60, 120), (120, 180)
        ]

        self.GRID = self.build_grid_index(self.LAT_BINS, self.LON_BINS)

    def build_grid_index(self, lat_bins, lon_bins):
        """Build one cell dict per (latitude band, longitude band) pair.

        Cells are produced in latitude-major order and labelled ``Area_<n>``
        counting from 1.
        """
        grid = []

        area_id = 1

        for lat_min, lat_max in lat_bins:
            for lon_min, lon_max in lon_bins:
                grid.append({
                    "index": f"Area_{area_id}",
                    "minimum_latitude": lat_min,
                    "maximum_latitude": lat_max,
                    "minimum_longitude": lon_min,
                    "maximum_longitude": lon_max,
                })
                area_id += 1

        return grid

    def find_grid(self, lat, lon, grid):
        """Return the index of the cell containing ``(lat, lon)``, else None.

        Cells are half-open (``min <= value < max``) so a shared interior edge
        belongs to exactly one cell. The grid's outermost upper edges are
        closed; otherwise a point exactly at latitude 90 or longitude 180
        would match no cell and be classified as out of bounds.
        """
        areas = list(grid)

        # Largest upper bounds in the grid: only cells touching these edges
        # get a closed upper bound.
        top_lat = max((a["maximum_latitude"] for a in areas), default=None)
        top_lon = max((a["maximum_longitude"] for a in areas), default=None)

        for area in areas:
            lat_max = area["maximum_latitude"]
            lon_max = area["maximum_longitude"]

            lat_ok = (
                area["minimum_latitude"] <= lat < lat_max
                or lat == lat_max == top_lat
            )
            lon_ok = (
                area["minimum_longitude"] <= lon < lon_max
                or lon == lon_max == top_lon
            )

            if lat_ok and lon_ok:
                return area["index"]

        return None

    def data_splitter(self, data_list, grid):
        """Group records by containing cell; unmatched records go to "OOB".

        Each record must provide ``"latitude"`` and ``"longitude"`` keys.
        """
        data_container = defaultdict(list)

        for data in data_list:
            lat = data["latitude"]
            lon = data["longitude"]
            area = self.find_grid(lat, lon, grid)

            if area:
                data_container[area].append(data)
            else:
                data_container["OOB"].append(data)

        return dict(data_container)

    def convert(self, data, name):
        """Write ``data`` to ``name`` as indented JSON; errors are printed."""
        try:
            with open(name, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4)
                print(f"JSON in FILE {name} created!!")
        except Exception as E:
            print(f"Error happened {str(E)}")

    def convert_all_data_container(self, data_splited):
        """Write one ``./data/<key>.json`` file per bucket.

        Returns:
            The list of target paths, in bucket order. The original returned
            nothing, which left ``result_files`` permanently None.
        """
        file_names = []
        for k, item in data_splited.items():
            fname = f"./data/{k}.json"
            self.convert(item, name=fname)
            file_names.append(fname)

        return file_names

    # In case the data was loaded as a pandas DataFrame.
    def CSV2JSON_Convert(self, dataframe):
        """Convert a DataFrame into a list of per-row dicts.

        The round trip through JSON text yields plain JSON-compatible values.
        """
        # The pandas keyword is `orient`; `orientation` raises a TypeError.
        json_data = dataframe.to_json(orient="records")
        json_data = json.loads(json_data)
        return json_data

    def UploadToDatabase(
        self,
        DB_URI, DB_NAME,
        COLLECTION_NAME, data_list
    ):
        """Insert ``data_list`` into one MongoDB collection.

        Errors are reported with ``print`` instead of being raised.

        Returns:
            True when the documents were inserted, False otherwise.
        """
        if not DB_URI:
            # MongoClient(None) silently falls back to localhost, which would
            # hide a missing or misnamed environment variable.
            print("Error happened: DB_URI is empty; nothing uploaded")
            return False

        if not data_list:
            # insert_many rejects an empty document list.
            print(f"No data to store in Collection: {COLLECTION_NAME}")
            return False

        try:
            # The context manager closes the connection even on failure.
            with pymongo.MongoClient(DB_URI) as client:
                db = client[DB_NAME]

                collection = db[COLLECTION_NAME]

                collection.insert_many(data_list)

            print(
                f"Data stored in DB: {DB_NAME}, Collection: {COLLECTION_NAME}"
            )
            return True

        except Exception as E:
            print(f"Error happened: {str(E)}")
            return False

    def runSplit(self, dataJSON):
        """Bucket ``dataJSON`` by grid cell and write one JSON file per bucket.

        Stores the buckets in ``self.dataSplited`` and the target file paths
        in ``self.result_files``.
        """
        grid = self.GRID
        data_splited = self.data_splitter(dataJSON, grid)
        self.result_files = self.convert_all_data_container(data_splited)
        self.dataSplited = data_splited

    def UploadData(self, DB_URI, DB_NAME):
        """Upload every bucket from the last ``runSplit`` to its own collection.

        Returns:
            A list of ``{"DB_NAME": ..., "COL_NAME": ...}`` dicts, one per
            bucket that was stored successfully. Empty when ``runSplit`` has
            not been called yet.
        """
        if self.dataSplited is None:
            print("Error happened: call runSplit() before UploadData()")
            return []

        res_temp = []
        for k, item in self.dataSplited.items():
            if self.UploadToDatabase(DB_URI, DB_NAME, k, item):
                res_temp.append({"DB_NAME": DB_NAME, "COL_NAME": k})

        return res_temp
    

In [None]:
spliter = DataSplitter()

In [None]:
# Load the dummy dataset first.
#
# The JSON file must contain a list of objects: [{...}, {...}, ..., {...}]
# where each object has at least:
# - _id: int or str
# - latitude: float
# - longitude: float
fname = "data/generated.json"

# Stays None when the file is missing or cannot be decoded.
datafile = None

try:
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # platform locale and can mis-decode non-ASCII names or addresses.
    with open(fname, "r", encoding="utf-8") as file:
        datafile = json.load(file)

except json.JSONDecodeError as e:
    print("Failed to decode JSON:", e)

except FileNotFoundError:
    print("The file was not found.")


In [None]:
spliter.runSplit(datafile)

In [None]:
spliter.dataSplited.keys()

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()

DB_URI = os.getenv("MONGODB_URI_CONNECTION")

In [None]:
spliter.UploadData(DB_URI=DB_URI, DB_NAME="hydrolab-database-v0")