### DATA PROCESSING

In [None]:
import math
import json
from collections import defaultdict
import math
import pandas as pd

class DataSplitter:

    def __init__(self, data_json=None, data_frame=None):

        if data_json is not None:
            self.data = data_json
            self.data_type = "json"

        elif data_frame is not None:
            self.data = data_frame.to_dict("records")
            self.data_type = "data_frame"

        else:
            raise ValueError("Data tidak boleh kosong")

    def calculate_haversine(self, coord1, coord2):

        R = 6371  # km

        lat1, lon1 = coord1
        lat2, lon2 = coord2

        lat1, lon1 = map(math.radians, (lat1, lon1))
        lat2, lon2 = map(math.radians, (lat2, lon2))

        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = (
            math.sin(dlat / 2) ** 2
            + math.cos(lat1)
            * math.cos(lat2)
            * math.sin(dlon / 2) ** 2
        )

        c = 2 * math.asin(math.sqrt(a))

        return R * c

    def process_boundaries(self, lat_name="latitude", lon_name="longitude"):

        if not self.data:
            raise ValueError("Data kosong")


        max_lat = max(self.data, key=lambda x: x[lat_name])[lat_name]
        min_lat = min(self.data, key=lambda x: x[lat_name])[lat_name]
        max_lon = max(self.data, key=lambda x: x[lon_name])[lon_name]
        min_lon = min(self.data, key=lambda x: x[lon_name])[lon_name]

        print (
            f"min-latitude: {min_lat}", "\n"
            f"max-latitude: {max_lat}", "\n"
            f"min-longitude: {min_lon}", "\n"
            f"max-longitude: {max_lon}", "\n"
        )

        return {
            "min-latitude": min_lat,
            "max-latitude": max_lat,
            "min-longitude": min_lon,
            "max-longitude": max_lon,
        }

    def create_bins_from_boundaries(
        self,
        lat_range,   # [start, end]
        lon_range,   # [start, end]
        range_lat=20,    # km
        range_lon=50     # km
    ):

        start_lat, end_lat = lat_range
        start_lon, end_lon = lon_range

        # latitude
        KM_PER_DEG_LAT = 111.32

        lat_step = range_lat / KM_PER_DEG_LAT

        lat_bins = self._generate_bins(
            start_lat, end_lat, lat_step
        )

        # longitude
        mid_lat = (start_lat + end_lat) / 2

        KM_PER_DEG_LON = 111.32 * math.cos(math.radians(mid_lat))

        if KM_PER_DEG_LON < 1e-6:
            raise ValueError("Terlalu dekat kutub")

        lon_step = range_lon / KM_PER_DEG_LON

        lon_bins = self._generate_bins(
            start_lon, end_lon, lon_step
        )

        print(
            f"There are {len(lat_bins)} latitude bins", "\n",
            f"There are {len(lon_bins)} longitude bins", "\n",
        )

        return {
            "lat_bins": lat_bins,
            "lon_bins": lon_bins
        }

    def _generate_bins(self, start, end, step):

        bins = []

        val = start

        while val <= end:
            bins.append(round(val, 6))
            val += step

        return bins

    def build_grid_index(self, lat_bins, lon_bins):

        if len(lat_bins) < 2 or len(lon_bins) < 2:
            raise ValueError("Bins tidak valid")

        lat_step = lat_bins[1] - lat_bins[0]
        lon_step = lon_bins[1] - lon_bins[0]

        grid_meta = {
            "lat_start": lat_bins[0],
            "lon_start": lon_bins[0],

            "lat_step": lat_step,
            "lon_step": lon_step,

            "lat_count": len(lat_bins) - 1,
            "lon_count": len(lon_bins) - 1,
        }

        return grid_meta


    def find_grid_fast(self, lat, lon, grid_meta):

        lat_start = grid_meta["lat_start"]
        lon_start = grid_meta["lon_start"]

        lat_step = grid_meta["lat_step"]
        lon_step = grid_meta["lon_step"]

        lat_count = grid_meta["lat_count"]
        lon_count = grid_meta["lon_count"]

        # Hitung index baris & kolom
        row = int((lat - lat_start) / lat_step)
        col = int((lon - lon_start) / lon_step)

        # Cek out of bounds
        if (
            row < 0 or row >= lat_count
            or
            col < 0 or col >= lon_count
        ):
            return None

        # Konversi ke area id
        area_id = row * lon_count + col + 1

        return f"Area_{area_id}"


    def splitdata(self, data_list, grid, lat_name = "latitude", lon_name = "longitude"):
        data_container = defaultdict(list)

        for data in data_list:
            lat = data[lat_name]
            lon = data[lon_name]
            area = self.find_grid_fast(lat, lon, grid)

            if area:
                data_container[area].append(data)
            else:
                data_container["OOB"].append(data)

        print(dict(data_container))
        return dict(data_container)

    def convert2JSON(self, data, name):
        try:
            with open(name, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4)
                print(f"JSON in FILE {name} created!!")
        except Exception as E:
            print(f"Error happened {str(E)}")

### START HERE

In [None]:
data_folder_path = "./data/metadata.json"

In [None]:
datafile = None

try:
    with open(data_folder_path, "r") as file:
        datafile = json.load(file)

except json.JSONDecodeError as e:
    print("Failed to decode JSON:", e)

except FileNotFoundError:
    print("The file was not found.")

In [None]:
dataSplitter = DataSplitter(datafile)

#### BOUNDARIES

In [None]:
boundaries = dataSplitter.process_boundaries()

#### LATITUDE AND LONGITUDE BINS

In [None]:
bins = dataSplitter.create_bins_from_boundaries(
    lat_range = [-89.153062, 89.255691],
    lon_range = [-177.386922, 165.523263],
    range_lat=10,
    range_lon=5,
)

In [26]:
len(bins)

2

In [27]:
dataSplitter.convert2JSON(bins, "./data/info-bins.json")

JSON in FILE ./data/info-bins.json created!!


#### GRID

In [None]:
grid = dataSplitter.build_grid_index(
    bins["lat_bins"],
    bins["lon_bins"]
)

In [None]:
print(grid)

#### DATA SPLITING

In [17]:
result = dataSplitter.splitdata(datafile, grid)

{'Area_8075210': [{'Station_ID': 1, 'Station_Name': 'STASIUN METEOROLOGI CUT BAU MAIMUN SALEH', 'Elevation': 126, 'File_Created': '2/12/2025', 'Data_Points': 99, 'Years_Covered': '1927-2025', 'Records': 99, 'Start_Year': 1927, 'End_Year': 2025, 'Annual_Mean': 145.73, 'Annual_Max': 321.0, 'Missing_Values': 339, 'LAT': 5.876666667, 'LON': 95.33777778, 'latitude': 5.876666667, 'longitude': 95.33777778}, {'Station_ID': 8330, 'Station_Name': 'BPP PAYA SEUNARA', 'Elevation': 9, 'File_Created': '2/12/2025', 'Data_Points': 16, 'Years_Covered': '2010-2025', 'Records': 16, 'Start_Year': 2010, 'End_Year': 2025, 'Annual_Mean': 85.65, 'Annual_Max': 141.5, 'Missing_Values': 40, 'LAT': 5.851666667, 'LON': 95.31805556, 'latitude': 5.851666667, 'longitude': 95.31805556}, {'Station_ID': 8332, 'Station_Name': 'BPP SUKA JAYA', 'Elevation': 2, 'File_Created': '2/12/2025', 'Data_Points': 16, 'Years_Covered': '2010-2025', 'Records': 16, 'Start_Year': 2010, 'End_Year': 2025, 'Annual_Mean': 86.5, 'Annual_Max':

In [18]:
len(result)

5936

In [22]:
dataSplitter.convert2JSON(result, "./data/splited-data.json")

JSON in FILE ./data/splited-data.json created!!


In [23]:
grid

{'lat_start': -89.153062,
 'lon_start': -177.386922,
 'lat_step': 0.08983100000000377,
 'lon_step': 0.04491600000000062,
 'lat_count': 1986,
 'lon_count': 7634}

In [24]:
dataSplitter.convert2JSON(grid, "./data/info-grid.json")

JSON in FILE ./data/info-grid.json created!!


In [None]:
len(datafile)

In [None]:
c = 0
for k, i in result.items():
  c += len(i)

print(c)

In [None]:
import math

def calculate_haversine(coord1, coord2):

    R = 6371  # km

    lat1, lon1 = coord1
    lat2, lon2 = coord2

    lat1, lon1 = map(math.radians, (lat1, lon1))
    lat2, lon2 = map(math.radians, (lat2, lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1)
        * math.cos(lat2)
        * math.sin(dlon / 2) ** 2
    )

    c = 2 * math.asin(math.sqrt(a))

    return R * c

In [None]:
calculate_haversine(
    (5.228333333, 96.9475), 
    (5.21, 96.96)
)

In [None]:
calculate_haversine(
    (5.228333333, 96.9475), 
    (5.21, 96.96)
)

In [32]:
import json
import pymongo
from dotenv import load_dotenv
import os

load_dotenv()

DB_URI = os.getenv("MONGODB_URI_CONNECTION")
DB_NAME = "hydrolab-database-v0"

info_bins_path_json = "./data/info-bins.json"
info_grid_path_json = "./data/info-grid.json"
# info_splited_data_json = "./data/splited-data.json"
info_pairing_data_json = "./data/pairingdata.json"
info_all_data_json = "./data/metadata.json"


def upload_to_collection(db, collection_name, data):
    collection = db[collection_name]
    collection.delete_many({})  # clear old data
    if isinstance(data, list):
        collection.insert_many(data)
    else:
        collection.insert_one(data)
    print(f"Inserted into {collection_name}")


def seed_to_db():
    client = pymongo.MongoClient(DB_URI)
    db = client[DB_NAME]

    with open(info_all_data_json, "r") as f:
        full_data = json.load(f)

    upload_to_collection(db, "full_data", full_data)
        
    with open(info_pairing_data_json, "r") as f:
        pairing_data = json.load(f)

    upload_to_collection(db, "pairing_data", pairing_data)


    with open(info_grid_path_json, "r") as f:
        grid_meta = json.load(f)

    upload_to_collection(db, "grid_meta", grid_meta)


    with open(info_bins_path_json, "r") as f:
        bins_meta = json.load(f)

    upload_to_collection(db, "bins_meta", bins_meta)

    print("DATABASE SYNC COMPLETED SUCCESSFULLY")


In [34]:
# seed_to_db()