In [1]:
import mlflow
import warnings
# Install a blanket "ignore" filter so Python warnings are not shown in cell output.
warnings.simplefilter("ignore")

# Runs started after this call are recorded under this MLflow experiment.
EXPERIMENT_NAME = "Clustering_Models_IPM_monitor"
mlflow.set_experiment(EXPERIMENT_NAME)

2025/05/28 06:30:46 INFO mlflow.tracking.fluent: Experiment with name 'Clustering_Models_IPM_monitor' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///d:/6.Teknologi%20Web%20Service/IAH-VISION/notebooks/mlruns/450841624946508368', creation_time=1748388646836, experiment_id='450841624946508368', last_update_time=1748388646836, lifecycle_stage='active', name='Clustering_Models_IPM_monitor', tags={}>

In [1]:
import os

from sqlalchemy import create_engine
import pandas as pd

# The connection URL is read from the IAH_VISION_DB_URL environment variable so
# that real credentials do not have to be committed with the notebook. The
# original local-development URL is kept as the fallback, so an environment
# without the variable behaves exactly as before.
DEFAULT_DB_URL = "postgresql+psycopg2://postgres:postgres@localhost:5432/iahVision"
engine = create_engine(os.environ.get("IAH_VISION_DB_URL", DEFAULT_DB_URL))

In [2]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

def get_scaled_ipm_data_per_year(engine, start_year=2010, end_year=2024):
    """Load the four indicator columns per year and robust-scale them.

    For each year in ``start_year..end_year`` (inclusive) a query joins the
    ``provinsi`` table with ``ahs``, ``ahh``, ``rls`` and ``ppk`` on
    ``id_provinsi`` and selects that year's column from each. The row that
    sorts last by ``id_provinsi`` is excluded (``rn < total_rows``) --
    presumably a non-province aggregate row; confirm against the table.
    The four indicators are then scaled with a ``RobustScaler`` fitted on
    that year's data only.

    Args:
        engine: Connection object handed to ``pandas.read_sql``.
        start_year: First year to load (inclusive).
        end_year: Last year to load (inclusive).

    Returns:
        dict mapping each year to a DataFrame with the columns
        ``id_provinsi``, ``provinsi``, ``ahs``, ``ahh``, ``rls``, ``ppk``,
        where the last four hold the scaled values.
    """
    id_columns = ['id_provinsi', 'provinsi']
    indicator_columns = ['ahs', 'ahh', 'rls', 'ppk']
    scaled_by_year = {}

    for year in range(start_year, end_year + 1):
        # The year only selects which per-year column is read from each table.
        query = f"""
        WITH ranked_data AS (
        SELECT
            p.id_provinsi,
            p.provinsi,
            ahs.ahs_{year} AS ahs,
            ahh.ahh_{year} AS ahh,
            rls.rls_{year} AS rls,
            ppk.ppk_{year} AS ppk,
            ROW_NUMBER() OVER (ORDER BY p.id_provinsi) AS rn,
            COUNT(*) OVER () AS total_rows
        FROM
            provinsi p
        JOIN ahs ON p.id_provinsi = ahs.id_provinsi
        JOIN ahh ON p.id_provinsi = ahh.id_provinsi
        JOIN rls ON p.id_provinsi = rls.id_provinsi
        JOIN ppk ON p.id_provinsi = ppk.id_provinsi
        )
        SELECT
            id_provinsi, provinsi, ahs, ahh, rls, ppk
        FROM
            ranked_data
        WHERE
            rn < total_rows;
                """
        yearly = pd.read_sql(query, engine)

        # A fresh scaler per year: each year is scaled independently.
        indicators = yearly[indicator_columns]
        scaled_values = RobustScaler().fit_transform(indicators)
        scaled_frame = pd.DataFrame(scaled_values, columns=indicators.columns)

        # Put the identifier columns back in front of the scaled indicators.
        yearly_scaled = pd.concat([yearly[id_columns], scaled_frame], axis=1)
        scaled_by_year[year] = yearly_scaled

        print(f"Scaled data tahun {year} loaded. Shape: {yearly_scaled.shape}")

    return scaled_by_year


# Load and scale every year once; the span is spelled out (it matches the
# function defaults) so the range used by the later cells is visible here.
df_scaled_dict = get_scaled_ipm_data_per_year(
    engine,
    start_year=2010,
    end_year=2024,
)

Scaled data tahun 2010 loaded. Shape: (33, 6)
Scaled data tahun 2011 loaded. Shape: (33, 6)
Scaled data tahun 2012 loaded. Shape: (33, 6)
Scaled data tahun 2013 loaded. Shape: (33, 6)
Scaled data tahun 2014 loaded. Shape: (33, 6)
Scaled data tahun 2015 loaded. Shape: (33, 6)
Scaled data tahun 2016 loaded. Shape: (33, 6)
Scaled data tahun 2017 loaded. Shape: (33, 6)
Scaled data tahun 2018 loaded. Shape: (33, 6)
Scaled data tahun 2019 loaded. Shape: (33, 6)
Scaled data tahun 2020 loaded. Shape: (33, 6)
Scaled data tahun 2021 loaded. Shape: (33, 6)
Scaled data tahun 2022 loaded. Shape: (33, 6)
Scaled data tahun 2023 loaded. Shape: (33, 6)
Scaled data tahun 2024 loaded. Shape: (33, 6)


In [3]:
# Display the scaled frame stored for 2010 as this cell's output.
df_scaled_dict[2010]

Unnamed: 0,id_provinsi,provinsi,ahs,ahh,rls,ppk
0,1,ACEH,2.108108,0.053846,0.507143,-0.535581
1,2,SUMATERA UTARA,0.648649,-0.444615,0.671429,0.139647
2,3,SUMATERA BARAT,1.189189,-0.404615,0.4,0.216158
3,4,RIAU,0.567568,0.383077,0.485714,0.493312
4,5,JAMBI,0.0,0.301538,-0.164286,-0.244516
5,6,SUMATERA SELATAN,-0.418919,-0.172308,-0.164286,-0.213483
6,7,BENGKULU,0.337838,-0.335385,0.2,-0.254682
7,8,LAMPUNG,-0.621622,0.0,-0.221429,-0.519529
8,9,KEP. BANGKA BELITUNG,-1.162162,0.073846,-0.357143,0.948101
9,10,KEP. RIAU,0.22973,-0.150769,1.292857,1.782772


In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.model_selection import ParameterGrid
import mlflow

# K-Means hyper-parameter search space; kmeans_mlflow expands it with
# ParameterGrid and tries every combination.
param_grid = dict(
    n_clusters=[2, 3, 4, 5],
    n_init=[5, 10, 15],
    max_iter=[200, 300],
    init=['k-means++', 'random'],
)

def kmeans_mlflow(df_scaled, year):
    """Grid-search K-Means for one year's data and log the best result to MLflow.

    Every combination in the module-level ``param_grid`` is fitted with
    ``random_state=42`` and scored with the silhouette coefficient. The
    highest-scoring combination wins (the first one encountered on ties).
    Its labelling is then evaluated with the Davies-Bouldin and
    Calinski-Harabasz scores, and the parameters plus all three metrics are
    logged in an MLflow run named ``KMeans_<year>``.

    Args:
        df_scaled: DataFrame whose columns at positions 0 and 1 are skipped;
            every column from position 2 onward is used as a feature.
        year: Value used in the MLflow run name and the ``year`` tag.

    Returns:
        The cluster labels produced by the best-scoring configuration.

    Raises:
        ValueError: If no configuration was selected (empty grid, or no
            silhouette score above -1).
    """
    df_kmeans = df_scaled.iloc[:, 2:]

    best_score = -1
    best_params = None
    best_labels = None

    for params in ParameterGrid(param_grid):
        model = KMeans(**params, random_state=42)
        labels = model.fit_predict(df_kmeans)
        score = silhouette_score(df_kmeans, labels)

        if score > best_score:
            best_score = score
            best_params = params
            best_labels = labels

    if best_params is None:
        # Previously this surfaced as an opaque TypeError from KMeans(**None).
        raise ValueError(f"No K-Means configuration could be selected for year {year}")

    # The search is seeded (random_state=42), so refitting the winning
    # configuration would only reproduce these labels. Reusing them avoids the
    # extra fit and guarantees all three metrics describe the same labelling.
    final_labels = best_labels

    dbi = davies_bouldin_score(df_kmeans, final_labels)
    ch = calinski_harabasz_score(df_kmeans, final_labels)

    with mlflow.start_run(run_name=f"KMeans_{year}"):
        mlflow.set_tag("year", year)
        mlflow.set_tag("model_type", "KMeans")

        mlflow.log_params(best_params)
        mlflow.log_metric("silhouette_score", best_score)
        mlflow.log_metric("davies_bouldin_index", dbi)
        mlflow.log_metric("calinski_harabasz_score", ch)

    # Print the evaluation summary
    print("\nBest Params:", best_params)
    print("Best Silhouette Score:", best_score)
    print("Davies-Bouldin Index:", dbi)
    print("Calinski-Harabasz Score:", ch)

    return final_labels

In [5]:
# Run the K-Means search for every year. Labels are stored per year so that
# results from earlier iterations are not lost when `labels` is reassigned;
# `year`, `df_scaled` and `labels` are still set exactly as before.
kmeans_labels_by_year = {}
for year in range(2010, 2025):
    print(f"\nRunning clustering K-means tahun {year}")
    df_scaled = df_scaled_dict[year]
    labels = kmeans_mlflow(df_scaled, year)
    kmeans_labels_by_year[year] = labels


Running clustering K-means tahun 2010


[WinError 2] The system cannot find the file specified
  File "c:\Users\Asus\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\Asus\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Asus\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Asus\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



Best Params: {'init': 'k-means++', 'max_iter': 200, 'n_clusters': 3, 'n_init': 5}
Best Silhouette Score: 0.3068701099442235
Davies-Bouldin Index: 1.0908480279133708
Calinski-Harabasz Score: 13.98252910957412

Running clustering K-means tahun 2011

Best Params: {'init': 'k-means++', 'max_iter': 200, 'n_clusters': 2, 'n_init': 5}
Best Silhouette Score: 0.40351669855050887
Davies-Bouldin Index: 1.0551789596310308
Calinski-Harabasz Score: 13.886929013843545

Running clustering K-means tahun 2012

Best Params: {'init': 'k-means++', 'max_iter': 200, 'n_clusters': 2, 'n_init': 5}
Best Silhouette Score: 0.42222885191669146
Davies-Bouldin Index: 0.9993652455280774
Calinski-Harabasz Score: 15.607354892371571

Running clustering K-means tahun 2013

Best Params: {'init': 'k-means++', 'max_iter': 200, 'n_clusters': 2, 'n_init': 5}
Best Silhouette Score: 0.40499740249349386
Davies-Bouldin Index: 1.04010983053748
Calinski-Harabasz Score: 14.037532393620408

Running clustering K-means tahun 2014

Bes

In [6]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.model_selection import ParameterGrid
import mlflow

# Agglomerative clustering search space; agglomerative_mlflow expands it with
# ParameterGrid and tries every combination.
param_grid_agglomerative = dict(
    n_clusters=[2, 3, 4, 5],
    linkage=['ward', 'average', 'complete'],
)

def agglomerative_mlflow(df_scaled, year):
    """Grid-search agglomerative clustering for one year and log the best result.

    Every combination in the module-level ``param_grid_agglomerative`` is
    fitted and, when it yields more than one cluster, scored with the
    silhouette coefficient. A combination that raises is reported and
    skipped. The highest-scoring combination wins (the first one encountered
    on ties). Its labelling is evaluated with the Davies-Bouldin and
    Calinski-Harabasz scores, and the parameters plus all three metrics are
    logged in an MLflow run named ``Agglomerative_<year>``.

    Args:
        df_scaled: DataFrame whose columns at positions 0 and 1 are skipped;
            every column from position 2 onward is used as a feature.
        year: Value used in the MLflow run name and the ``year`` tag.

    Returns:
        The cluster labels produced by the best-scoring configuration.

    Raises:
        ValueError: If no configuration produced a usable clustering (for
            example because every combination raised during the search).
    """
    df_hclust = df_scaled.iloc[:, 2:]

    best_score = -1
    best_params = None
    best_labels = None

    for params in ParameterGrid(param_grid_agglomerative):
        try:
            model = AgglomerativeClustering(**params)
            labels = model.fit_predict(df_hclust)

            # The silhouette score is undefined for a single cluster.
            if len(set(labels)) > 1:
                sil_score = silhouette_score(df_hclust, labels)
                print(f"Params: {params}, Silhouette Score: {sil_score:.4f}")

                if sil_score > best_score:
                    best_score = sil_score
                    best_params = params
                    best_labels = labels
        except Exception as e:
            # Best-effort search: report the failing combination and move on.
            print(f"Error for params {params}: {e}")

    if best_params is None:
        # Previously this surfaced as an opaque TypeError on best_params['linkage'].
        raise ValueError(
            f"No agglomerative configuration produced a valid clustering for year {year}"
        )

    # Agglomerative clustering has no random component, so refitting the
    # winning configuration would only reproduce these labels. The former
    # explicit metric='euclidean' for ward merely restated the default used
    # during the search, and fails on scikit-learn versions without `metric`.
    agg_labels = best_labels

    # Evaluation
    dbi_score = davies_bouldin_score(df_hclust, agg_labels)
    ch_score = calinski_harabasz_score(df_hclust, agg_labels)

    # Log to MLflow
    with mlflow.start_run(run_name=f"Agglomerative_{year}"):
        mlflow.set_tag("year", year)
        mlflow.set_tag("model_type", "Agglomerative")
        mlflow.log_params(best_params)
        mlflow.log_metric("silhouette_score", best_score)
        mlflow.log_metric("davies_bouldin_index", dbi_score)
        mlflow.log_metric("calinski_harabasz_score", ch_score)

    # Print the results
    print("\nBest Params:", best_params)
    print("Best Silhouette Score:", best_score)
    print("Davies-Bouldin Index:", dbi_score)
    print("Calinski-Harabasz Score:", ch_score)

    return agg_labels

In [7]:
# Run the agglomerative search for every year. Labels are stored per year so
# that results from earlier iterations are not lost when `labels` is
# reassigned; `year`, `df_scaled` and `labels` are still set exactly as before.
agglomerative_labels_by_year = {}
for year in range(2010, 2025):
    print(f"\nRunning clustering Agglomerative tahun {year}")
    df_scaled = df_scaled_dict[year]
    labels = agglomerative_mlflow(df_scaled, year)
    agglomerative_labels_by_year[year] = labels


Running clustering Agglomerative tahun 2010
Params: {'linkage': 'ward', 'n_clusters': 2}, Silhouette Score: 0.3915
Params: {'linkage': 'ward', 'n_clusters': 3}, Silhouette Score: 0.3807
Params: {'linkage': 'ward', 'n_clusters': 4}, Silhouette Score: 0.2093
Params: {'linkage': 'ward', 'n_clusters': 5}, Silhouette Score: 0.2366
Params: {'linkage': 'average', 'n_clusters': 2}, Silhouette Score: 0.5117
Params: {'linkage': 'average', 'n_clusters': 3}, Silhouette Score: 0.4359
Params: {'linkage': 'average', 'n_clusters': 4}, Silhouette Score: 0.3360
Params: {'linkage': 'average', 'n_clusters': 5}, Silhouette Score: 0.2806
Params: {'linkage': 'complete', 'n_clusters': 2}, Silhouette Score: 0.3915
Params: {'linkage': 'complete', 'n_clusters': 3}, Silhouette Score: 0.3807
Params: {'linkage': 'complete', 'n_clusters': 4}, Silhouette Score: 0.3360
Params: {'linkage': 'complete', 'n_clusters': 5}, Silhouette Score: 0.1864

Best Params: {'linkage': 'average', 'n_clusters': 2}
Best Silhouette Score