In [4]:
import pandas as pd
from skquery.pairwise import RandomMLCL
from skquery.oracle import MLCLOracle
from active_semi_clustering.semi_supervised.pairwise_constraints import COPKMeans
from sklearn.datasets import *
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
import plotly.express as px
import plotly.graph_objects as go

In [5]:
#Plots data partition, optionally with ML/CL constraints
def plot(dataset, partition, constraints=None, filename=None):
    viz_dataset = pd.DataFrame(PCA(n_components=2).fit_transform(dataset)) if dataset.shape[1] > 3 else pd.DataFrame(dataset)
    fig = None
    match viz_dataset.shape[1]:
        case 2:
            fig = px.scatter(viz_dataset, x=0, y=1, template="simple_white",
                             color=partition, symbol=partition,
                             hover_data={'index': viz_dataset.index.astype(str)})
        case 3:
            fig = px.scatter_3d(viz_dataset, x=0, y=1, z=2, template="simple_white",
                                color=partition, symbol=partition,
                                hover_data={'index': viz_dataset.index.astype(str)})

    if constraints:
        for key in constraints:
            for cst in constraints[key]:
                points = viz_dataset.iloc[list(cst)]
                match viz_dataset.shape[1]:
                    case 2:
                        fig.add_trace(go.Scatter(name=str(cst), x=[points.iloc[0, 0], points.iloc[1, 0]],
                                                 mode="lines", y=[points.iloc[0, 1], points.iloc[1, 1]]))
                    case 3:
                        fig.add_trace(go.Scatter3d(name=str(cst), x=[points.iloc[0, 0], points.iloc[1, 0]],
                                                   mode="lines", y=[points.iloc[0, 1], points.iloc[1, 1]],
                                                   z=[points.iloc[0, 2], points.iloc[1, 2]]))
                if key == "ml":
                    fig['data'][-1]['line']['color'] = "#ff0000"
                else:
                    fig['data'][-1]['line']['color'] = "#0000ff"
                    fig['data'][-1]['line']['dash'] = "dash"

    fig.update_layout(showlegend=False)
    fig.update(layout_coloraxis_showscale=False)
    if not filename:
        fig.show()
    else:
        fig.write_html(filename)

In [6]:
# Load the Iris data: X is the feature matrix that gets clustered, y holds the
# ground-truth labels (used below as the oracle's truth and for ARI scoring).
X, y = load_iris(return_X_y=True)

In [11]:
# Baseline: run COP-KMeans without any constraints, using as many clusters as
# there are distinct ground-truth labels.
cc_alg = COPKMeans(n_clusters=len(set(y)))
cc_alg.fit(X)
# Keep the unconstrained labelling; it is handed to the query strategy later.
init_partition = cc_alg.labels_
plot(X, init_partition)
# Agreement between the baseline partition and the ground truth.
print(f"Initial ARI with GT : {adjusted_rand_score(y, init_partition)}")

Initial ARI with GT : 0.7163421126838476


In [8]:
# Test run: ask the oracle for pairwise constraints chosen by the RandomMLCL
# query strategy, starting from the baseline partition.
qs = RandomMLCL()
# The oracle answers from the ground-truth labels.
# NOTE(review): budget=10 presumably caps the number of queries - confirm in skquery docs.
oracle = MLCLOracle(truth=y, budget=10)
constraints = qs.fit(X, oracle, partition=init_partition)

# Overlay the returned constraints on the baseline partition, then list them by
# kind ("ml" and "cl" are the keys read from the returned mapping).
plot(X, init_partition, constraints)
print("ML : ", constraints["ml"])
print("CL : ", constraints["cl"])
# Disabled: presumably persists the constraints to a file - verify before enabling.
#qs.csts_to_file(constraints)

ML :  [(123, 116), (69, 57)]
CL :  [(103, 29), (114, 97), (53, 17), (15, 59), (18, 78), (14, 140), (148, 9), (101, 62)]


In [9]:
# Re-fit the same COPKMeans estimator, this time passing the queried ML and CL
# constraints; cc_alg.labels_ is read again afterwards for the new partition.
cc_alg.fit(X, ml=constraints["ml"], cl=constraints["cl"])
plot(X, cc_alg.labels_)
# Agreement between the constrained partition and the ground truth.
print(f"Final ARI with GT : {adjusted_rand_score(y, cc_alg.labels_)}")

Final ARI with GT : 0.7429751965483791
