In [14]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from sklearn.datasets import make_classification

# Build a small synthetic dataset for exercising the VarClus implementation
# (modelled on SAS PROC VARCLUS): 4 features in total, of which 2 are informative
# and 1 is redundant (generated from the informative ones), leaving 1 feature
# that carries no signal. The clustering is expected to group the features
# into three clusters.
N_SAMPLES = 1000
N_FEATURES = 4
SEED = 42  # fixed so the generated data is identical on every run

# Keep the raw ndarray under its own name so X is only ever a DataFrame.
features_raw, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=2,
    n_redundant=1,
    n_clusters_per_class=1,
    random_state=SEED,
)

# Wrap the feature matrix in a DataFrame with columns feature_0 .. feature_{n-1}.
X = pd.DataFrame(
    features_raw,
    columns=[f"feature_{i}" for i in range(features_raw.shape[1])],
)

# Sanity check: the frame must match the requested dimensions.
assert X.shape == (N_SAMPLES, N_FEATURES)
X.shape, y.shape

((1000, 4), (1000,))

In [15]:
# Preview the first rows of the feature frame (head() shows 5 rows by default).
X.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3
0,-0.102497,1.191914,-0.649373,-1.408574
1,0.471671,1.135029,1.236131,-1.340508
2,0.176208,-0.771196,-0.879814,0.911542
3,-2.201781,-2.514889,-0.960046,2.968465
4,2.076836,0.224677,0.294224,-0.262425


In [16]:
# 3-D scatter of feature_0 / feature_1 / feature_2, coloured by target class.
# feature_3 is not shown in this figure.
fig = go.Figure()

# One trace per target class; the (label, colour) pairs are the only things that
# differ between traces, so they are looped over instead of duplicating the call.
for class_label, marker_color in ((0, "blue"), (1, "red")):
    class_mask = y == class_label  # boolean row selector for this class
    fig.add_trace(
        go.Scatter3d(
            x=X.loc[class_mask, "feature_0"],
            y=X.loc[class_mask, "feature_1"],
            z=X.loc[class_mask, "feature_2"],
            mode="markers",
            marker=dict(size=3, color=marker_color, opacity=0.5),
            name=f"Class {class_label}",
            legendgroup="Target",
        )
    )

fig.update_layout(
    # A title lets the exported figure stand alone when opened outside the notebook.
    title="Synthetic dataset: feature_0 / feature_1 / feature_2 by class",
    scene=dict(
        xaxis_title="Feature 0", yaxis_title="Feature 1", zaxis_title="Feature 2"
    ),
)

# The figure is exported to a standalone HTML file rather than rendered inline.
fig.write_html("data.html")

In [17]:
from varclus import VarClus

In [18]:
# Create the VarClus object from the feature frame; every later cell works through `vc`.
vc = VarClus(X)

In [19]:
# Inspect the correlation matrix computed by VarClus's matrix calculator.
# The 4x4 result presumably has one row/column per feature of X — TODO confirm.
vc.matrix_calculator.compute_corr_matrix()

array([[ 1.        , -0.02019676,  0.03311569,  0.02148623],
       [-0.02019676,  1.        , -0.00426183, -0.99999917],
       [ 0.03311569, -0.00426183,  1.        ,  0.00430444],
       [ 0.02148623, -0.99999917,  0.00430444,  1.        ]])

In [20]:
# Derive the distance matrix from the correlation matrix. In the output shown,
# each entry equals 1 - |correlation|, so strongly correlated features are close.
calculator = vc.matrix_calculator
corr_matrix = calculator.compute_corr_matrix()
calculator.compute_dist_matrix(corr_matrix)

array([[0.00000000e+00, 9.79803236e-01, 9.66884305e-01, 9.78513767e-01],
       [9.79803236e-01, 0.00000000e+00, 9.95738167e-01, 8.32000000e-07],
       [9.66884305e-01, 9.95738167e-01, 0.00000000e+00, 9.95695562e-01],
       [9.78513767e-01, 8.32000000e-07, 9.95695562e-01, 0.00000000e+00]])

In [21]:
# Compute the initial cluster assignment. The result is an integer label array;
# presumably one label per feature, in column order — TODO confirm.
vc.initialize_clusters()

array([2, 1, 3, 1], dtype=int32)

In [22]:
# Run the clustering; the displayed result is a table mapping each Variable to its Cluster.
vc.run()

Unnamed: 0,Variable,Cluster
0,feature_0,2
1,feature_1,1
2,feature_2,3
3,feature_3,1


In [26]:
# Scatter of feature_2 against feature_3. In the run() result these two features
# are assigned to different clusters (3 and 1), and no cluster is numbered 0,
# so the title describes the plotted features rather than naming a cluster.
# NOTE(review): vc.data is presumably the input feature frame — confirm.
px.scatter(
    vc.data,
    x="feature_2",
    y="feature_3",
    title="feature_2 vs feature_3 (assigned to different clusters)",
).show()