In [26]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN


# Build the toy dataset: a two-centre blob set plus a third, single-centre blob
# that is passed through a linear transformation and labelled 2.
n_samples = 1500
dataset = datasets.make_blobs(
    n_samples=n_samples,
    centers=2,
    center_box=(-7.0, 7.5),
    cluster_std=[1.4, 1.7],
    random_state=42,
)
X_2, _ = datasets.make_blobs(
    n_samples=n_samples,
    centers=[[-4, -3]],
    cluster_std=[1.9],
    random_state=170,
)

# Apply the 2x2 linear map to the third blob only.
transformation = [[1.2, -0.8], [-0.4, 1.7]]
X_2 = X_2.dot(transformation)

# Stack features and labels; every transformed point gets label 2.
X = np.concatenate([dataset[0], X_2])
y = np.concatenate([dataset[1], np.full(len(X_2), 2)])


# Standardize the features. NOTE: X itself is rebound to the scaled array,
# so every later cell that reads X sees standardized data.
X = StandardScaler().fit_transform(X)

# First obtain the predicted clusters with a clustering method.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)
kmeans_pred = kmeans.labels_

# Now compute the homogeneity (left as the trailing expression so the cell displays it).
homogeneity_score(y, kmeans_pred)

0.8025180661661425

In [4]:
# Compare the homogeneity of four clustering algorithms on X.
# Estimators are created first, then fitted, then scored in the same order.
k_means = KMeans(n_clusters=3, random_state=42)
gm = GaussianMixture(n_components=3, random_state=42)
agg = AgglomerativeClustering(n_clusters=3)
dbs = DBSCAN(eps=0.9, min_samples=35)

k_means.fit(X)
gm_labels = gm.fit_predict(X)
agg_labels = agg.fit_predict(X)
dbs_labels = dbs.fit_predict(X)

# One printed score per algorithm: KMeans, GaussianMixture, Agglomerative, DBSCAN.
for predicted in (k_means.labels_, gm_labels, agg_labels, dbs_labels):
    print(homogeneity_score(labels_true=y, labels_pred=predicted))


0.8025180661661425
0.9339791347507893
0.9099781767383747
0.00044465831964150197


In [6]:
from sklearn.metrics.cluster import completeness_score
from sklearn.preprocessing import StandardScaler

# Standardize X into a separate array used by the following cells.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Completeness of a 3-component Gaussian mixture on the scaled data.
gm = GaussianMixture(n_components=3, random_state=42)
gm_labels = gm.fit_predict(X_scaled)
gm_completeness = completeness_score(y, gm_labels)
print(gm_completeness)


0.9325740421656737


In [7]:
# Completeness of the same four algorithms, this time on X_scaled.
k_means = KMeans(n_clusters=3, random_state=42)
gm = GaussianMixture(n_components=3, random_state=42)
agg = AgglomerativeClustering(n_clusters=3)
dbs = DBSCAN(eps=0.9, min_samples=35)

kmeans_labels = k_means.fit(X_scaled).labels_
gm_labels = gm.fit_predict(X_scaled)
agg_labels = agg.fit_predict(X_scaled)
dbs_labels = dbs.fit_predict(X_scaled)

# Printed in the order KMeans, GaussianMixture, Agglomerative, DBSCAN.
for predicted in (kmeans_labels, gm_labels, agg_labels, dbs_labels):
    print(completeness_score(labels_true=y, labels_pred=predicted))


0.7808712092278982
0.9325740421656737
0.9058386997451113
0.08342237034907717


In [8]:
from sklearn.metrics.cluster import v_measure_score

# V-measure of the four algorithms on X_scaled.
k_means = KMeans(n_clusters=3, random_state=42)
k_means.fit(X_scaled)

gm = GaussianMixture(n_components=3, random_state=42)
gm_labels = gm.fit_predict(X_scaled)

agg = AgglomerativeClustering(n_clusters=3)
agg_labels = agg.fit_predict(X_scaled)

dbs = DBSCAN(eps=0.9, min_samples=35)
dbs_labels = dbs.fit_predict(X_scaled)

# Collect the scores first, then print one per line in algorithm order.
v_scores = [
    v_measure_score(y, predicted)
    for predicted in (k_means.labels_, gm_labels, agg_labels, dbs_labels)
]
for score in v_scores:
    print(score)

0.791546668267586
0.9332760595996924
0.9079037199053294
0.000884601531943088


In [9]:
# Compare KMeans initialisation strategies with a single initialisation run each.
# After the loop, k_means refers to the model fitted with init='random'.
for init_method in ("k-means++", "random"):
    k_means = KMeans(n_clusters=3, init=init_method, n_init=1, random_state=42)
    k_means.fit(X_scaled)
    print(v_measure_score(labels_true=y, labels_pred=k_means.labels_))


0.7892280957870129
0.7903450330611903


In [10]:
from sklearn.cluster import MiniBatchKMeans

# Fit MiniBatchKMeans and plain KMeans with identical settings on X_scaled
# and print their V-measure scores (mini-batch first).
kmeans_mini_batch = MiniBatchKMeans(n_clusters=3, n_init=1, random_state=42).fit(X_scaled)
k_means = KMeans(n_clusters=3, n_init=1, random_state=42).fit(X_scaled)

for fitted in (kmeans_mini_batch, k_means):
    print(v_measure_score(y, fitted.labels_))


0.7497460092948622
0.7892280957870129


In [12]:
# V-measure of agglomerative clustering for each linkage criterion on X_scaled.
# After the loop, agg / agg_labels hold the "single"-linkage model and its labels.
for linkage_name in ("ward", "complete", "average", "single"):
    agg = AgglomerativeClustering(n_clusters=3, linkage=linkage_name)
    agg_labels = agg.fit_predict(X_scaled)
    print(v_measure_score(labels_true=y, labels_pred=agg_labels))


0.9079037199053294
0.5805530251504777
0.678656551579543
0.0008842106330108959


In [13]:
from sklearn.neighbors import kneighbors_graph

# 6-nearest-neighbour graph over X (each point excluded from its own neighbours),
# averaged with its transpose so that the matrix is symmetric.
knn_graph = kneighbors_graph(X, n_neighbors=6, include_self=False)
connectivity = 0.5 * (knn_graph + knn_graph.T)

# Agglomerative clustering without (None) and then with the connectivity constraint.
for constraint in (None, connectivity):
    agg = AgglomerativeClustering(n_clusters=3, connectivity=constraint)
    agg_labels = agg.fit_predict(X)
    print(v_measure_score(labels_true=y, labels_pred=agg_labels))

0.9079037199053294
0.883942992495597


In [27]:
# Completeness of DBSCAN on X for two neighbourhood radii (min_samples fixed at 35).
# After the loop, dbs / dbs_labels belong to the eps=0.6 run.
for radius in (0.9, 0.6):
    dbs = DBSCAN(eps=radius, min_samples=35)
    dbs_labels = dbs.fit_predict(X)
    print(completeness_score(labels_true=y, labels_pred=dbs_labels))


0.08342237034907717
0.08772294194361933


In [38]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Regenerate the dataset from scratch; X is NOT standardized in this cell.
n_samples = 1500

# Two-centre blob set (features and labels kept together in `dataset`).
dataset = datasets.make_blobs(n_samples=n_samples, centers=2, center_box=(-7.0, 7.5),
                              cluster_std=[1.4, 1.7], random_state=42)
blob_features, blob_labels = dataset

# Third blob around (-4, -3), then pushed through a 2x2 linear map.
X_2, _ = datasets.make_blobs(n_samples=n_samples, centers=[[-4, -3]],
                             cluster_std=[1.9], random_state=170)
transformation = [[1.2, -0.8], [-0.4, 1.7]]
X_2 = np.dot(X_2, transformation)

# Concatenate both parts; the transformed blob is labelled 2.
third_labels = np.array([2] * len(X_2))
X = np.concatenate((blob_features, X_2))
y = np.concatenate((blob_labels, third_labels))

# Two DBSCAN models differing only in eps; both are fitted on the current X
# and their V-measure is printed rounded to two decimals.
dbscan_eps_09, dbscan_eps_08 = (
    DBSCAN(eps=radius, min_samples=35) for radius in (0.9, 0.8)
)

for algo in (dbscan_eps_09, dbscan_eps_08):
    labels = algo.fit(X).labels_
    v_measure = v_measure_score(labels_true=y, labels_pred=labels)
    print(np.round(v_measure, 2))

0.77
0.71


In [39]:
# Refit the eps=0.9 DBSCAN model and keep its labels for the next cells.
y_predict = dbscan_eps_09.fit(X).labels_

# Distinct predicted labels with their sizes (trailing expression -> cell output).
np.unique(y_predict, return_counts=True)

(array([-1,  0,  1,  2]), array([ 368,  656,  721, 1255]))

In [41]:
# Indices of the points whose predicted label is not -1 (treated here as noise).
not_noise_ind = np.nonzero(y_predict != -1)

# Score only the retained points.
y_true_kept = y[not_noise_ind]
y_pred_kept = y_predict[not_noise_ind]
v_measure = v_measure_score(y_true_kept, y_pred_kept)
print(v_measure)

0.9667281300681958


In [42]:
# Agglomerative clustering (3 clusters, default settings) on the current X.
agg = AgglomerativeClustering(n_clusters=3)
agg_labels = agg.fit(X).labels_
print(v_measure_score(y, agg_labels))


0.703747024360433


In [43]:
# Agglomerative clustering on standardized features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Build the estimator here instead of reusing whatever `agg` was left in the
# kernel by an earlier cell (other cells rebind `agg` with different linkage /
# connectivity settings), so the result does not depend on execution order.
agg = AgglomerativeClustering(n_clusters=3)
agg_labels = agg.fit_predict(X_scaled)
print(v_measure_score(labels_true=y, labels_pred=agg_labels))


0.9079037199053294


In [44]:
from sklearn.preprocessing import MinMaxScaler
# Agglomerative clustering on min-max-scaled features.
min_max_scaler = MinMaxScaler()
# NOTE: this rebinds X_scaled (previously the StandardScaler output) to the
# min-max-scaled array; cells run afterwards that read X_scaled see this version.
X_scaled = min_max_scaler.fit_transform(X)

# Build the estimator here instead of reusing whatever `agg` was left in the
# kernel by an earlier cell, so the result does not depend on execution order.
agg = AgglomerativeClustering(n_clusters=3)
agg_labels = agg.fit_predict(X_scaled)
print(v_measure_score(labels_true=y, labels_pred=agg_labels))


0.8929241488344335
