# Klasterovanje

In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import silhouette_score, homogeneity_score
from sklearn.neighbors import kneighbors_graph
from sklearn.feature_selection import f_classif, SelectKBest, SequentialFeatureSelector
from sklearn.metrics import confusion_matrix, silhouette_score, homogeneity_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.svm import SVC
from sklearn.decomposition import PCA

Učitavanje podataka

In [None]:
# Location of the input CSV; point this at the folder that holds the data.
DATA_PATH = 'pbmc_za_vezbu/podaci.csv'

# header=None makes pandas treat the first row as data and label the columns
# 0, 1, 2, ... Every later cell depends on `data`, so it must be loaded here.
data = pd.read_csv(DATA_PATH, header=None)

Izdvajanje naziva klasa

In [None]:
# Strip the trailing "_<digits>" suffix (_1, _2, ...) from every value in the
# first column, leaving only the bare class name.
suffix_pattern = re.compile(r'_\d+$')
klase = data.iloc[:, 0].map(lambda naziv: suffix_pattern.sub('', naziv))

Upisivanje izdvojenih klasa u CSV fajl

In [None]:
# Write the extracted class names to klase.csv, omitting the index column.
klase.to_csv('klase.csv', index=False)

Dodela numeričkih oznaka klasama i grupisanje prema tipovima ćelija

In [None]:
# One distinct integer code per class name: "Klasa_1" -> 0 ... "Klasa_15" -> 14.
oznake_klasa = {f"Klasa_{redni_broj}": redni_broj - 1 for redni_broj in range(1, 16)}

# Coarser coding in which several class names share one integer code.
oznake_klasa_grupisane = {
    "Klasa_1": 1,
    "Klasa_2": 0,
    "Klasa_3": 2,
    "Klasa_4": 3,
    "Klasa_5": 3,
    "Klasa_6": 4,
    "Klasa_7": 4,
    "Klasa_8": 5,
    "Klasa_9": 0,
    "Klasa_10": 0,
    "Klasa_11": 0,
    "Klasa_12": 6,
    "Klasa_13": 6,
    "Klasa_14": 7,
    "Klasa_15": 7,
}

# Replace each class name with its integer code from oznake_klasa.
klase = klase.replace(oznake_klasa)

Analiza varijansi atributa

In [None]:
# The feature matrix is every column of `data` except the first one (the
# column the class names are read from). It is built here because this cell
# runs before the later cell that assigns X; on a fresh kernel X would
# otherwise be undefined at this point.
X = data.iloc[:, 1:]

# Per-column variance (pandas default, ddof=1) and its summary statistics.
variances = X.var()
print(variances)
print(f'Mean: {variances.mean()}')
print(f'Median: {variances.median()}')
print(f'Min: {variances.min()}')
print(f'Max: {variances.max()}')

Uklanjanje atributa sa varijansom ispod zadatog praga

In [None]:
# Feature matrix: all columns of `data` except the first one.
X = data.iloc[:, 1:]

In [None]:
# Show the dimensions of X.
X.shape

In [None]:
# Keep only the features whose variance is above the threshold of 1.
vt = VarianceThreshold(threshold=1)
X_vt_1 = vt.fit_transform(X)

In [None]:
# Show the dimensions after variance filtering.
X_vt_1.shape

Upisivanje transformisanih podataka u fajl

In [None]:
# Wrap the filtered matrix in a DataFrame and write it to X_vt_1.csv
# without the index column.
X_vt_1_df = pd.DataFrame(data=X_vt_1)
X_vt_1_df.to_csv('X_vt_1.csv', index=False)

Analiza varijanse po redovima

In [None]:
# Smallest per-row (per-sample) variance. X_vt_1 comes from
# VarianceThreshold.fit_transform; on a NumPy array, `.T.var()` collapses the
# whole matrix into a single number, so the variance is taken explicitly
# along axis=1 (one value per row) before taking the minimum.
# Uses NumPy's default ddof=0 (population variance).
np.asarray(X_vt_1).var(axis=1).min()

Analiza PCA komponenti i objašnjene varijanse

In [None]:
# Fit PCA with default settings on the variance-filtered data and project
# the samples onto the principal components.
pca = PCA()
X_vt_1_pca = pca.fit_transform(X_vt_1)

In [None]:
# Fraction of the total variance explained by the first 50 components.
pca.explained_variance_ratio_[:50].sum()

Grafik dobijen pomoću PCA komponenti (obratiti pažnju na lošu vrednost objašnjene varijanse)

In [None]:
# Samples on the first two PCA components, colored by the values in klase.
plt.scatter(X_vt_1_pca[:, 0], X_vt_1_pca[:, 1], c=klase.values)

Konstrukcija grafa najbližih suseda na osnovu kosinusnog rastojanja

In [None]:
# TODO: try other ways of constructing the graph.
# Both graphs share the same neighbourhood definition (150 nearest
# neighbours under cosine distance); they differ only in what is stored.
knn_params = dict(n_neighbors=150, metric='cosine', n_jobs=-1)

# Entries hold the cosine distance to each neighbour.
X_vt_1_graph = kneighbors_graph(X_vt_1, mode='distance', **knn_params)

# Entries are 1 where a neighbour edge exists.
X_vt_1_graph_map = kneighbors_graph(X_vt_1, mode='connectivity', **knn_params)

In [None]:
# Dense copies of the two sparse k-NN graphs.
X_vt_1_graph_arr = X_vt_1_graph.toarray()
X_vt_1_graph_map_arr = X_vt_1_graph_map.toarray()

# Turn distances into similarities: distance 0 maps to 1 and larger
# distances map to smaller values. Entries without an edge are also 0 in the
# distance array and therefore become 1 here; they are expected to be masked
# out using the connectivity array afterwards.
X_vt_1_graph_full = 1.0 / (1.0 + X_vt_1_graph_arr)

In [None]:
# Zero out every entry that is not an edge in the connectivity graph.
X_vt_1_graph_full[X_vt_1_graph_map_arr == 0] = 0

Pronalaženje klastera spektralnim klasterovanjem

In [None]:
# Spectral clustering into 5 clusters. affinity='precomputed' means the
# matrix passed to fit() is used directly as the affinity matrix.
sc = SpectralClustering(
    n_clusters=5, 
    affinity='precomputed', 
    n_components=2
)
sc.fit(X_vt_1_graph_full)

In [None]:
# Samples on the first two PCA components, colored by spectral cluster label.
plt.scatter(X_vt_1_pca[:, 0], X_vt_1_pca[:, 1], c=sc.labels_)

Procena kvaliteta klasterovanja

In [None]:
# Silhouette score of the spectral clustering, using cosine distance.
silhouette_score(X_vt_1, labels=sc.labels_, metric='cosine')

In [None]:
# Homogeneity of the spectral clusters with respect to the class codes.
homogeneity_score(klase.values.ravel(), sc.labels_)

Pronalaženje klastera hijerarhijskim klasterovanjem

In [None]:
# Average-linkage agglomerative clustering into 15 clusters under cosine
# distance. The distance is passed as `metric`: the older `affinity` keyword
# was deprecated and later removed from AgglomerativeClustering, so using it
# raises a TypeError on current scikit-learn releases.
ac = AgglomerativeClustering(
    n_clusters=15,
    metric='cosine',
    linkage='average'
)
fitted_model = ac.fit(X_vt_1)

In [None]:
# Samples on the first two PCA components, colored by agglomerative cluster label.
plt.scatter(X_vt_1_pca[:, 0], X_vt_1_pca[:, 1], c=ac.labels_)

Procena kvaliteta klasterovanja

In [None]:
# Silhouette score of the agglomerative clustering, using cosine distance.
silhouette_score(X_vt_1, labels=ac.labels_, metric='cosine')

In [None]:
# Homogeneity of the agglomerative clusters with respect to the class codes.
homogeneity_score(klase.values.ravel(), ac.labels_)

# Klasifikacija

In [None]:
# Classification targets: the values in klase.
y = klase

# TODO: also try the grouped class labels
# and excluding the samples of the "ambiguous" class

# y_filter = y['0'] != 1
# y = y[y_filter]
# X = data.iloc[:, 1:][y_filter]

Feature selection pomoću <i>Relief</i> algoritma

In [None]:
# r = relief.Relief(
#     n_features=100
# )

# X_vt_1_r_100 = r.fit_transform(
#     X_vt_1.values,
#     y.values.ravel()
# )

Izdvajanje $k$ najperspektivnijih atributa

In [None]:
# TODO: try different values of k.
# Keep the 20 features with the highest f_classif score against y.
# Note: the selection runs on X, not on the variance-filtered X_vt_1.
skb = SelectKBest(score_func=f_classif, k=20)
X_vt_1_skb_20 = skb.fit_transform(X, y.values.ravel())

Klasifikacija na osnovu odabranih atributa pomoću SVM klasifikatora (testirati druge klasifikatore, npr. <i>DecisionTreeClassifier</i> sa različitim parametrima)

In [None]:
# Hold out 30% of the samples for testing. The features are the 20 columns
# selected by SelectKBest (X_vt_1_skb_20), the only SelectKBest output this
# notebook defines.
X_train, X_test, y_train, y_test = train_test_split(
    X_vt_1_skb_20, y.values.ravel(), test_size=0.3
)

# RBF-kernel SVM with C=1.0, trained on the training split.
svm = SVC(C=1.0, kernel='rbf')
svm.fit(X_train, y_train)

Evaluacija modela na trening i test podacima

In [None]:
# Accuracy on the training split; score() takes the features and their true
# labels, so the second argument is y_train.
svm.score(X_train, y_train)

In [None]:
# Accuracy on the held-out test split.
svm.score(X_test, y_test)

In [None]:
# Confusion matrix on the test split.
y_pred = svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

Vizualizacija rezultata pomoću PCA koordinata

In [None]:
# Refit PCA on the 20 selected features; this rebinds the name `pca`.
pca = PCA()
X_vt_1_skb_20_pca = pca.fit_transform(X_vt_1_skb_20)

In [None]:
# Fraction of the total variance explained by the first two components.
pca.explained_variance_ratio_[:2].sum()

In [None]:
# Original classes.
_ = plt.scatter(X_vt_1_skb_20_pca[:, 0], X_vt_1_skb_20_pca[:, 1], c=y.values.ravel())

In [None]:
# Predicted classes. The plot shows every sample, so the colors must be
# predictions for every sample; y_pred only covers the test split and its
# length would not match the number of plotted points.
y_pred_all = svm.predict(X_vt_1_skb_20)
_ = plt.scatter(X_vt_1_skb_20_pca[:, 0], X_vt_1_skb_20_pca[:, 1], c=y_pred_all)

# Klasterovanje na osnovu atributa izdvojenih posredstvom poznavanja oznaka klasa

In [None]:
# K-means with 5 clusters on the 20 selected features.
km = KMeans(n_clusters=5)
km.fit(X_vt_1_skb_20)

In [None]:
# Samples on the first two PCA components, colored by k-means cluster label.
_ = plt.scatter(X_vt_1_skb_20_pca[:, 0], X_vt_1_skb_20_pca[:, 1], c=km.labels_)

In [None]:
# Silhouette score of the k-means clustering, computed on the same 20
# selected features the model was fitted on.
silhouette_score(X_vt_1_skb_20, km.labels_)

In [None]:
# Homogeneity of the k-means clusters with respect to y.
homogeneity_score(y.values.ravel(), km.labels_)

In [None]:
# Spectral clustering with 5 clusters on the 20 selected features;
# this rebinds the name `sc`.
sc = SpectralClustering(n_clusters=5)
sc.fit(X_vt_1_skb_20)

In [None]:
# Samples on the first two PCA components, colored by spectral cluster label.
_ = plt.scatter(X_vt_1_skb_20_pca[:, 0], X_vt_1_skb_20_pca[:, 1], c=sc.labels_)

In [None]:
# Silhouette score of the spectral clustering, computed on the same 20
# selected features the model was fitted on.
silhouette_score(X_vt_1_skb_20, sc.labels_)

In [None]:
# Homogeneity of the spectral clusters with respect to y.
homogeneity_score(y.values.ravel(), sc.labels_)

In [None]:
# Hierarchical (agglomerative) clustering with Ward linkage into 5 clusters
# on the 20 selected features; this rebinds the name `ac`.
ac = AgglomerativeClustering(n_clusters=5, linkage='ward')
ac.fit(X_vt_1_skb_20)

In [None]:
# Samples on the first two PCA components, colored by agglomerative cluster label.
_ = plt.scatter(X_vt_1_skb_20_pca[:, 0], X_vt_1_skb_20_pca[:, 1], c=ac.labels_)

In [None]:
# Silhouette score of the agglomerative clustering, computed on the same 20
# selected features the model was fitted on.
silhouette_score(X_vt_1_skb_20, ac.labels_)

In [None]:
# Homogeneity of the agglomerative clusters with respect to y.
homogeneity_score(y.values.ravel(), ac.labels_)

In [None]:
# Drawing the dendrogram
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a linkage matrix from the model's ``children_``, ``distances_``
    and ``labels_`` attributes and hands it to ``dendrogram``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``children_`` (one row of two child indices
        per merge), ``distances_`` (one value per merge) and ``labels_``
        (one entry per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples under each merge node. An index below
    # leaf_total is a single sample; any other index refers to an earlier
    # merge, whose count has already been filled in.
    counts = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        counts[row] = sum(
            1 if node < leaf_total else counts[node - leaf_total]
            for node in pair
        )

    # Columns: first child, second child, merge distance, sample count.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, counts]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Fit the complete merge tree. With n_clusters=None and distance_threshold=0
# no cluster count is fixed in advance; plot_dendrogram reads the fitted
# model's children_, distances_ and labels_ attributes.
# Fitted on X_vt_1_skb_20, the 20 selected features used by the other
# clustering models in this section.
ac_full = AgglomerativeClustering(
    n_clusters=None,
    linkage='ward',
    distance_threshold=0
)
ac_full_trained = ac_full.fit(X_vt_1_skb_20)

In [None]:
# Draw the dendrogram of the full tree; truncate_mode and p are forwarded
# to scipy's dendrogram.
plot_dendrogram(ac_full_trained, truncate_mode="level", p=10)