In [7]:
from DataSets import csv_read as cv
import numpy as np

In [2]:
# Load the graph matrix and the seed table through the project's csv_read helpers.
# Later cells use seeds[:,0]-1 as row positions and seeds[:,1] as the reference labels.
# NOTE(review): G is not referenced again in the cells shown here.
G = cv.load_exp_graph_matrix()
seeds = cv.load_seed()

In [3]:
# Spectral embedding for g_type='exp', loaded through the csv_read helper.
X_sp = cv.load_spectral_embedding(g_type='exp')

In [8]:
# Keep only the first 500 embedding columns. X_sp is rebound here, so the
# loading cell must be re-run to get the full embedding back.
X_sp = X_sp[:,:500]
X_sp.shape

(6000, 500)

In [5]:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn import metrics

def run_metrics(X, labels_pred, labels_true=seeds[:,1]):
    """Print clustering quality scores for a label assignment.

    The label-based scores compare ``labels_true`` with the predicted labels of
    the seed rows only (rows ``seeds[:,0]-1`` of ``labels_pred``). The
    silhouette coefficient is computed on all of ``X`` with ``labels_pred``
    using the squared Euclidean metric.

    Parameters
    ----------
    X : feature matrix handed to ``metrics.silhouette_score``.
    labels_pred : predicted cluster ids, indexable by ``seeds[:,0]-1``.
    labels_true : reference labels for the seed rows. The default,
        ``seeds[:,1]``, is evaluated once, when this function is defined.

    Returns
    -------
    None. All results are printed.
    """
    seed_pred = labels_pred[seeds[:,0]-1]

    # (format string, scorer) pairs, printed in this order.
    seed_scorers = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
    )
    for template, scorer in seed_scorers:
        print(template % scorer(labels_true, seed_pred))

    # The silhouette needs no reference labels, so it uses every row of X.
    silhouette = metrics.silhouette_score(X, labels_pred, metric='sqeuclidean')
    print("Silhouette Coefficient: %0.3f" % silhouette)

def v_measure(labels_pred, labels_true=seeds[:,1]):
    """Return the V-measure of the seed rows' predicted labels.

    Only rows ``seeds[:,0]-1`` of ``labels_pred`` are scored against
    ``labels_true`` (default ``seeds[:,1]``, evaluated at definition time).
    """
    seed_rows = seeds[:,0]-1
    return metrics.v_measure_score(labels_true, labels_pred[seed_rows])
    
def clust_kmeans(X, n_clusters=10, random_state=None):
    """Cluster ``X`` with k-means, print the quality metrics, and return the labels.

    Parameters
    ----------
    X : feature matrix to cluster.
    n_clusters : number of clusters to fit (default 10, the value that was
        previously hard-coded).
    random_state : forwarded to ``KMeans``. Pass an int to make the clustering
        reproducible across runs; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The fitted estimator's ``labels_`` array (one cluster id per row of ``X``).
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)
    run_metrics(X, kmeans.labels_)
    return kmeans.labels_

def clust_gmm(X, n_components=10, max_iter=200, n_init=10, random_state=None):
    """Cluster ``X`` with a Gaussian mixture, print the quality metrics, and return the labels.

    Parameters
    ----------
    X : feature matrix to cluster.
    n_components : number of mixture components (default 10).
    max_iter : forwarded to ``GaussianMixture`` (default 200).
    n_init : forwarded to ``GaussianMixture`` (default 10).
    random_state : forwarded to ``GaussianMixture``. Pass an int to make the
        fit reproducible; ``None`` keeps the previous unseeded behaviour.

    The defaults reproduce the values that were previously hard-coded.

    Returns
    -------
    The component index predicted for each row of ``X``.
    """
    gmm = GaussianMixture(
        n_components=n_components,
        max_iter=max_iter,
        n_init=n_init,
        random_state=random_state,
    ).fit(X)
    labels = gmm.predict(X)
    run_metrics(X, labels)
    return labels

In [33]:
# k-means on the truncated spectral embedding. Note that `kmeans` holds the
# returned label array, not the fitted estimator.
kmeans = clust_kmeans(X_sp)

Homogeneity: 0.520
Completeness: 0.575
V-measure: 0.546
Adjusted Rand Index: 0.215
Adjusted Mutual Information: 0.310
Silhouette Coefficient: -0.126


In [36]:
# Gaussian mixture on the truncated spectral embedding. `gmm` holds the
# predicted label array, not the fitted model.
gmm = clust_gmm(X_sp)

Homogeneity: 0.550
Completeness: 0.595
V-measure: 0.572
Adjusted Rand Index: 0.243
Adjusted Mutual Information: 0.349
Silhouette Coefficient: -0.142


In [10]:
# Load the extracted features in two forms: the PCA variant with k=500 and
# the output of the plain load_extracted_features() helper.
X_PCA = cv.load_extracted_features_PCA(k=500)
X_f = cv.load_extracted_features()

In [39]:
from sklearn.cross_decomposition import CCA

# Fit an 8-component CCA between the first 6000 rows of X_PCA and X_sp,
# then transform every row of X_PCA with the fitted model.
# NOTE(review): 6000 presumably matches the row count of X_sp (its shape is
# displayed as (6000, 500) in an earlier cell) - confirm.
cca = CCA(n_components=8)
cca.fit(X_PCA[:6000], X_sp)
cca_preds = cca.transform(X_PCA)

In [40]:
# k-means on the CCA-transformed PCA features. This rebinds `kmeans` to the new label array.
kmeans = clust_kmeans(cca_preds)

Homogeneity: 0.643
Completeness: 0.671
V-measure: 0.657
Adjusted Rand Index: 0.387
Adjusted Mutual Information: 0.472
Silhouette Coefficient: 0.261


In [41]:
# Gaussian mixture on the CCA-transformed PCA features. This rebinds `gmm` to the new label array.
gmm = clust_gmm(cca_preds)

Homogeneity: 0.714
Completeness: 0.744
V-measure: 0.728
Adjusted Rand Index: 0.513
Adjusted Mutual Information: 0.577
Silhouette Coefficient: 0.217


In [42]:
# Shape check: column 0 of X_f reshaped into a single-column 2-D array.
X_f[:,0].reshape(-1,1).shape

(10000, 1)

In [16]:
def cm(labels_pred, labels_true=seeds[:,1]):
    """Print accuracy and micro-averaged F1 for the seed rows; return their confusion matrix.

    Parameters
    ----------
    labels_pred : predicted labels, indexable by ``seeds[:,0]-1``.
    labels_true : reference labels for the seed rows (default ``seeds[:,1]``,
        evaluated when this function is defined).

    Returns
    -------
    ``metrics.confusion_matrix(labels_true, <seed predictions>)``.
    """
    seed_pred = labels_pred[seeds[:,0]-1]

    accuracy = metrics.accuracy_score(labels_true, seed_pred)
    print("Accuracy: " + str(accuracy))

    micro_f1 = metrics.f1_score(labels_true, seed_pred, average='micro')
    print("F1 score: " + str(micro_f1))

    return metrics.confusion_matrix(labels_true, seed_pred)

# `gmm` holds the raw component indices returned by clust_gmm, so this
# compares the mixture's own numbering directly with the seed labels.
cm(gmm)

Accuracy: 0.0166666666667
F1 score: 0.0166666666667


array([[0, 0, 0, 0, 0, 6, 0, 0, 0, 0],
       [0, 0, 0, 6, 0, 0, 0, 0, 0, 0],
       [0, 6, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 5, 0],
       [0, 0, 0, 0, 0, 0, 0, 6, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0, 3],
       [0, 0, 4, 0, 1, 1, 0, 0, 0, 0],
       [4, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 1, 0, 1, 0, 0, 3],
       [1, 0, 0, 0, 0, 0, 0, 4, 0, 1]])

In [70]:
def best_pred(labels_pred, labels_true=seeds[:,1], n_clusters=10):
    """Find the cluster-id -> class-label permutation that maximises seed accuracy.

    The previous implementation tried every permutation of ``range(10)``
    (10! = 3,628,800 candidates) and left its result unbound when no
    permutation scored above zero. This version solves the same problem as an
    assignment problem on the cluster/class contingency table, so it always
    returns a mapping. When several permutations reach the same best accuracy,
    the one returned here may differ from the first one the exhaustive search
    would have found.

    Parameters
    ----------
    labels_pred : predicted cluster ids, indexable by ``seeds[:,0]-1``;
        values are expected to lie in ``range(n_clusters)``.
    labels_true : reference labels for the seed rows (default ``seeds[:,1]``,
        evaluated when this function is defined).
    n_clusters : number of clusters / classes to permute (default 10).

    Returns
    -------
    tuple of int
        ``d`` such that ``d[cluster_id]`` is the class label assigned to that
        cluster; ``d`` is a permutation of ``range(n_clusters)``.
    """
    # Local import, in the same style as the itertools import it replaces.
    from scipy.optimize import linear_sum_assignment

    seed_classes = np.asarray(labels_true).astype(int)
    seed_clusters = np.asarray(labels_pred[seeds[:,0]-1]).astype(int)
    # The original loop only looked at the first len(labels_true) predictions.
    seed_clusters = seed_clusters[:seed_classes.shape[0]]

    # overlap[c, k] = number of seeds placed in cluster c whose true class is k.
    # A true label outside range(n_clusters) can never be matched by any
    # permutation, so such seeds are left out of the table.
    matchable = (seed_classes >= 0) & (seed_classes < n_clusters)
    overlap = np.zeros((n_clusters, n_clusters), dtype=int)
    np.add.at(overlap, (seed_clusters[matchable], seed_classes[matchable]), 1)

    # Maximising the number of matched seeds == minimising the negated overlap.
    cluster_ids, class_ids = linear_sum_assignment(-overlap)

    mapping = [0] * n_clusters
    for cluster_id, class_id in zip(cluster_ids, class_ids):
        mapping[cluster_id] = int(class_id)
    return tuple(mapping)

def best_labels(d,labels_pred):
    """Relabel every prediction through the mapping ``d``.

    Parameters
    ----------
    d : sequence (or other indexable) where ``d[cluster_id]`` is the label
        that cluster should be renamed to.
    labels_pred : iterable of predicted cluster ids.

    Returns
    -------
    numpy.ndarray with one relabelled entry per element of ``labels_pred``.

    The length now follows ``labels_pred`` instead of a hard-coded 10000, so
    shorter inputs no longer raise IndexError and longer inputs are no longer
    silently truncated.
    """
    return np.array([d[cluster_id] for cluster_id in labels_pred])

In [65]:
# Find the cluster-id -> class-label mapping that best matches the seed labels
# for the current `gmm` label array, and show it.
best_map = best_pred(gmm)
print(best_map)

(3, 4, 5, 2, 0, 7, 8, 1, 9, 6)

In [71]:
# NOTE(review): best_map is hard-coded here with the same values that the
# best_pred cell printed. It will go stale if `gmm` is recomputed, because the
# component numbering can change between fits.
best_map= [3, 4, 5, 2, 0, 7, 8, 1, 9, 6]
# Relabel every row's cluster id through the mapping.
preds = best_labels(best_map, gmm)

In [73]:
# Re-score the seed rows after the cluster ids have been mapped through best_map.
cm(preds)

Accuracy: 0.716666666667
F1 score: 0.716666666667


array([[5, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 6, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 5, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 5, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 6, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 6, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 5, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 5, 0, 0],
       [0, 3, 0, 0, 1, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 4, 1, 0, 1, 0, 0]])

In [94]:
# Build an (id, label) table: 1-based row ids 1..10000 in the first column,
# mapped predictions in the second. Display the rows from position 6000 on.
x = np.arange(1, 10001)
out = np.column_stack([x, preds])
out[6000:]

array([[ 6001,     1],
       [ 6002,     4],
       [ 6003,     0],
       ..., 
       [ 9998,     0],
       [ 9999,     5],
       [10000,     7]])

In [93]:
# Write the rows from position 6000 on as comma-separated integers.
# comments='' stops np.savetxt from prefixing the header with its default
# '# ', so the first line of the file is exactly 'Id,Label'.
np.savetxt("CCA_ALL_PREDS", out[6000:], delimiter=',', fmt='%d', header='Id,Label', comments='')

In [47]:
# Same CCA set-up as the X_PCA cell, but fitted on the first 6000 rows of X_f.
# This rebinds both `cca` and `cca_preds`.
cca = CCA(n_components=8)
cca.fit(X_f[:6000], X_sp)
cca_preds = cca.transform(X_f)

In [48]:
# k-means on the CCA transform of X_f. This rebinds `kmeans` to the new label array.
kmeans = clust_kmeans(cca_preds)

Homogeneity: 0.744
Completeness: 0.771
V-measure: 0.757
Adjusted Rand Index: 0.533
Adjusted Mutual Information: 0.619
Silhouette Coefficient: 0.233


In [49]:
# Gaussian mixture on the CCA transform of X_f. This rebinds `gmm`, so `preds`
# and `out` from the earlier cells no longer correspond to it.
gmm = clust_gmm(cca_preds)

Homogeneity: 0.753
Completeness: 0.808
V-measure: 0.779
Adjusted Rand Index: 0.553
Adjusted Mutual Information: 0.642
Silhouette Coefficient: 0.219


In [None]:
# Load the 'dist'/'euc' graph and its spectral embedding.
# NOTE(review): this cell shows no execution count (In [None]), and the
# following cell fails with "name 'X_euc' is not defined". Run this cell
# before the cell that slices X_euc.
dist_euc = cv.load_graph(shape_match=True, g_type='dist', dist_type='euc')
X_euc = cv.load_spectral_embedding(g_type='dist', dist_type='euc')

In [9]:
# Keep the first 500 embedding columns, mirroring the X_sp cell. The previous
# X_euc[:,500] selected only the single column at index 500.
# The cell that loads X_euc must be run before this one.
X_euc = X_euc[:, :500]
X_euc.shape

NameError: name 'X_euc' is not defined