In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
# Project root: two directory levels above the current working directory.
d = Path().resolve().parent.parent
# Build the CSV location with pathlib's "/" operator instead of concatenating
# strings, so path separators are handled by pathlib.
author_institution = pd.read_csv(d / "data" / "generated" / "author_institution.csv")

In [None]:
# For every institution_id, collect the author_id values of its rows into a list.
institution_author_series = author_institution.groupby('institution_id')['author_id'].apply(list)

In [None]:
# Plain dict view of the grouped Series: institution_id -> list of author_id.
institution_author_dict = institution_author_series.to_dict()

In [None]:
# The .npy file holds a single pickled Python object, unwrapped with .item()
# and indexed by author_id in the next cell.  NumPy >= 1.16.3 refuses to
# unpickle object arrays unless allow_pickle=True is passed explicitly.
# SECURITY NOTE: unpickling can execute arbitrary code -- only load this file
# when it was produced by a trusted step of this project.
author_topicList = np.load('author_topicList.npy', allow_pickle=True).item()

In [None]:
# Map each institution to the concatenation of the topic lists of all its
# authors (authors are visited in the order stored in institution_author_dict).
final_dict = {
    institution: [
        topic
        for author in authors
        for topic in author_topicList[author]
    ]
    for institution, authors in institution_author_dict.items()
}

In [None]:
# Turn every institution's flat topic list into an 8-slot histogram:
# slot t holds how many times topic index t occurs for that institution.
institution_top_cluster = {}
for institution, topics in final_dict.items():
    histogram = [0] * 8
    for topic in topics:
        histogram[topic] = histogram[topic] + 1
    institution_top_cluster[institution] = histogram

In [None]:
# Stack the per-institution count lists into one array, one row per institution
# (row order follows the iteration order of institution_top_cluster).
values = np.array(list(institution_top_cluster.values()))
# Display the array as the cell output.
values

# K-MEANS

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Fit K-Means on the institution count vectors; random_state is fixed so the
# resulting labels are repeatable.
# NOTE(review): n_clusters=37 is hard-coded -- presumably picked from the
# evaluation cells further down (which compare 25/31/37); confirm.
kmeans = KMeans(n_clusters=37, random_state=0).fit(values)

In [None]:
# Pair every institution key with the K-Means label found at the same position;
# final_dict is walked in the same order that produced the rows of `values`.
institution_topic_cluster = [
    [institution, kmeans.labels_[position]]
    for position, institution in enumerate(final_dict)
]

In [None]:
# Write one "<institution>, <cluster label>" line per institution.  The context
# manager closes (and flushes) the file even when a write raises, which the
# previous explicit open()/close() pair did not guarantee.
with open("institution-topic-cluster.csv", "w") as text_file:
    for element in institution_topic_cluster:
        text_file.write("%s, %s \n" % (element[0],element[1]))


Candidate internal validation measures for clustering without ground-truth labels (source: https://stats.stackexchange.com/questions/21807/evaluation-measure-of-clustering-without-having-truth-labels):

Silhouette index (implementation in MATLAB) (the closer to 1 the better)

Davies-Bouldin (the smaller the better)

Calinski-Harabasz (the higher the better) https://scikit-learn.org/stable/modules/generated/sklearn.metrics.calinski_harabasz_score.html

Dunn index (implementation in MATLAB)

R-squared index

Hubert-Levin (C-index)

Krzanowski-Lai index

Hartigan index https://github.com/teddyroland/python-hartigan/blob/master/hartigan.py

Root-mean-square standard deviation (RMSSTD) index

Semi-partial R-squared (SPR) index

Distance between two clusters (CD) index

Weighted inter-intra index

Homogeneity index

Separation index

In [None]:
from sklearn.cluster import KMeans
from numpy import array

def hartigan_K(list_of_tuples, threshold = 12, random_state = None):
    """Estimate the number of K-Means clusters using Hartigan's rule.

    K-Means is fitted for k = 1, 2, 3, ... and for each k >= 1 the index

        H(k) = (W(k) / W(k + 1) - 1) * (N - k - 1)

    is evaluated, where W(k) is the K-Means inertia (sum of within-cluster
    squared distances to the centroids) for k clusters and N is the number
    of points.  The search stops at the first k whose H(k) is not above
    ``threshold`` and that k is returned.

    Parameters
    ----------
    list_of_tuples : sequence
        The points to cluster, in any form accepted by ``KMeans.fit``.
    threshold : number, default 12
        Goodness-of-fit cut-off.  Hartigan recommends 10, but
        Chiang & Mirkin confirm values up to 12.
    random_state : optional, default None
        Forwarded to every ``KMeans`` instance.  ``None`` keeps the previous
        unseeded behaviour; pass an int to make the estimate reproducible.

    Returns
    -------
    int
        The selected number of clusters.  If the threshold is never reached
        before the loop runs out of points, the trivial answer N is returned.
    """
    n_points = len(list_of_tuples)

    # Slot k stores the inertia of the k-cluster fit (slot 0 stays unused).
    inertia_list = np.zeros(n_points + 1)
    num = 0                    # number of fits performed so far
    H_Rule = threshold + 1     # start above the threshold so the loop is entered

    while num < n_points and H_Rule > threshold:
        kmn = KMeans(n_clusters = num+1, random_state = random_state)
        kmn.fit(list_of_tuples)
        inertia_list[num+1] += kmn.inertia_
        if num > 0:
            # Compare the fit with `num` clusters against the fit with `num + 1`.
            # NOTE(review): an exactly-zero inertia makes this a division by
            # zero; that degenerate case is not guarded here.
            H_Rule = ((float(inertia_list[num])/inertia_list[num+1])-1)*(n_points-(num)-1)
        num += 1

    # Loop ended without meeting the threshold: compensate so that the
    # `num - 1` below yields the number of the last fit performed.
    if H_Rule > threshold:
        num += 1

    return num-1

In [None]:
# Estimate the number of clusters for the institution count vectors with Hartigan's rule.
hartigan_K(values)

Analyzing the silhouette coefficient and the Calinski-Harabasz score in order to choose the optimal number of clusters

In [None]:
from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np



# Candidate numbers of clusters to compare.
range_n_clusters = [25,31,37]

# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and
# matplotlib renamed the "spectral" colormap to "nipy_spectral"; the old names
# have since been removed.  Use the new name when present and fall back to the
# old one so the cell runs on both old and current library versions.
calinski_harabasz_fn = (getattr(metrics, "calinski_harabasz_score", None)
                        or getattr(metrics, "calinski_harabaz_score"))
spectral_cmap = getattr(cm, "nipy_spectral", None) or getattr(cm, "spectral")

for n_clusters in range_n_clusters:
    # One figure per candidate: silhouette plot on the left, scatter on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient ranges over [-1, 1]; the x axis is limited
    # to [-0.1, 1] here.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 term leaves blank space between the silhouette
    # bands of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(values) + (n_clusters + 1) * 10])

    # Fixed random_state (10) so the labels are reproducible.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(values)

    # silhouette_score is the average silhouette over all samples; both
    # scores are printed for every candidate n_clusters.
    silhouette_avg = silhouette_score(values, cluster_labels)
    calinski_harabaz = calinski_harabasz_fn(values, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg,
         "Calinski harab is:",calinski_harabaz)

    # Per-sample silhouette values, used to draw the bands below.
    sample_silhouette_values = silhouette_samples(values, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette values of the samples assigned to cluster i, sorted
        # in place (ascending).
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = spectral_cmap(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster number at the band's middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next band: leave a 10-unit gap.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line marking the average silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot: the samples coloured by cluster.  Only the first two feature
    # columns of `values` are drawn.
    colors = spectral_cmap(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(values[:, 0], values[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Mark the cluster centres (first two coordinates) with white circles ...
    centers = clusterer.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # ... and write each cluster's number inside its circle.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

    plt.show()

# DBSCAN (http://scikit-learn.org/stable/modules/clustering.html)
Testing DBSCAN on the standardized institution topic-count vectors.

In [None]:
# NOTE(review): print(__doc__) looks carried over from a script-style example;
# presumably it has no purpose inside this notebook -- confirm and drop.
print(__doc__)

import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
# NOTE(review): the sklearn.datasets.samples_generator module was removed in
# scikit-learn 0.24, and make_blobs is not used anywhere in this cell, so this
# import can only fail on newer versions -- confirm before keeping it.
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler


# #############################################################################


# Standardize the institution count vectors before running DBSCAN.
X = StandardScaler().fit_transform(values)

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# Boolean mask that is True for the samples DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# silhouette_score requires 2 <= number of distinct labels <= n_samples - 1 and
# raises ValueError otherwise (e.g. when DBSCAN marks every sample as noise).
# Report that case instead of aborting the cell before the plot is drawn.
n_distinct_labels = len(set(labels))
if 2 <= n_distinct_labels <= len(X) - 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels))
else:
    print("Silhouette Coefficient: undefined (%d distinct label(s))"
          % n_distinct_labels)

# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples of this label: large markers (first two columns of X only).
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Non-core samples of this label: small markers.
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

# Other
Other clustering algorithms were also tested in order to choose the best one.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster
import time
%matplotlib inline
# Seaborn styling applied to the plots drawn after this point.
sns.set_context('poster')
sns.set_color_codes()
# Keyword arguments that plot_clusters forwards to plt.scatter.
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}
def plot_clusters(data, algorithm, args, kwds):
    """Cluster ``data`` with ``algorithm``, print quality scores and plot the result.

    Parameters
    ----------
    data : array
        Samples to cluster, one row per sample.  Only the first two columns
        are drawn in the scatter plot.
    algorithm : class
        Clustering estimator class; it is instantiated as
        ``algorithm(*args, **kwds)`` and must provide ``fit_predict``.
    args : tuple
        Positional arguments for the estimator constructor.
    kwds : dict
        Keyword arguments for the estimator constructor.

    Returns
    -------
    None
        The function prints the scores and draws on the current matplotlib axes.
    """
    start_time = time.time()
    labels = algorithm(*args, **kwds).fit_predict(data)
    # Stop the clock right after clustering so the "Clustering took" text does
    # not include the time spent computing the quality scores.
    end_time = time.time()

    # calinski_harabaz_score was renamed calinski_harabasz_score in scikit-learn
    # and the old name was later removed; accept whichever one is available.
    calinski_fn = (getattr(metrics, "calinski_harabasz_score", None)
                   or getattr(metrics, "calinski_harabaz_score"))

    # Both scores need 2 <= number of distinct labels <= n_samples - 1 and raise
    # ValueError otherwise; skip them in that case so the plot is still drawn.
    n_labels = len(np.unique(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette_avg = silhouette_score(data, labels)
        calinski_harabaz = calinski_fn(data, labels)
        print("The average silhouette_score is :", silhouette_avg,
             "Calinski harab is:",calinski_harabaz)
    else:
        print("Clustering scores are undefined for", n_labels, "distinct label(s)")

    # One palette colour per non-negative label; negative labels are drawn in black.
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    plt.text(-0.5, 0.7, 'Clustering took {:.2f} s'.format(end_time - start_time), fontsize=14)

In [None]:
# K-Means with 6 clusters.
plot_clusters(values, cluster.KMeans, (), {'n_clusters':6})

In [None]:
# Affinity propagation.
# NOTE(review): preference=-5.0 and damping=0.95 look like untuned example
# values -- confirm they were chosen for this dataset.
plot_clusters(values, cluster.AffinityPropagation, (), {'preference':-5.0, 'damping':0.95})

In [None]:
# Mean shift; 0.175 is passed as the first positional constructor argument.
# NOTE(review): `values` holds unscaled integer counts, so a value of 0.175 is
# far below the distance between any two distinct rows -- confirm it is intended.
plot_clusters(values, cluster.MeanShift, (0.175,), {'cluster_all':False})

In [None]:
# Agglomerative clustering with Ward linkage and 6 clusters.
plot_clusters(values, cluster.AgglomerativeClustering, (), {'n_clusters':6, 'linkage':'ward'})

In [None]:
# DBSCAN on the unscaled count vectors.
# NOTE(review): rows of `values` are integer counts, so distinct rows are at
# least distance 1 apart; eps=0.025 can therefore only link identical rows --
# confirm this is intended (the earlier DBSCAN cell standardizes first).
plot_clusters(values, cluster.DBSCAN, (), {'eps':0.025})