In [1]:
import matplotlib.pyplot as plt

In [2]:
import sys
import re
import pickle
from pathlib import Path
from collections import Counter
from itertools import combinations
from multiprocessing.sharedctypes import RawArray

import torch
import json
import yaml
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from pqdm.processes import pqdm
from IPython.core.display import HTML
from scipy.special import softmax
from scipy.spatial.distance import jensenshannon, cdist
from kneed import KneeLocator

import matplotlib
import matplotlib.cm as cm
import matplotlib.colors as mcolors

sys.path.append("../soup_nuts/models/dvae/")

from dvae import data_iterator, CollapsedMultinomial, DVAE
from utils import load_sparse, compute_to

In [3]:
def load_json(path):
    """
    Read and parse the JSON document stored at `path`.

    The file is decoded as UTF-8 explicitly: JSON interchange files are UTF-8,
    whereas a bare `open(path)` falls back to the platform's locale encoding and
    can mis-decode or fail on non-ASCII content (e.g. vocabulary entries).

    Args:
        path: location of the JSON file (anything accepted by `open`).

    Returns:
        The parsed Python object.
    """
    with open(path, encoding="utf-8") as infile:
        return json.load(infile)

def load_yaml(path):
    """Parse the YAML file at `path` with PyYAML's `FullLoader` and return the result."""
    with open(path) as stream:
        parsed = yaml.load(stream, Loader=yaml.FullLoader)
    return parsed

def load_text(path):
    """
    Read a text file as a list of token lists, one per line.

    Each line is stripped of surrounding whitespace and split on single spaces,
    so consecutive spaces produce empty-string tokens and a blank line yields [""].
    """
    tokenized = []
    with open(path) as handle:
        for raw_line in handle:
            tokenized.append(raw_line.strip().split(" "))
    return tokenized

def save_json(obj, path):
    """
    Serialize `obj` as JSON (2-space indentation) into the file at `path`.

    The file is created or overwritten. Returns whatever `json.dump` returns,
    which is None.
    """
    with open(path, 'w') as sink:
        outcome = json.dump(obj, sink, indent=2)
    return outcome

In [4]:
def topic_est_to_words(topic_words, inv_vocab, n=10):
    """
    Return the `n` highest-weighted words of a single topic.

    Args:
        topic_words: 1-d array of per-word weights for one topic.
        inv_vocab: mapping from word index to word string.
        n: how many top words to return.

    Returns:
        List of words ordered from highest to lowest weight.
    """
    # Negating the weights turns the ascending argsort into a descending ranking.
    ranked_ids = (-topic_words).argsort()
    top_words = []
    for word_id in ranked_ids[:n]:
        top_words.append(inv_vocab[word_id])
    return top_words

### Estimate loading functions

In [5]:
# Whether torch reports a usable CUDA device; evaluated once when this cell runs.
_CUDA_AVAILABLE = torch.cuda.is_available()
# Training matrices already loaded by load_dvae_estimates, keyed by data path,
# so runs that share a dataset do not reload it from disk.
_data_cache = {}

def load_mallet_estimates(fpath):
    """
    Read the topic-word and document-topic estimates of one mallet run.

    Args:
        fpath: `pathlib.Path` of the run folder, which must contain `beta.npy`
            and `doctopics.txt`.

    Returns:
        (topic_word, doc_topic, None): the array stored in `beta.npy`, the
        document-topic matrix parsed from `doctopics.txt`, and None in the slot
        where other loaders return per-document losses.
    """
    beta = np.load(fpath / "beta.npy")

    # doctopics.txt is tab-separated; the first two columns of every row are
    # skipped and the remaining columns are parsed as floats.
    rows = []
    with open(fpath / "doctopics.txt") as handle:
        for record in handle:
            fields = record.strip().split("\t")
            rows.append([float(value) for value in fields[2:]])
    theta = np.array(rows)

    return beta, theta, None # TODO: is loss available in mallet?

def load_dvae_estimates(fpath):
    """
    Load a trained dvae run and recompute its estimates on the training data.

    Reads `model.pt` and `config.yml` from `fpath`, takes the topic-word matrix
    from the decoder's `eta_layer` weights, then rebuilds the model and runs a
    forward pass over the training data to obtain the document-topic estimates
    and the per-document reconstruction loss.

    Args:
        fpath: `pathlib.Path` of the run folder.

    Returns:
        (topic_word, doc_topic, losses):
            topic_word: transposed `eta_layer` weight, shape (num_topics, vocab_size).
            doc_topic: float32 array of shape (n_docs, num_topics); each row is the
                encoder output rescaled to sum to 1.
            losses: float32 array of shape (n_docs,) holding the negative
                `CollapsedMultinomial` log-probability of each document.
    """
    # get the topic word
    device = torch.device("cuda") if _CUDA_AVAILABLE else torch.device("cpu")

    # NOTE: torch.load unpickles the checkpoint; only point this at run folders you trust.
    state_dict = torch.load(fpath / "model.pt", map_location=device)
    beta = state_dict["params"]["decoder$$$eta_layer.weight"]
    topic_word = torch.transpose(beta, 0, 1).cpu().detach().numpy()

    # do a forward pass to get the document topics
    # first instantiate the model and load in the params
    config = load_yaml(fpath / "config.yml")

    dvae = DVAE(
        vocab_size=topic_word.shape[1],
        num_topics=config["num_topics"],
        alpha_prior=config["alpha_prior"],
        embeddings_dim=config["encoder_embeddings_dim"],
        hidden_dim=config["encoder_hidden_dim"],
        dropout=config["dropout"],
        cuda=_CUDA_AVAILABLE,
    )
    # Stored parameter names use "$$$" where the module's state dict uses ".".
    dvae_dict = {
        k.replace("$$$", "."): v
        for k, v in state_dict['params'].items()
    }
    # NOTE(review): strict=False silently ignores missing/unexpected keys — confirm
    # that is intended and not hiding a mismatch between checkpoint and model.
    dvae.load_state_dict(dvae_dict, strict=False)
    dvae.eval()
    turn_off_bn = int(config["epochs_to_anneal_bn"] > 0) # 0 means use BN, > 0 means no BN

    # then load the data for the forward pass
    data_fpath = Path(config["input_dir"], config["train_path"])
    if data_fpath not in _data_cache:
        _data_cache[data_fpath] = load_sparse(data_fpath).astype(np.float32)
    data = _data_cache[data_fpath]

    batch_size = config["batch_size"]
    n = data.shape[0]
    # NOTE(review): when n is an exact multiple of batch_size this asks for one
    # batch more than needed — confirm data_iterator copes with that.
    train_batches = n // batch_size + 1

    # do the forward pass and collect outputs in an array
    doc_topic = np.zeros((n, config["num_topics"]), dtype=np.float32)
    losses = np.zeros(n, dtype=np.float32)
    # Pure inference: disable autograd so no graph is built and kept per batch.
    with torch.no_grad():
        # Results are written back by batch position, which assumes data_iterator
        # yields batches in row order — TODO confirm it does not shuffle.
        for i, x_batch in enumerate(data_iterator(data, batch_size, train_batches)):
            x_batch = x_batch.to(device)
            doc_topic_batch = dvae.encoder(x_batch)
            doc_topic_batch = doc_topic_batch / doc_topic_batch.sum(1, keepdims=True)
            x_recon = dvae.decoder(doc_topic_batch, bn_annealing_factor=turn_off_bn)
            loss_batch = -CollapsedMultinomial(1, probs=x_recon).log_prob(x_batch)

            rows = slice(i * batch_size, (i + 1) * batch_size)
            doc_topic[rows] = doc_topic_batch.detach().cpu().numpy().astype(np.float32)
            losses[rows] = loss_batch.detach().cpu().numpy().astype(np.float32)
    return topic_word, doc_topic, losses


def load_etm_estimates(fpath):
    """
    Load the ETM estimates from a model.

    Not implemented yet. The other loaders return a
    (topic_word, doc_topic, losses) tuple, so silently returning None here would
    only fail later with an unrelated unpacking error; fail loudly instead.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(f"Loading ETM estimates is not implemented (requested for {fpath})")


def load_estimates(fpath, model_type):
    """
    Dispatch to the estimate loader that matches `model_type`.

    Args:
        fpath: `pathlib.Path` of the run folder.
        model_type: "dvae" or "mallet".

    Returns:
        The (topic_word, doc_topic, losses) tuple produced by the selected loader.

    Raises:
        ValueError: if `model_type` is not a supported model. Previously this
            case fell through and returned None, which callers then failed to
            unpack with an unrelated error.
    """
    if model_type == "dvae":
        return load_dvae_estimates(fpath)
    if model_type == "mallet":
        return load_mallet_estimates(fpath)
    raise ValueError(f"Unknown model_type {model_type!r}; expected 'dvae' or 'mallet'")

### collect the runs

In [6]:
# roughly 7 GB RAM for k=100
def get_estimates_over_runs(run_paths, overlap_words, exclude_dups=False):
    """
    Load the estimates of several runs and count duplicated topics in each.

    Two topics of a run count as duplicates when their `overlap_words`
    highest-weighted words are the same set.

    Args:
        run_paths: iterable of (run folder, model type) pairs.
        overlap_words: number of top words compared between topics.
        exclude_dups: when True, runs containing any duplicated topic are dropped.

    Returns:
        (doc_topics, topic_words, duplicates): three parallel lists with one entry
        per kept run; `duplicates` holds the number of distinct top-word sets that
        occur more than once in that run.
    """
    doc_topics = []
    topic_words = []
    duplicates = [] # TODO: change to 3d tensors
    for run_dir, model_type in tqdm(run_paths):
        beta, theta, _losses = load_estimates(run_dir, model_type=model_type)

        # Signature of a topic: the sorted indices of its top `overlap_words` words.
        top_ids = (-beta).argsort(axis=1)[:, :overlap_words]
        signature_counts = Counter(map(tuple, np.sort(top_ids, axis=1)))
        occurrences = signature_counts.values()

        if exclude_dups and max(occurrences) > 1:
            continue

        doc_topics.append(theta)
        topic_words.append(beta)
        duplicates.append(sum(count > 1 for count in occurrences))

    return doc_topics, topic_words, duplicates

#### takes relatively longer to run $\downarrow$

In [7]:
#run_dir = "../runs/outputs/url_partisanship_data"
run_dir = "../runs/outputs/full-mindf_power_law-maxdf_0.9"
dataset = 'nytimes' #'url_partisan'

# Each entry is (run folder, model type). Path.glob yields matches in an
# arbitrary, filesystem-dependent order, so the matches are sorted: a run's
# position in these lists is what identifies it in later cells, and it should
# be the same on every machine and every re-run.
mallet_paths = [
    (p.parent, "mallet")
    for p in sorted(Path(run_dir).glob("**/mallet-with-beta/**/doctopics.txt"))
    if dataset in str(p)
]
dvae_paths = [
    (p.parent, "dvae")
    for p in sorted(Path(run_dir).glob("**/dvae/**/model.pt"))
    if dataset in str(p)
]

# Fail with a readable message instead of an IndexError on dvae_paths[0] below.
if not dvae_paths:
    raise FileNotFoundError(
        f"No dvae runs matching dataset {dataset!r} were found under {run_dir!r}"
    )

# should be independent of the model
config = load_yaml(dvae_paths[0][0] / "config.yml")
data = load_sparse(Path(config["input_dir"], "train.dtm.npz"))
vocab = load_json(Path(config["input_dir"], "vocab.json"))
# vocab maps word -> index; invert it to look words up by index.
inv_vocab = {v: k for k, v in vocab.items()}


In [8]:
# Coarse- and fine-grained label arrays stored alongside the training data;
# later cells pass them as the gold labels to the clustering metrics.
labels_coarse = np.load(Path(config["input_dir"], "train.labels.coarse.npy"))
labels_fine = np.load(Path(config["input_dir"], "train.labels.fine.npy"))

In [9]:
# dvae_paths = dvae_paths
# mallet_paths = mallet_paths

In [10]:
len(dvae_paths), len(mallet_paths)

(82, 84)

In [11]:
# Distinct topic counts (k) encoded as "k-<number>" in the dvae run folders.
# Entries of dvae_paths are (run folder, model type) tuples, so the pattern is
# matched against the folder path itself rather than the tuple's repr, and
# folders without a "k-<number>" component are skipped instead of raising
# AttributeError on a None match.
num_topics = sorted({
    int(match.group(1))
    for match in (re.search(r"k-([0-9]+)", str(path)) for path, _ in dvae_paths)
    if match is not None
})
num_topics

[50]

In [12]:
# NOTE(review): presumably the number of top words shown per topic; it is not
# referenced in the cells visible here — confirm it is still needed.
n_topic_words = 15
# Number of top words per topic compared when looking for duplicated topics
# (passed to get_estimates_over_runs).
overlap_words = 5

#### Gather the betas (topic-word) and thetas (document-topic) for all the mallet and dvae runs $\downarrow$

In [13]:
# For every topic count k, load the estimates of all dvae and mallet runs
# trained with that k, then pickle the dvae estimates.
estimates_dvae = {}
estimates_mallet = {}
for k in num_topics:
    print(f"On k={k}")

    # dvae runs whose folder path contains the "k-<k>/" component
    dvae_paths_k = [entry for entry in dvae_paths if f'k-{k}/' in str(entry[0])]
    runs = len(dvae_paths_k)
    if len(dvae_paths_k) > 0:
        doc_topics, topic_words, duplicates = get_estimates_over_runs(
            dvae_paths_k, overlap_words, exclude_dups=False
        )
        estimates_dvae[k] = dict(
            doc_topics=doc_topics, topic_words=topic_words, duplicates=duplicates
        )

    # same selection for the mallet runs
    mallet_paths_k = [entry for entry in mallet_paths if f'k-{k}/' in str(entry[0])]
    if len(mallet_paths_k) > 0:
        doc_topics, topic_words, duplicates = get_estimates_over_runs(
            mallet_paths_k, overlap_words, exclude_dups=False
        )
        estimates_mallet[k] = dict(
            doc_topics=doc_topics, topic_words=topic_words, duplicates=duplicates
        )

# NOTE(review): only the dvae estimates are written to disk; confirm the mallet
# estimates are meant to stay in memory only.
with open(f"dvae-{dataset}-estimates.pkl", "wb") as outfile:
    pickle.dump(estimates_dvae, outfile)

On k=50


  0%|          | 0/82 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Find the "user labels" for the documents 

In [191]:
from collections import defaultdict, Counter

In [192]:
# Document-topic matrices of the dvae runs trained with k=50 (one list entry per run).
doc_topics = estimates_dvae[50]['doc_topics']

In [193]:
# Hard-assign every document to its highest-weighted topic, separately for each
# dvae run. Note that `document_index` enumerates runs (entries of doc_topics),
# not documents; the value stored per run is one label per document.
user_labels_dvae = {}
for document_index, run_doc_topics in enumerate(doc_topics):
    user_labels = np.argmax(run_doc_topics, axis=1)
    user_labels_dvae[document_index] = user_labels

In [194]:
user_labels_dvae

{0: array([34, 34, 34, ..., 34, 39, 34]),
 1: array([20, 20, 20, ..., 20, 20, 20]),
 2: array([21, 40, 21, ..., 40, 21, 40]),
 3: array([21, 21, 21, ..., 31, 21, 21]),
 4: array([15, 46, 15, ..., 15, 24, 15]),
 5: array([26, 26, 26, ..., 26, 26, 26]),
 6: array([20, 20, 20, ..., 38, 20, 20]),
 7: array([26, 27, 46, ..., 27, 46, 26]),
 8: array([38, 31, 26, ..., 31, 38, 31]),
 9: array([20, 38, 38, ..., 38, 20, 38]),
 10: array([29,  8, 29, ...,  0, 29,  8]),
 11: array([ 5, 47,  5, ..., 47,  5, 47]),
 12: array([38, 34, 30, ..., 34, 19, 34]),
 13: array([29, 27, 29, ..., 27, 29, 27]),
 14: array([42, 20, 20, ..., 20, 42, 20]),
 15: array([39, 34, 39, ..., 34, 39, 34]),
 16: array([46, 46, 46, ..., 46, 46, 21]),
 17: array([31, 21, 31, ..., 21, 31, 21]),
 18: array([29, 29, 29, ..., 29, 29, 29]),
 19: array([31, 31, 31, ..., 10, 31, 31]),
 20: array([26,  4, 26, ..., 27, 26, 27]),
 21: array([21,  0, 21, ..., 33, 20,  0]),
 22: array([42, 16, 42, ..., 16, 42, 16]),
 23: array([26, 31, 2

In [195]:
# Document-topic matrices of the mallet runs trained with k=50 (one list entry per run).
doc_topics_mallet = estimates_mallet[50]['doc_topics']

In [196]:
# Hard-assign every document to its highest-weighted topic, separately for each
# mallet run. Note that `document_index` enumerates runs (entries of
# doc_topics_mallet), not documents; the value stored per run is one label per document.
user_labels_mallet = {}
for document_index, run_doc_topics in enumerate(doc_topics_mallet):
    user_labels = np.argmax(run_doc_topics, axis=1)
    user_labels_mallet[document_index] = user_labels

In [197]:
user_labels_mallet

{0: array([28, 10, 28, ..., 10, 28, 10]),
 1: array([41, 45, 41, ..., 45, 41, 45]),
 2: array([10, 14, 10, ..., 14, 10, 14]),
 3: array([46, 46, 46, ..., 46, 46, 46]),
 4: array([18, 18, 18, ..., 18, 18, 18]),
 5: array([41, 45, 41, ..., 45, 41, 45]),
 6: array([6, 6, 6, ..., 6, 6, 6]),
 7: array([6, 6, 6, ..., 6, 6, 6]),
 8: array([28, 10, 28, ..., 10, 28, 10]),
 9: array([18, 18, 18, ..., 18, 18, 18]),
 10: array([28, 10, 28, ..., 10, 28, 10]),
 11: array([17, 14, 17, ..., 14, 17, 10]),
 12: array([48, 18, 48, ..., 18, 48, 18]),
 13: array([18, 37, 31, ..., 18, 18, 31]),
 14: array([20, 20, 20, ..., 20, 20, 20]),
 15: array([21, 21,  0, ..., 21,  0, 21]),
 16: array([18, 18, 18, ..., 18, 18, 18]),
 17: array([ 3, 19,  3, ..., 19,  3, 19]),
 18: array([37, 37, 37, ..., 37, 37, 37]),
 19: array([10, 14, 10, ..., 14, 10, 14]),
 20: array([6, 6, 6, ..., 6, 6, 6]),
 21: array([17, 17, 17, ..., 17, 17, 17]),
 22: array([10, 10, 10, ..., 10, 10, 10]),
 23: array([35, 32, 18, ..., 32, 31, 32

In [153]:
np.unique(user_labels_mallet[4], return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 array([ 2036,  4390,  6060,  7542,  5712,  2903,  4754,  4596,  4961,
         4274,  4789,  4424,  3732,  4219,  7316,  5351,  5121,  8968,
        18312,  8332,  3944,  2136,  2996,  5380,  8896,  3479,  4283,
        15141,  3005,  3310,  4952, 10375,  2682, 11379,  6229, 10150,
         8651,  2753,  3072,  6140,  2899,  3182,  3381,  1749,  4530,
         4371,  4781,  4441,  1507,  4784]))

In [14]:
def purity(user_labels, gold_labels):
    """
    Calculates the Purity metric as described in https://aclanthology.org/P16-1110/
    "ALTO: Active Learning with Topic Overviews for Speeding Label Induction and Document Labeling"

    Documents are grouped by their user label; each group contributes the count
    of its most frequent gold label, and the total is divided by the number of
    documents.

    For sanity check - the purity of a labeling against itself should be 1.

    Args:
        user_labels: sequence of hashable cluster labels, one per document.
        gold_labels: sequence of hashable reference labels, aligned with `user_labels`.

    Returns:
        Purity as a float in (0, 1].

    Raises:
        AssertionError: if the two labelings differ in length.
        ValueError: if the labelings are empty (purity is undefined; this
            previously surfaced as a bare ZeroDivisionError).
    """
    assert len(user_labels) == len(gold_labels)

    n_docs = len(user_labels)
    if n_docs == 0:
        raise ValueError("purity is undefined for an empty labeling")

    # Count every (user label, gold label) co-occurrence in a single pass,
    # pairing the two labelings by position.
    pair_counts = Counter(zip(user_labels, gold_labels))

    # For each user-label cluster keep the size of its largest gold-label group;
    # taking a running max avoids sorting each cluster's histogram.
    majority_size = {}
    for (cluster_label, _gold_label), count in pair_counts.items():
        if count > majority_size.get(cluster_label, 0):
            majority_size[cluster_label] = count

    return sum(majority_size.values()) / n_docs

In [15]:
from sklearn.metrics import rand_score, normalized_mutual_info_score , adjusted_rand_score

In [16]:
purity(user_labels_dvae[1], user_labels_dvae[0]) 

NameError: name 'user_labels_dvae' is not defined

In [201]:
rand_score(user_labels_dvae[1], user_labels_dvae[0])

0.9700630406253116

In [202]:
normalized_mutual_info_score(user_labels_dvae[1], user_labels_dvae[0])

0.6056388595988299

In [203]:
metrics = (purity, rand_score, normalized_mutual_info_score)

In [204]:
len(Counter(labels_fine)), len(Counter(labels_coarse))

(2877, 31)

In [205]:
#user_labels_dvae

In [221]:
def get_dataframe(user_labels_list, model) :
    """
    Score every run's document labels against the fine and coarse gold labels.

    Args:
        user_labels_list: per-run label arrays, indexable by 0..len-1
            (a list, or a dict keyed by run index).
        model: model name, written to the "model" column and used as the prefix
            of the "index" column.

    Returns:
        DataFrame with one row per run and the columns model, index, rand_*,
        nmi_*, purity_* (each for the fine and coarse labels). The rand_* columns
        hold the adjusted Rand score. Reads the notebook-level `labels_fine` and
        `labels_coarse`.
    """
    def score_run(index):
        # All six agreement scores for the run stored at `index`.
        user_labels = user_labels_list[index]
        return {
            "model": f"{model}",
            "index": f"{model}_{index}",
            "rand_fine": adjusted_rand_score(user_labels, labels_fine),
            "rand_coarse": adjusted_rand_score(user_labels, labels_coarse),
            "nmi_fine": normalized_mutual_info_score(user_labels, labels_fine),
            "nmi_coarse": normalized_mutual_info_score(user_labels, labels_coarse),
            "purity_fine": purity(user_labels, labels_fine),
            "purity_coarse": purity(user_labels, labels_coarse),
        }

    records = [score_run(index) for index in range(len(user_labels_list))]
    return pd.DataFrame(records)

In [222]:
dvae = get_dataframe(user_labels_dvae, "dvae")

In [223]:
dvae.head()

Unnamed: 0,model,index,rand_fine,rand_coarse,nmi_fine,nmi_coarse,purity_fine,purity_coarse
0,dvae,dvae_0,0.316941,0.156117,0.446646,0.364865,0.321519,0.515589
1,dvae,dvae_1,0.394868,0.173889,0.467502,0.376476,0.333767,0.535815
2,dvae,dvae_2,0.257027,0.142913,0.466106,0.387155,0.353181,0.55042
3,dvae,dvae_3,0.350939,0.179454,0.484734,0.383542,0.354966,0.545049
4,dvae,dvae_4,0.282265,0.143717,0.392989,0.322242,0.312131,0.507714


In [224]:
mallet = get_dataframe(user_labels_mallet, "mallet")

In [225]:
mallet.head()

Unnamed: 0,model,index,rand_fine,rand_coarse,nmi_fine,nmi_coarse,purity_fine,purity_coarse
0,mallet,mallet_0,0.350499,0.182586,0.455902,0.371386,0.326765,0.545684
1,mallet,mallet_1,0.343367,0.178253,0.45981,0.370611,0.333029,0.543213
2,mallet,mallet_2,0.317573,0.15406,0.456268,0.374088,0.332316,0.551709
3,mallet,mallet_3,0.407702,0.190262,0.468044,0.377941,0.342655,0.55244
4,mallet,mallet_4,0.422896,0.194206,0.468109,0.370147,0.348897,0.541473


In [226]:
df = pd.concat([dvae, mallet], ignore_index=True)

In [227]:
df.shape

(166, 8)

In [188]:
df.head()

Unnamed: 0,model,index,rand_fine,rand_coarse,nmi_fine,nmi_coarse,purity_fine,purity_coarse
0,dvae,dvae_0,0.316941,0.156117,0.446646,0.364865,0.321519,0.515589
1,dvae,dvae_1,0.394868,0.173889,0.467502,0.376476,0.333767,0.535815
2,dvae,dvae_2,0.257027,0.142913,0.466106,0.387155,0.353181,0.55042
3,dvae,dvae_3,0.350939,0.179454,0.484734,0.383542,0.354966,0.545049
4,dvae,dvae_4,0.282265,0.143717,0.392989,0.322242,0.312131,0.507714


In [229]:
df.to_csv("coverage_results_for_viz.csv", index=False) 

In [157]:
# Per-run clustering scores of the dvae runs against the fine and coarse gold
# labels, collected as a list of plain dicts (one per run).
# Fixes the dict literal, which did not parse: the "purity_coarse" entry used
# `=` instead of `:` and was missing its trailing comma.
dvae_stuff = []
for index in range(len(user_labels_dvae)) :
    user_labels = user_labels_dvae[index]

    d = {
        "rand_fine": adjusted_rand_score(user_labels, labels_fine),
        "rand_coarse": adjusted_rand_score(user_labels, labels_coarse),
        "nmi_fine": normalized_mutual_info_score(user_labels, labels_fine),
        "nmi_coarse": normalized_mutual_info_score(user_labels, labels_coarse),
        "purity_fine": purity(user_labels, labels_fine),
        "purity_coarse": purity(user_labels, labels_coarse),
        "model": "dvae",
        "index": f"dvae_{index}",
    }

    dvae_stuff.append(d)

Model 0
	 Rand Scores: 0.156,0.317              	NMI Scores: 0.365,0.447              	Purity Scores: 0.516,0.322
Model 1
	 Rand Scores: 0.174,0.395              	NMI Scores: 0.376,0.468              	Purity Scores: 0.536,0.334
Model 2
	 Rand Scores: 0.143,0.257              	NMI Scores: 0.387,0.466              	Purity Scores: 0.550,0.353
Model 3
	 Rand Scores: 0.179,0.351              	NMI Scores: 0.384,0.485              	Purity Scores: 0.545,0.355
Model 4
	 Rand Scores: 0.144,0.282              	NMI Scores: 0.322,0.393              	Purity Scores: 0.508,0.312


In [231]:
# Print, for every dvae run, its agreement with the gold labels. Each metric is
# shown as "coarse,fine"; the "Rand" figures are adjusted Rand scores.
for index in range(len(user_labels_dvae)):
    user_labels = user_labels_dvae[index]
    rand_fine, rand_coarse = (
        adjusted_rand_score(user_labels, gold) for gold in (labels_fine, labels_coarse)
    )
    nmi_fine, nmi_coarse = (
        normalized_mutual_info_score(user_labels, gold) for gold in (labels_fine, labels_coarse)
    )
    purity_fine, purity_coarse = (
        purity(user_labels, gold) for gold in (labels_fine, labels_coarse)
    )

    # The runs of spaces inside the literals are part of the printed line.
    print(
        f"Model {index}\n\t Rand Scores: {rand_coarse:.3f},{rand_fine:.3f} "
        f"             \tNMI Scores: {nmi_coarse:.3f},{nmi_fine:.3f} "
        f"             \tPurity Scores: {purity_coarse:.3f},{purity_fine:.3f}"
    )

Model 0
	 Rand Scores: 0.156,0.317              	NMI Scores: 0.365,0.447              	Purity Scores: 0.516,0.322
Model 1
	 Rand Scores: 0.174,0.395              	NMI Scores: 0.376,0.468              	Purity Scores: 0.536,0.334
Model 2
	 Rand Scores: 0.143,0.257              	NMI Scores: 0.387,0.466              	Purity Scores: 0.550,0.353
Model 3
	 Rand Scores: 0.179,0.351              	NMI Scores: 0.384,0.485              	Purity Scores: 0.545,0.355
Model 4
	 Rand Scores: 0.144,0.282              	NMI Scores: 0.322,0.393              	Purity Scores: 0.508,0.312
Model 5
	 Rand Scores: 0.203,0.404              	NMI Scores: 0.380,0.481              	Purity Scores: 0.546,0.361
Model 6
	 Rand Scores: 0.156,0.336              	NMI Scores: 0.367,0.444              	Purity Scores: 0.534,0.329
Model 7
	 Rand Scores: 0.148,0.252              	NMI Scores: 0.364,0.436              	Purity Scores: 0.521,0.308
Model 8
	 Rand Scores: 0.164,0.316              	NMI Scores: 0.392,0.477              	P

Model 72
	 Rand Scores: 0.142,0.292              	NMI Scores: 0.373,0.478              	Purity Scores: 0.539,0.352
Model 73
	 Rand Scores: 0.141,0.253              	NMI Scores: 0.383,0.458              	Purity Scores: 0.540,0.339
Model 74
	 Rand Scores: 0.194,0.380              	NMI Scores: 0.393,0.474              	Purity Scores: 0.552,0.336
Model 75
	 Rand Scores: 0.160,0.310              	NMI Scores: 0.386,0.474              	Purity Scores: 0.555,0.350
Model 76
	 Rand Scores: 0.166,0.337              	NMI Scores: 0.381,0.476              	Purity Scores: 0.547,0.348
Model 77
	 Rand Scores: 0.162,0.323              	NMI Scores: 0.388,0.484              	Purity Scores: 0.554,0.357
Model 78
	 Rand Scores: 0.171,0.355              	NMI Scores: 0.391,0.493              	Purity Scores: 0.557,0.365
Model 79
	 Rand Scores: 0.136,0.241              	NMI Scores: 0.346,0.414              	Purity Scores: 0.515,0.294
Model 80
	 Rand Scores: 0.151,0.292              	NMI Scores: 0.358,0.443       

In [230]:
# Print, for every mallet run, its agreement with the gold labels. Each metric
# is shown as "coarse,fine"; the "Rand" figures are adjusted Rand scores.
for index in range(len(user_labels_mallet)):
    user_labels = user_labels_mallet[index]
    rand_fine, rand_coarse = (
        adjusted_rand_score(user_labels, gold) for gold in (labels_fine, labels_coarse)
    )
    nmi_fine, nmi_coarse = (
        normalized_mutual_info_score(user_labels, gold) for gold in (labels_fine, labels_coarse)
    )
    purity_fine, purity_coarse = (
        purity(user_labels, gold) for gold in (labels_fine, labels_coarse)
    )

    # The runs of spaces inside the literals are part of the printed line.
    print(
        f"Model {index}\n\t Rand Scores: {rand_coarse:.3f},{rand_fine:.3f} "
        f"             \tNMI Scores: {nmi_coarse:.3f},{nmi_fine:.3f} "
        f"             \tPurity Scores: {purity_coarse:.3f},{purity_fine:.3f}"
    )

Model 0
	 Rand Scores: 0.183,0.350              	NMI Scores: 0.371,0.456              	Purity Scores: 0.546,0.327
Model 1
	 Rand Scores: 0.178,0.343              	NMI Scores: 0.371,0.460              	Purity Scores: 0.543,0.333
Model 2
	 Rand Scores: 0.154,0.318              	NMI Scores: 0.374,0.456              	Purity Scores: 0.552,0.332
Model 3
	 Rand Scores: 0.190,0.408              	NMI Scores: 0.378,0.468              	Purity Scores: 0.552,0.343
Model 4
	 Rand Scores: 0.194,0.423              	NMI Scores: 0.370,0.468              	Purity Scores: 0.541,0.349
Model 5
	 Rand Scores: 0.167,0.352              	NMI Scores: 0.371,0.467              	Purity Scores: 0.552,0.353
Model 6
	 Rand Scores: 0.190,0.406              	NMI Scores: 0.363,0.445              	Purity Scores: 0.531,0.320
Model 7
	 Rand Scores: 0.199,0.430              	NMI Scores: 0.372,0.466              	Purity Scores: 0.548,0.341
Model 8
	 Rand Scores: 0.183,0.359              	NMI Scores: 0.369,0.458              	P

Model 72
	 Rand Scores: 0.193,0.408              	NMI Scores: 0.372,0.457              	Purity Scores: 0.544,0.329
Model 73
	 Rand Scores: 0.168,0.354              	NMI Scores: 0.372,0.468              	Purity Scores: 0.553,0.353
Model 74
	 Rand Scores: 0.173,0.345              	NMI Scores: 0.380,0.470              	Purity Scores: 0.555,0.354
Model 75
	 Rand Scores: 0.201,0.422              	NMI Scores: 0.368,0.459              	Purity Scores: 0.539,0.333
Model 76
	 Rand Scores: 0.195,0.427              	NMI Scores: 0.371,0.467              	Purity Scores: 0.543,0.345
Model 77
	 Rand Scores: 0.170,0.340              	NMI Scores: 0.370,0.457              	Purity Scores: 0.545,0.329
Model 78
	 Rand Scores: 0.197,0.433              	NMI Scores: 0.380,0.472              	Purity Scores: 0.552,0.359
Model 79
	 Rand Scores: 0.192,0.417              	NMI Scores: 0.379,0.470              	Purity Scores: 0.554,0.344
Model 80
	 Rand Scores: 0.197,0.431              	NMI Scores: 0.370,0.465       

In [None]:
users