In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt          # plotting
from scipy.sparse import csr_matrix      # sparse matrices
%matplotlib inline

In [2]:
# Read people_wiki.csv into the DataFrame `wiki`.
# NOTE(review): presumably one row per Wikipedia person article, aligned with
# the rows of the TF-IDF matrix loaded below -- confirm against the data.
wiki = pd.read_csv('people_wiki.csv')

In [3]:
def load_sparse_csr(filename):
    '''Load a SciPy CSR matrix from a NumPy ``.npz`` archive.

    The archive must contain four arrays named ``data``, ``indices``,
    ``indptr`` and ``shape``: the three CSR component arrays plus the
    matrix dimensions.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``np.load`` that refers to an ``.npz`` archive.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix rebuilt from the stored components.

    Raises
    ------
    KeyError
        If one of the four expected arrays is missing from the archive.
    '''
    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the
    # underlying file open. Read everything inside a `with` block so the file
    # handle is released as soon as the arrays are in memory.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        shape = loader['shape']

    # `shape` comes back as a small integer array; hand csr_matrix a plain
    # tuple of Python ints.
    return csr_matrix((data, indices, indptr),
                      shape=tuple(int(dim) for dim in shape))

In [4]:
# Rebuild the sparse TF-IDF matrix from the CSR components stored in the .npz file.
tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')

In [5]:
# Load the index-to-word mapping from its JSON file.
# NOTE(review): presumably this relates TF-IDF column indices to vocabulary
# words; the direction of the mapping is not visible here -- confirm.
import json
with open('people_wiki_map_index_to_word.json','r') as f:
    map_index_to_word = json.load(f)

In [6]:
from sklearn.preprocessing import normalize

In [7]:
# Rescale every row of tf_idf to unit length (sklearn's normalize defaults to
# the L2 norm along axis=1). This rebinds tf_idf, so the un-normalized matrix
# is no longer reachable under this name after the cell runs.
tf_idf = normalize(tf_idf)

In [8]:
def get_initial_centroids(data, k, seed=None):
    '''Select k rows of `data` at random to act as the starting centroids.

    data : sparse matrix with one data point per row (must support row
           indexing with an index array and `.toarray()`).
    k    : number of centroids to draw.
    seed : optional seed; when given, NumPy's global random generator is
           re-seeded so the same rows are drawn on every call.

    Returns a dense array with k rows, one per chosen data point.
    '''
    # Re-seed the global generator only when the caller asks for repeatability.
    if seed is not None:
        np.random.seed(seed)

    num_points = data.shape[0]

    # Draw k row indices uniformly from [0, num_points). Each draw is
    # independent, so the same index can come up more than once.
    chosen_rows = np.random.randint(0, num_points, k)

    # Centroids are kept dense: once points are averaged, a centroid has a
    # nonzero weight for every word that appears in any document of its
    # cluster, so a sparse layout would save little.
    return data[chosen_rows, :].toarray()

In [10]:
from sklearn.metrics.pairwise import pairwise_distances
# Get the TF-IDF vectors for documents 100 and 101 (the slice end is
# exclusive, so exactly two query rows are selected).
queries = tf_idf[100:102,:]

# Compute pairwise distances from every data point to each query vector.
# The result has one row per data point and one column per query.
dist = pairwise_distances(tf_idf, queries, metric='euclidean')

# Call form of print: with a single argument it behaves the same on
# Python 2 and Python 3, and matches the print(...) calls used in the test cell.
print(dist)

[[ 1.41000789  1.36894636]
 [ 1.40935215  1.41023886]
 [ 1.39855967  1.40890299]
 ..., 
 [ 1.41108296  1.39123646]
 [ 1.41022804  1.31468652]
 [ 1.39899784  1.41072448]]


In [12]:
# Inspect dimensions: dist should have one row per row of tf_idf and one
# column per query vector.
dist.shape, tf_idf.shape

((59071, 2), (59071, 547979))

In [13]:
'''Test cell'''
# Checkpoint: prints 'Pass' when `dist` matches the Euclidean distance between
# rows 430 and 1 of tf_idf (pairwise_distances returns a 1x1 array here, which
# np.allclose broadcasts against `dist`).
# NOTE(review): the recorded run prints 'Check your code again' because `dist`
# is still the (59071, 2) matrix produced by the query cell; presumably `dist`
# should be recomputed as the single 430-vs-1 distance before this cell runs
# -- confirm against the assignment.
if np.allclose(dist, pairwise_distances(tf_idf[430,:], tf_idf[1,:])):
    print('Pass')
else:
    print('Check your code again')

Check your code again
