In [1]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import matplotlib.pyplot as plt

# Read the corpus CSV and keep only the complaint narrative column,
# discarding rows where the narrative is missing.
df_bal = pd.read_csv("../../corpus_sprint2_balanced_cp.csv", encoding="utf-8")
df = df_bal["Consumer complaint narrative"].dropna()

# Doc2Vec model restored from disk.
model = Doc2Vec.load("../../model_doc2vec_balanced_20epochs")

# Document vectors held by the model; this matrix is what gets clustered below.
X = model.docvecs.vectors_docs

In [2]:
def init_medoids(X, k):
    """Pick k distinct rows of X to serve as the initial medoids.

    The selection is deterministic: a private generator seeded with 1 is
    used, so repeated calls on data of the same length return the same rows.
    The private generator draws exactly the indices that seeding NumPy's
    global legacy generator with 1 would produce, but it leaves the global
    random state untouched, so calling this function does not silently
    change the random numbers drawn elsewhere.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Data matrix with one point per row.
    k : int
        Number of medoids to draw; must not exceed ``len(X)``.

    Returns
    -------
    numpy.ndarray
        A new ``(k, X.shape[1])`` array holding the selected rows.

    Raises
    ------
    ValueError
        Raised by ``choice`` when ``k`` is larger than ``len(X)``.
    """
    from numpy.random import RandomState

    rng = RandomState(1)
    # Sampling without replacement guarantees k different row indices.
    samples = rng.choice(len(X), size=k, replace=False)
    return X[samples, :]
def compute_d_p(X, medoids, p):
    """Return the p-th powers of the p-norm distances from points to medoids.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        One point per row.
    medoids : numpy.ndarray
        Either a 2-D array with one medoid per row, or a single medoid given
        as a 1-D array (it is then treated as a one-row matrix).
    p : int or float
        Order of the norm, forwarded to ``numpy.linalg.norm`` as ``ord``.

    Returns
    -------
    numpy.ndarray
        Array ``S`` of shape ``(len(X), number_of_medoids)`` in which
        ``S[i, j]`` equals ``||X[i] - medoids[j]||_p ** p``.
    """
    # A lone medoid passed as a flat vector becomes a one-row matrix.
    if medoids.ndim == 1:
        medoids = medoids[np.newaxis, :]

    n_points = len(X)
    n_medoids = len(medoids)
    S = np.empty((n_points, n_medoids))

    for row in range(n_points):
        # Broadcasting subtracts the current point from every medoid at once.
        offsets = X[row, :] - medoids
        distances = np.linalg.norm(offsets, ord=p, axis=1)
        S[row, :] = distances ** p

    return S
def assign_labels(S):
    """Map each point to the index of its closest medoid.

    ``S`` holds one row per point and one column per medoid. The result
    contains, for every row, the column index of the smallest entry (the
    first such column when several entries tie).
    """
    closest = np.argmin(S, axis=1)
    return closest
def _cluster_cost(cluster_points, candidate, p):
    """Sum of p-th powers of p-norm distances from cluster_points to candidate."""
    return np.sum(np.linalg.norm(cluster_points - candidate, ord=p, axis=1) ** p)


def update_medoids(points, medoids, p):
    """Return improved medoids: the cheapest member of each cluster.

    Every point is first assigned to its nearest medoid. Then, for each
    non-empty cluster, every member is tried as a replacement medoid and the
    one with the lowest total dissimilarity to the members of that same
    cluster wins. The current medoid is kept when no member is strictly
    cheaper.

    The dissimilarity is evaluated against the cluster's own members only,
    which is what the k-medoids update step minimises, and each candidate is
    scored with a single vectorised NumPy call.

    Parameters
    ----------
    points : numpy.ndarray, 2-D
        Data matrix with one point per row.
    medoids : numpy.ndarray, 2-D
        Current medoids, one per row. This array is not modified.
    p : int or float
        Order of the norm used for the dissimilarity (p-norm raised to p).

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as ``medoids``. Rows of clusters that
        received no points are returned unchanged.
    """
    S = compute_d_p(points, medoids, p)
    labels = assign_labels(S)

    # Work on a copy so the caller's array is never mutated in place.
    out_medoids = medoids.copy()

    # Only clusters that actually received points are visited.
    for i in np.unique(labels):
        cluster_points = points[labels == i]
        best_cost = _cluster_cost(cluster_points, medoids[i], p)

        for datap in cluster_points:
            cost = _cluster_cost(cluster_points, datap, p)
            # Strict comparison: ties leave the current choice in place.
            if cost < best_cost:
                best_cost = cost
                out_medoids[i] = datap

    return out_medoids
def has_converged(old_medoids, medoids):
    """Tell whether two medoid collections contain the same points.

    Each row is turned into a tuple and the two collections are compared as
    sets, so neither row order nor repeated rows affect the outcome.
    """
    previous = {tuple(row) for row in old_medoids}
    current = {tuple(row) for row in medoids}
    return previous == current
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf, verbose=True):
    """Cluster the rows of X around k medoids.

    The medoids are updated repeatedly until an update leaves the set of
    medoids unchanged or ``max_steps`` iterations have been performed.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Data matrix with one point per row.
    k : int
        Number of clusters; only used when ``starting_medoids`` is None.
    p : int or float
        Order of the norm used for the dissimilarity (p-norm raised to p).
    starting_medoids : array-like or None
        Initial medoids, one per row. A copy is taken, so the caller's array
        is never modified. When None, ``init_medoids(X, k)`` chooses them.
    max_steps : int or float
        Upper bound on the number of iterations (unbounded by default).
    verbose : bool
        When True (the default) progress messages are printed.

    Returns
    -------
    tuple
        ``(medoids, labels)`` where ``labels[i]`` is the index of the medoid
        in the returned ``medoids`` that is nearest to ``X[i]``.
    """
    log = print if verbose else (lambda *args: None)

    log("START")
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        # Copy so that medoid updates can never leak into the caller's array.
        medoids = np.array(starting_medoids)
    log("INIT MEDIODS DONE")

    converged = False
    labels = None
    # True whenever `labels` does not describe the current `medoids`.
    labels_stale = True
    i = 1
    while (not converged) and (i <= max_steps):
        log("START ")
        log(i)
        old_medoids = medoids.copy()
        S = compute_d_p(X, medoids, p)
        log("COMPUTED ")
        labels = assign_labels(S)
        log("ASSIGNED ")
        medoids = update_medoids(X, medoids, p)
        log("UPDATED ")
        converged = has_converged(old_medoids, medoids)
        log("CONVERGED ")
        # The labels above were computed from the pre-update medoids; they
        # remain valid only if the update changed nothing.
        labels_stale = not np.array_equal(old_medoids, medoids)
        i += 1

    # Stopping on max_steps (or never iterating) would otherwise return
    # labels that do not match the returned medoids.
    if labels_stale:
        labels = assign_labels(compute_d_p(X, medoids, p))
    return (medoids, labels)

In [3]:
# Cluster the document vectors around 3 medoids using the squared
# Euclidean dissimilarity (2-norm raised to the power 2).
results = kmedoids(X, k=3, p=2)

START
INIT MEDIODS DONE
START 
1
COMPUTED 
ASSIGNED 


KeyboardInterrupt: 