In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy as scp

import nltk
from nltk.corpus import wordnet as wn
from itertools import product

from sklearn.model_selection import train_test_split

%matplotlib inline

# Column names are supplied explicitly via `names=` when reading the file.
headers = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'class',
]

# The two-character separator ', ' is why the python parsing engine is
# requested; '?' entries are read as missing values.
# Drop all records with missing values and renumber the remaining rows.
adult = (
    pd.read_csv('adult/adult.data', sep=', ', names=headers,
                na_values='?', engine='python')
    .dropna()
    .reset_index(drop=True)
)

# Target labels, and the features without the label and without 'fnlwgt'.
y = adult['class']
X = adult.drop(columns=['fnlwgt', 'class'])
X.head()

In [None]:
# Hold out 20% of the records; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load semantic distances from files

import os
import collections

categorical_attributes = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
# path = 'semdist/ontodist/'
path = 'semdist/embedist/'

# distances[attribute][a][b] is the entry at row `a`, column `b` of the matrix
# stored in '<path><attribute>.txt'.  The first line of each file lists the
# category names (comma separated); every following line is one matrix row.
distances = {}
for categorical_attribute in categorical_attributes:
    matrix_file = path + categorical_attribute + '.txt'
    with open(matrix_file) as f:
        contents = f.readlines()

    categories = [name.strip() for name in contents[0].split(',')]

    m = []
    for line in contents[1:]:
        m.append([float(cell.strip()) for cell in line.split(',')])

    # Rows and columns are both labelled with the header's category order.
    d = collections.defaultdict(dict)
    for i, row_category in enumerate(categories):
        for j, column_category in enumerate(categories):
            d[row_category][column_category] = m[i][j]
    distances[categorical_attribute] = d

In [None]:
def dist_attr(x, y, attr_pos, categorical_mask, column_names):
    """Distance between records ``x`` and ``y`` on a single attribute.

    Parameters
    ----------
    x, y : indexable records; only position ``attr_pos`` is read.
    attr_pos : position of the attribute to compare.
    categorical_mask : indexable of truthy/falsy flags, one per position;
        a truthy flag means the attribute is looked up in ``distances``.
    column_names : indexable of attribute names, one per position.

    Returns
    -------
    For a flagged (categorical) attribute, the entry
    ``distances[name][x value][y value]`` from the module-level table;
    otherwise the absolute difference of the two values converted to float.
    """
    # Numeric attribute: plain absolute difference.
    if not categorical_mask[attr_pos]:
        return abs(float(x[attr_pos]) - float(y[attr_pos]))

    # Categorical attribute: read the pre-loaded pairwise table.
    attribute = column_names[attr_pos]
    return distances[attribute][x[attr_pos]][y[attr_pos]]

def dist_record(x, y):
    """Total distance between two records: the sum of per-attribute distances.

    Parameters
    ----------
    x, y : indexable records holding the 13 attributes in the order listed in
        ``column_names`` below (positions 0..12).

    Returns
    -------
    float
        Sum over all 13 positions of ``dist_attr(x, y, position, ...)``.

    NOTE(review): the per-attribute terms are added here without any
    rescaling, so attributes with a wide numeric range may dominate the
    total - confirm this is intended.
    """
    # 1 marks a categorical attribute (table lookup), 0 a numeric one.
    categorical_mask = (0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1)
    column_names = (
        'age', 'workclass', 'education', 'education-num', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'capital-gain',
        'capital-loss', 'hours-per-week', 'native-country',
    )

    return float(sum(
        dist_attr(x, y, position, categorical_mask, column_names)
        for position in range(len(column_names))
    ))
    
def mean_record(D):
    """Build the representative ("mean") record of the rows in ``D``.

    Parameters
    ----------
    D : 2-D array indexed as ``D[:, position]``; columns 0..12 are read.
        Any further columns are ignored.

    Returns
    -------
    list
        13 entries, one per column position.  Numeric columns contribute
        ``float(np.mean(column))``; categorical columns contribute
        ``mean_semantic(column, attribute_name)``.
    """
    # Entry i describes column i: an attribute name means "categorical, use
    # mean_semantic"; None means "numeric, use the arithmetic mean".
    semantic_attribute = (
        None,              # 0  -> age
        'workclass',       # 1
        'education',       # 2
        None,              # 3  -> education-num
        'marital-status',  # 4
        'occupation',      # 5
        'relationship',    # 6
        'race',            # 7
        'sex',             # 8
        None,              # 9  -> capital-gain
        None,              # 10 -> capital-loss
        None,              # 11 -> hours-per-week
        'native-country',  # 12
    )

    centroid = []
    for position, attribute in enumerate(semantic_attribute):
        column = D[:, position]
        if attribute is None:
            centroid.append(float(np.mean(column)))
        else:
            centroid.append(mean_semantic(column, attribute))
    return centroid
    
def mean_semantic(values, attribute_name):
    """Pick the representative category among ``values`` for one attribute.

    Each entry ``v`` of ``values`` is scored by the sum of
    ``distances[attribute_name][c][v]`` over every category ``c`` in that
    attribute's table; the first entry with the lowest score is returned.

    Parameters
    ----------
    values : indexable sequence of category names (e.g. one array column).
    attribute_name : key into the module-level ``distances`` table.

    Returns
    -------
    The element of ``values`` with the smallest score.

    Raises
    ------
    KeyError if a value is not present in the attribute's table;
    ValueError (from ``np.argmin``) if ``values`` is empty.

    NOTE(review): the score of a value is its total distance to every
    category in the table, not to the other entries of ``values``, so how
    often a category occurs in ``values`` does not influence the result.
    Confirm this is the intended notion of a semantic mean.
    """
    table = distances[attribute_name]
    candidates = list(table.keys())

    # The score depends only on the category, so compute it once per distinct
    # value instead of once per row; the resulting score list is unchanged.
    score_of = {}
    scores = []
    for v in values:
        if v not in score_of:
            score_of[v] = sum([table[c][v] for c in candidates])
        scores.append(score_of[v])

    return values[np.argmin(scores)]

def dist(x, y):
    """Return ``np.linalg.norm(x - y)`` with NumPy's default norm settings.

    For 1-D inputs this is the Euclidean length of the difference vector.
    """
    # Alternative that was tried: scipy.spatial.distance.correlation(x, y)
    difference = x - y
    return np.linalg.norm(difference)

def poprow(arr, i):
    """Remove row ``i`` from ``arr`` without modifying ``arr``.

    Returns
    -------
    (remaining, removed)
        ``remaining`` is a new array stacking the rows before and after
        position ``i``; ``removed`` is ``arr[i]``.
    """
    removed = arr[i]
    rows_before = arr[:i]
    rows_after = arr[i + 1:]
    remaining = np.vstack((rows_before, rows_after))
    return remaining, removed

def cluster(X, p, k):
    """Form a group of ``k`` rows: ``p`` plus its ``k - 1`` nearest rows in ``X``.

    Parameters
    ----------
    X : 2-D array of candidate rows; the last column of each row is the
        label and is excluded from the distance computation.
    p : seed row with the same layout as the rows of ``X`` (label last).
    k : target group size.

    Returns
    -------
    (remaining, (xc, yc))
        ``remaining`` holds the rows of ``X`` that were not selected, ordered
        by increasing ``dist_record`` distance to ``p``.  ``xc`` is a 2-D
        array with the features of ``p`` followed by the selected rows, and
        ``yc`` the matching labels.
    """
    seed_features = p[:-1]

    # Append the distance to the seed as a temporary last column, sort the
    # rows by it, then drop it again.
    gaps = [dist_record(row[:-1], seed_features) for row in X]
    ranked = np.column_stack((X, gaps))
    ranked = ranked[ranked[:, -1].argsort()]
    ranked = np.delete(ranked, -1, 1)

    members = [p]
    members.extend(ranked[:k - 1])
    remaining = ranked[k - 1:]

    # No copy=False here: building an array from a list always needs a copy,
    # and NumPy >= 2.0 raises ValueError when copy=False cannot be honoured.
    xc = np.array([row[:-1] for row in members], ndmin=2)
    yc = np.array([row[-1] for row in members])
    return remaining, (xc, yc)
    
def _as_cluster(rows):
    """Split rows whose last column is the label into ``(features, labels)``."""
    # No copy=False here: building an array from a list always needs a copy,
    # and NumPy >= 2.0 raises ValueError when copy=False cannot be honoured.
    xc = np.array([row[:-1] for row in rows], ndmin=2)
    yc = np.array([row[-1] for row in rows])
    return xc, yc


def _pop_furthest_from_centroid(D):
    """Remove and return the row of ``D`` furthest from ``mean_record(D)``."""
    xm = mean_record(D)
    xri = np.argmax([dist_record(v[:-1], xm) for v in D])
    return poprow(D, xri)


def mdav(X, y, k):
    """Partition the records into groups of at least ``k`` rows.

    While at least ``3 * k`` rows remain, two groups of ``k`` are carved out
    per round: one around the row furthest from the current mean record, and
    one around the row furthest from that row.  If between ``2 * k`` and
    ``3 * k - 1`` rows are then left, one more group of ``k`` is formed around
    the row furthest from the mean record.  Whatever remains becomes the
    final group.

    Parameters
    ----------
    X : 2-D table of features (anything ``np.column_stack`` accepts).
    y : labels, one per row of ``X``; stacked as the last column internally.
    k : group size, must be at least 1.

    Returns
    -------
    (clusters, centroids)
        ``clusters`` is a list of ``(xc, yc)`` pairs (feature rows, labels).
        ``centroids`` is ``np.array`` over ``mean_record(xc)`` of each cluster.
        NOTE(review): those records mix floats and strings, so NumPy coerces
        them to one common dtype - check the dtype before doing arithmetic
        on the centroids.

    Raises
    ------
    ValueError if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    D = np.column_stack((X, y))
    clusters = []

    while len(D) >= 3 * k:
        # Furthest from centroid
        D, xr = _pop_furthest_from_centroid(D)
        # Furthest from furthest from centroid
        xsi = np.argmax([dist_record(v[:-1], xr[:-1]) for v in D])
        D, xs = poprow(D, xsi)

        # cluster of xr
        D, c = cluster(D, xr, k)
        clusters.append(c)
        # cluster of xs
        D, c = cluster(D, xs, k)
        clusters.append(c)

    # The loop leaves fewer than 3 * k rows; with at least 2 * k of them there
    # is room for one more full group before the remainder.
    if len(D) >= 2 * k:
        D, xr = _pop_furthest_from_centroid(D)
        D, c = cluster(D, xr, k)
        clusters.append(c)

    # rest of points (skipped when nothing is left, i.e. for empty input)
    if len(D) > 0:
        clusters.append(_as_cluster(D))

    centroids = np.array([mean_record(c[0]) for c in clusters])

    return clusters, centroids


In [None]:
# Group sizes to evaluate; one MDAV partition is computed per entry.
K = [5]
clusterings, centroids_of_clusterings = [], []
for k in K:
    # After the loop, `clustering` and `centroids` hold the result for the
    # last k in K.
    clustering, centroids = mdav(X, y, k)
    clusterings += [clustering]
    centroids_of_clusterings += [centroids]

In [None]:
# Look at every 10th cluster and record the positions of those whose labels
# are not all equal to the cluster's first label.
idxs = []
for i in range(0, len(centroids), 10):
    cluster_labels = clustering[i][1]
    if np.any(cluster_labels != cluster_labels[0]):
        idxs.append(i)