In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy as scp

import nltk
from nltk.corpus import wordnet as wn
from itertools import product

from sklearn.model_selection import train_test_split
import pickle

%matplotlib inline

# Column names for adult.data, in file order (the file itself is read without a header row).
headers = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'class',
]

# Fields are separated by ", " and '?' is read as a missing value;
# every record containing a missing value is discarded.
adult = pd.read_csv('adult/adult.data', sep=', ', names=headers,
                    na_values='?', engine='python').dropna()

# Labels as a NumPy array; features are every column except 'fnlwgt' and 'class',
# re-indexed from 0 after the rows dropped above.
y = adult['class'].to_numpy()
X = adult.drop(columns=['fnlwgt', 'class']).reset_index(drop=True)

# 80/20 split with a fixed seed so the saved files are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output locations inside the adult directory
file_X_train = "adult/X_train_nc.csv"
file_X_test = "adult/X_test_nc.csv"
file_y_train = "adult/y_train_nc.pkl"
file_y_test = "adult/y_test_nc.pkl"

# Features go to CSV (without the index column), labels are pickled.
X_train.to_csv(file_X_train, index=False)
X_test.to_csv(file_X_test, index=False)
for pkl_path, labels in ((file_y_train, y_train), (file_y_test, y_test)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(labels, f)

In [7]:
# Load semantic distances from files
import os
import collections
categorical_attributes = ['workclass', 'education', 'marital-status', 'occupation',
                          'relationship','race','sex','native-country']
# path = 'semdist/ontodist/'
path = 'semdist/embedist/'

# distances[attribute][category_a][category_b] -> value read from the attribute's matrix file.
# Each file is expected to hold a comma-separated header line with the category names,
# followed by one comma-separated row of numbers per category (same order as the header).
distances = {}
for categorical_attribute in categorical_attributes:
    with open(path+categorical_attribute+'.txt') as f:
        lines = f.readlines()

    categories = [label.strip() for label in lines[0].split(',')]
    matrix = [[float(cell.strip()) for cell in row.split(',')] for row in lines[1:]]

    table = collections.defaultdict(dict)
    for row_pos, row_label in enumerate(categories):
        for col_pos, col_label in enumerate(categories):
            table[row_label][col_label] = matrix[row_pos][col_pos]
    distances[categorical_attribute] = table

In [8]:
def dist_attr(x, y, attr_pos, categorical_mask, column_names):
    """Distance between records ``x`` and ``y`` on the single attribute at ``attr_pos``.

    Parameters
    ----------
    x, y : indexable records
    attr_pos : position of the attribute inside both records
    categorical_mask : per-position flags; a truthy entry marks a categorical attribute
    column_names : per-position attribute names, used to pick the lookup table

    Returns
    -------
    For a categorical attribute, the entry of the module-level ``distances`` table
    for this attribute and the two values. Otherwise the absolute difference of the
    two values after converting each to ``float``.
    """
    if not categorical_mask[attr_pos]:
        return abs(float(x[attr_pos]) - float(y[attr_pos]))

    attribute_table = distances[column_names[attr_pos]]
    return attribute_table[x[attr_pos]][y[attr_pos]]

def dist_record(x, y):
    """Total distance between two records: the sum of ``dist_attr`` over all 13 attributes.

    The attribute layout (names and which positions are categorical) is fixed here.
    Both records must be indexable at positions 0..12; anything past position 12 is ignored.
    The per-attribute distances are added up as-is (no rescaling) and returned as a ``float``.
    """
    is_categorical = [0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
    names = ['age','workclass','education','education-num','marital-status',
             'occupation','relationship','race','sex','capital-gain','capital-loss',
             'hours-per-week','native-country']

    total = sum(dist_attr(x, y, pos, is_categorical, names) for pos in range(len(names)))
    return float(total)
    
def mean_record(D):
    """Centroid of the records in ``D`` as a plain list with one entry per attribute.

    ``D`` is a 2-D array indexed as ``D[:, column]``; only columns 0..12 are read, so
    extra trailing columns are ignored. Numeric attributes contribute the arithmetic
    mean of their column (as ``float``); categorical attributes contribute whatever
    ``mean_semantic`` returns for their column.
    """
    # (column position, attribute name for mean_semantic or None for a numeric mean)
    layout = (
        (0, None),               # age
        (1, 'workclass'),
        (2, 'education'),
        (3, None),               # education-num
        (4, 'marital-status'),
        (5, 'occupation'),
        (6, 'relationship'),
        (7, 'race'),
        (8, 'sex'),
        (9, None),               # capital-gain
        (10, None),              # capital-loss
        (11, None),              # hours-per-week
        (12, 'native-country'),
    )

    centroid = []
    for position, semantic_name in layout:
        column = D[:, position]
        if semantic_name is None:
            centroid.append(float(np.mean(column)))
        else:
            centroid.append(mean_semantic(column, semantic_name))
    return centroid
    
def mean_semantic(values, attribute_name):
    """Pick the representative category for a column of categorical values.

    Every value ``v`` in ``values`` is scored with the sum of
    ``distances[attribute_name][c][v]`` over all categories ``c`` known for the
    attribute; the value with the smallest score is returned (the first one in
    ``values`` on ties).

    Parameters
    ----------
    values : indexable sequence of category labels (must be non-empty)
    attribute_name : key into the module-level ``distances`` tables

    Raises
    ------
    KeyError if a value has no entry in the attribute's table.
    ValueError if ``values`` is empty (from ``np.argmin``).

    NOTE(review): the score sums over every known category, not over the values
    in ``values``, so it does not depend on how often each value occurs in the
    column. Confirm this is the intended notion of "semantic mean".
    """
    table = distances[attribute_name]
    candidates = list(table.keys())

    # A value's score depends only on the value itself, so compute it once per
    # distinct value instead of once per row; columns repeat few distinct labels.
    score_of = {}
    scores = []
    for v in values:
        if v not in score_of:
            score_of[v] = sum([table[c][v] for c in candidates])
        scores.append(score_of[v])

    return values[np.argmin(scores)]

def dist(x,y):
    """Norm of ``x - y`` as computed by ``np.linalg.norm`` with its default settings."""
    offset = x - y
    return np.linalg.norm(offset)

def poprow(arr,i):
    """Remove row ``i`` from ``arr``.

    Returns ``(remaining, row)``: a new array stacking the rows before and after
    position ``i``, and the row that was taken out (``arr[i]``).
    """
    removed = arr[i]
    before, after = arr[:i], arr[i+1:]
    remaining = np.vstack((before, after))
    return remaining, removed

def cluster(X, p, k):
    """Form one cluster from ``p`` and the ``k - 1`` rows of ``X`` closest to it.

    ``p`` and every row of ``X`` hold the attributes followed by one trailing
    element (the label). Closeness is ``dist_record`` evaluated on everything
    except that trailing element.

    Parameters
    ----------
    X : 2-D array of candidate rows (``p`` itself must already be removed)
    p : the row the cluster is built around
    k : target cluster size, ``p`` included

    Returns
    -------
    (remaining, (xc, yc)) where ``remaining`` holds the rows of ``X`` not taken,
    ordered by increasing distance to ``p``; ``xc`` is a 2-D array with the
    attributes of the cluster members (``p`` first) and ``yc`` their labels.
    """
    # Attach each row's distance to p as a temporary last column, order the rows
    # by it, then drop the helper column again.
    gaps = [dist_record(row[:-1], p[:-1]) for row in X]
    ranked = np.column_stack((X, gaps))
    ranked = ranked[ranked[:, -1].argsort()]
    ranked = np.delete(ranked, -1, 1)

    members = [p]
    members.extend(ranked[:k-1])
    remaining = ranked[k-1:]

    # Building an array from a Python list always needs a copy. Under NumPy >= 2.0
    # `copy=False` means "never copy" and raises ValueError here, so it is omitted.
    xc = np.array([row[:-1] for row in members], ndmin=2)
    yc = np.array([row[-1] for row in members])
    return remaining, (xc, yc)
    
def _rows_to_cluster(rows):
    """Split rows (attributes followed by one trailing label) into an ``(xc, yc)`` pair."""
    # No `copy=False`: a list always has to be copied, and NumPy >= 2.0 raises
    # ValueError when `copy=False` cannot be honoured.
    xc = np.array([row[:-1] for row in rows], ndmin=2)
    yc = np.array([row[-1] for row in rows])
    return xc, yc


def mdav(X, y, k):
    """Partition the records into clusters of about ``k`` rows (MDAV-style microaggregation).

    While at least ``3*k`` rows remain: take the row furthest from the current
    centroid (``xr``), then the row furthest from ``xr`` (``xs``), and build a
    cluster of size ``k`` around each. If between ``2*k`` and ``3*k - 1`` rows are
    left, one more cluster is built around the row furthest from the centroid.
    Whatever remains afterwards becomes the final cluster.

    Parameters
    ----------
    X : 2-D table of attributes (anything ``np.column_stack`` accepts)
    y : labels, one per row of ``X``
    k : minimum cluster size, must be >= 1

    Returns
    -------
    (clusters, centroids): ``clusters`` is a list of ``(xc, yc)`` tuples
    (attribute rows and labels of each cluster); ``centroids`` is
    ``np.array`` over ``mean_record(xc)`` of each cluster, in the same order.
    Centroids mix floats and category labels, so NumPy chooses a common dtype
    for the array (presumably a string dtype; callers convert numeric columns back).
    For empty input both results are empty.

    Raises
    ------
    ValueError if ``k < 1``.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Keep the label as the last column so rows can be moved around as a unit.
    D = np.column_stack((X, y))
    clusters = []

    while len(D) >= 3*k:
        # Centroid
        xm = mean_record(D)
        # Furthest from centroid
        xri = np.argmax([dist_record(v[:-1], xm) for v in D])
        D, xr = poprow(D, xri)
        # Furthest from furthest from centroid
        xsi = np.argmax([dist_record(v[:-1], xr[:-1]) for v in D])
        D, xs = poprow(D, xsi)

        # cluster of xr
        D, cl = cluster(D, xr, k)
        clusters.append(cl)
        # cluster of xs
        D, cl = cluster(D, xs, k)
        clusters.append(cl)

    # The loop above leaves fewer than 3*k rows, so this is the 2*k .. 3*k-1 case.
    if len(D) >= 2*k:
        # Centroid
        xm = mean_record(D)
        # Furthest from centroid
        xri = np.argmax([dist_record(v[:-1], xm) for v in D])
        D, xr = poprow(D, xri)
        # cluster of xr
        D, cl = cluster(D, xr, k)
        clusters.append(cl)

    # Rest of points. Skipped when nothing is left (only possible for empty input),
    # because an empty cluster has no columns for mean_record to average.
    if len(D) > 0:
        clusters.append(_rows_to_cluster(D))

    centroids = np.array([mean_record(c[0]) for c in clusters])

    return clusters, centroids


In [9]:
# Cluster sizes to generate anonymized training sets for.
K = [3, 5, 10, 15, 20]
for k in K:
    print('k=', k)
    clustering, centroids = mdav(X_train, y_train, k)

    # Replace every record by the centroid of its cluster: one centroid row per
    # cluster member. np.vstack needs a real sequence; passing a generator is
    # deprecated and rejected by newer NumPy releases, hence the list.
    X_train_k = np.vstack([np.repeat(centroids[i].reshape(1, -1), len(c[0]), axis = 0)
                           for i, c in enumerate(clustering)])

    # Labels in the same cluster-by-cluster order as the rows above.
    y_train_k = np.hstack([c[1] for c in clustering])

    X_train_k = pd.DataFrame(X_train_k, columns = X_train.columns)
    # Numeric columns (mask 0) are converted back to numbers; values that cannot
    # be parsed become NaN.
    categorical_mask = [0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
    for mask, col in zip(categorical_mask, X_train_k.columns):
        if mask == 0:
            X_train_k[col] = pd.to_numeric(X_train_k[col], errors='coerce')

    # Specify the file names for saving
    file_X_train_k = "adult/X_train_nc_k={}.csv".format(k)
    file_y_train_k = "adult/y_train_nc_k={}.pkl".format(k)

    # Save the data to the adult directory
    X_train_k.to_csv(file_X_train_k, index=False)

    with open(file_y_train_k, 'wb') as f:
        pickle.dump(y_train_k, f)


k= 3


  X_train_k = np.vstack(np.repeat(centroids[i].reshape(1, -1), len(c[0]), axis = 0) for i, c in enumerate(clustering))


k= 5
k= 10
k= 15
k= 20
