In [955]:
import numpy as np
import pandas as pd
import seaborn as sns
from helpers.helper import *

%matplotlib inline

# Load Employment Data

In [956]:
fp = '../../../data/databases/db15/Skills.txt'
X = load_skills(fp).replace(np.nan, 0.0)
X['O*NET-SOC Code'] = X['O*NET-SOC Code'].apply(lambda x: str(x[:-3]))

# delete duplicate data
X = X.drop_duplicates(subset = 'O*NET-SOC Code')

numbers = pd.read_csv('../../../data/education/employment_numbers.csv')
numbers = numbers.rename(columns = {'code': "O*NET-SOC Code"})

def clean_number(element):
    return float(str(element).replace(',',''))

numbers.e2014 = numbers.e2014.apply(clean_number)

X = X.merge(numbers, on = 'O*NET-SOC Code', how = 'inner')

# Divide codes from data

#### With option to find high-employment subsets

In [957]:
def get_data(X, subset = False, threshold = 0.00666666):
    if subset:
        total_emp = 150539.90
        subset_X = X[X.e2014 / total_emp >= threshold]
        X = subset_X
    codes, weights, title, X = X['O*NET-SOC Code'], X['e2014'], X.title, X.drop(['O*NET-SOC Code', 'e2014', 'title', 'Unnamed: 3'], axis = 1)
    return codes, weights, title, X

codes, weights, titles, X_matrix = get_data(X, subset = False)

# Add Weights

In [958]:
dists = sp.distance.squareform(sp.distance.pdist(X_matrix))

In [959]:
dists

array([[  0.        ,   6.68261925,   5.52246322, ...,  15.22675277,
         17.14086929,  13.11334435],
       [  6.68261925,   0.        ,   3.10396521, ...,   9.4963572 ,
         11.43110668,   7.75711286],
       [  5.52246322,   3.10396521,   0.        , ...,  10.86476875,
         12.75235664,   9.32074031],
       ..., 
       [ 15.22675277,   9.4963572 ,  10.86476875, ...,   0.        ,
          3.60469139,   4.47915171],
       [ 17.14086929,  11.43110668,  12.75235664, ...,   3.60469139,
          0.        ,   6.03778105],
       [ 13.11334435,   7.75711286,   9.32074031, ...,   4.47915171,
          6.03778105,   0.        ]])

# Find Groupings

In [960]:
import Pycluster as pc
dm = pc.distancematrix(np.array(X_matrix), dist = 'e')
solution, score, something = pc.kmedoids(dm, nclusters = 30, npass = 10)
medoids = codes[solution].unique()
groupings = numbers[numbers['O*NET-SOC Code'].isin(medoids)]

# KMedoids Algorithm – Github Code

In [962]:
import numpy as np
import random

def cluster(distances, k=3):

    m = distances.shape[0] # number of points

    # Pick k random medoids.
    curr_medoids = np.array([-1]*k)
    while not len(np.unique(curr_medoids)) == k:
        curr_medoids = np.array([random.randint(0, m - 1) for _ in range(k)])
    old_medoids = np.array([-1]*k) # Doesn't matter what we initialize these to.
    new_medoids = np.array([-1]*k)
   
    # Until the medoids stop updating, do the following:
    while not ((old_medoids == curr_medoids).all()):
        # Assign each point to cluster with closest medoid.
        clusters = assign_points_to_clusters(curr_medoids, distances)

        # Update cluster medoids to be lowest cost point. 
        for curr_medoid in curr_medoids:
            cluster = np.where(clusters == curr_medoid)[0]
            new_medoids[curr_medoids == curr_medoid] = compute_new_medoid(cluster, distances)

        old_medoids[:] = curr_medoids[:]
        curr_medoids[:] = new_medoids[:]

    return clusters, curr_medoids

def assign_points_to_clusters(medoids, distances):
    distances_to_medoids = distances[:,medoids]
    clusters = medoids[np.argmin(distances_to_medoids, axis=1)]
    clusters[medoids] = medoids
    return clusters

def compute_new_medoid(cluster, distances):
    mask = np.ones(distances.shape)
    mask[np.ix_(cluster,cluster)] = 0.
    cluster_distances = np.ma.masked_array(data=distances, mask=mask, fill_value=10e9)
    costs = cluster_distances.sum(axis=1)
    return costs.argmin(axis=0, fill_value=10e9)

In [1016]:
class Point():
    def __init__(self, x, weight = None, code = None):
        self.x = x
        self.weight = weight
        self.cluster = None
        self.code = code

class Cluster():
    def __init__(self, medoid, label):
        self.medoid = medoid
        self.label = label
        self.points = []
        self.distances = []

class KMedoids():
    def __init__(self, N, transform = None, max_iters = float('inf'), threshold_change = -float('inf'), print_iter = True):
        self.N = N # number of clusters
        self.M = [] # current medoids
        self.O = [] # current non-medoids
        self.clusters = []
        self.updated_clusters = []
        self.X = None
        self.weights = None
        self.n_points = None
        self.transform = lambda x: x
        self.max_iters = max_iters
        self.threshold_change = threshold_change
        self.num_changes = float('inf')
        self.print_iter = print_iter
        self.distsum = []

    def fit(self, X, weights, codes):
        self.__init__(self.N, self.transform, self.max_iters, self.threshold_change)
        # check data type
        if isinstance(X, np.ndarray):
            self.X = X
        else:
            self.X = np.array(X)

        self.n_points = self.X.shape[0]
        self.weights = np.array(weights)
        self.codes = np.array(codes)
        self._initialize_X()
        self._initialize_medoids()
        changed = True
        iter_num = 0
        while changed and iter_num < self.max_iters and self.num_changes > self.threshold_change:
            iter_num += 1
            if self.print_iter:
                print iter_num
            self._reset_clusters()
            self._assign_points()
            changed = self._update_medoids()
            self._create_self_O()
            self._update_costs(iter_num)
        return self.clusters

    def _initialize_X(self):
        for point, weight, code in zip(self.X, self.weights, self.codes):
            self.O.append(Point(point, weight, code))

    def _initialize_medoids(self):
        indices = np.random.randint(0, self.n_points, self.N)
        for i in indices:
            self.clusters.append(Cluster(self.O[i], i))
        self.O = np.delete(self.O, indices, axis = 0)

    def _reset_clusters(self):
        for cluster in self.clusters:
            cluster.points, cluster.distances = [], []

    def _assign_points(self):
        for point in self.O:
            
            target_distance = float('inf')
            optimal_cluster = None
            for i, cluster in enumerate(self.clusters):
                weighted_distance = self._calc_distance(point, cluster)
                if weighted_distance < target_distance:
                    target_distance = weighted_distance
                    optimal_cluster = i
            self.clusters[optimal_cluster].points.append(point)
            self.clusters[optimal_cluster].distances.append(target_distance)
            
    def _calc_distance(self, point, cluster):
        # calcs distances between point and cluster medioid, returns 
        def euclidean_distance(u, v):
            return np.sqrt(np.sum((u - v)**2))

        distance = euclidean_distance(point.x, cluster.medoid.x)
        weighted_distance = self.transform(point.weight) * distance

        return weighted_distance

    def _update_medoids(self):
        changed = False
        new_clusters = []
        total_changed = 0
        for cluster in self.clusters:
            optimal_cluster = cluster
            target_cost = sum(cluster.distances)
            for some_point in cluster.points:
                new_cluster = cluster
                new_cluster.points.append(new_cluster.medoid)
                new_cluster.points.remove(some_point)
                new_cluster.medoid = some_point
                total_cost = self._calc_cluster_cost(new_cluster.medoid, new_cluster)
                if total_cost < target_cost:
                    target_cost = total_cost
                    optimal_cluster = new_cluster
                    changed = True
                    total_changed += 1
            new_clusters.append(optimal_cluster)
        self.clusters = new_clusters
        self.num_changes = total_changed
        if not changed:
            return False
        else:
            return True
        
    def _create_self_O(self):
        self.O = []
        for cluster in self.clusters:
            self.O += cluster.points
            
    def _update_costs(self, iter_num):
        self.distsum.append((iter_num, sum(map(lambda x: sum(x.distances), self.clusters))))

    def _calc_cluster_cost(self, medoid, cluster):
        return sum(map(lambda x: self._calc_distance(x, cluster), cluster.points))

In [1017]:
def test_km():
    ncols = 3
    M = np.random.rand(100,ncols)
    X, y, w = M[:,:ncols-1], M[:,ncols-2:-1], M[:,ncols-1:]
    km = KMedoids(6, max_iters = 10, threshold_change = 1)
    clusters = km.fit(X, y, w)
    return clusters
    
c = test_km()

1
2
3
4
5
6
7
8
9
10


In [1020]:
def agg_points(clusters, plot = False):
    all_points = np.empty((1,2))
    colours = []
    for c in clusters:
        for point in c.points:
            all_points = np.vstack((all_points, point.x[np.newaxis, :]))
        all_points = np.vstack((all_points, c.medoid.x[np.newaxis,:]))
        colours += [c.label for i in range(len(c.points) + 1)]


    x, y, c = all_points[1:,0], all_points[1:,1], np.array(colours)
    if plot:
        
        def colourmap_ints(num_list, ordered = False, convert = False):
            uniques = np.unique(num_list)
            if ordered:
                colours = {str(v): i for (i, v) in enumerate(uniques)}
                if convert:
                    return [colours[str(c)] for c in num_list]
            else:
                colours = np.random.permutation(range(len(uniques)))

            return colours
        
        n_points = len(x)
        n_clusters = len(np.unique(c))
        
        colour_ints = colourmap_ints(c, True, True)
        hot = plt.get_cmap('hot')
        cNorm  = colors.Normalize(vmin=0, vmax=n_clusters)
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
        colours = [scalarMap.to_rgba(i) for i in colour_ints]
        
        # setup the plot
        fig, ax = plt.subplots(1,1, figsize=(10,10))
        ax.scatter(x,y,s=15, c = colours)

        
    else:
        return x, y, c

In [1129]:
fp = '../../../data/databases/db15/Skills.txt'
Skills = load_skills(fp)
Skills['O*NET-SOC Code'] = Skills['O*NET-SOC Code'].apply(lambda x: str(x)[:-3])
Skills = Skills.replace(np.nan, 0)
Skills = Skills.drop_duplicates(subset = 'O*NET-SOC Code')

In [1130]:
emp_figs = pd.read_csv('../../../data/education/employment_numbers.csv').iloc[:,:-1]
emp_figs.columns = ['O*NET-SOC Code', 'e2014', 'c']
Skills = Skills.merge(emp_figs[['O*NET-SOC Code', 'e2014']], on = 'O*NET-SOC Code', how = 'inner')
Skills.e2014 = Skills.e2014.apply(lambda x: float(str(x).replace(',','')))

In [1067]:
150000*0.10 / 30

500.0

In [1134]:
subset_Skills = Skills[Skills.e2014 > 300]
subset_Skills.shape

(117, 37)

In [1135]:
weights = subset_Skills.pop('e2014')
ones_weights = np.ones_like(weights)
codes = subset_Skills.pop('O*NET-SOC Code')

In [1136]:
km = KMedoids(30, transform = lambda x: 1/x, max_iters = 4000, threshold_change = 5)

In [1137]:
clusters = km.fit(np.array(subset_Skills), ones_weights, codes)

1
2
3
4
5
6
7
8
9
10
11
12
13


In [1138]:
x, y = np.array(km.distsum)[:,0], np.array(km.distsum)[:,1]

In [1139]:
codes = [c.medoid.code for c in clusters]

In [1143]:
Skills[Skills['O*NET-SOC Code'].isin(codes)].e2014

0       343.4
32      985.6
36      300.8
61      303.2
68      328.6
69      718.4
170     386.6
292     347.2
338    1492.1
355     327.3
363     680.0
368    1095.4
374     890.1
380     873.9
382    3159.7
391    2360.6
413     597.2
438     853.5
462    2581.8
482     304.6
522    1159.1
568     447.1
608    1374.7
655     397.9
702     496.6
720     419.2
732     497.3
733     445.3
763     695.4
Name: e2014, dtype: float64

In [1145]:
for i in emp_figs[emp_figs['O*NET-SOC Code'].isin(codes)].c:
    print i, '\n'

Chief executives 

Managers, all other 

Purchasing agents, except wholesale, retail, and farm products 

Loan officers 

Computer programmers 

Software developers, applications 

Social and human service assistants 

Physicians and surgeons, all other 

Nursing assistants 

Firefighters 

Police and sheriff's patrol officers 

Security guards 

First-line supervisors of food preparation and serving workers 

Food preparation workers 

Combined food preparation and serving workers, including fast food 

Janitors and cleaners, except maids and housekeeping cleaners 

Hairdressers, hairstylists, and cosmetologists 

Sales representatives, services, all other 

Customer service representatives 

Production, planning, and expediting clerks 

Construction laborers 

First-line supervisors of mechanics, installers, and repairers 

Maintenance and repair workers, general 

Welders, cutters, solderers, and brazers 

Inspectors, testers, sorters, samplers, and weighers 

Helpers--production wo