# Coarse Clustering
Change the value in `skills[skills['cluster'] == 34]` to see what do clusters look like (range 0-34)

In [41]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text 
import nltk.stem
global english_stemmer 

In [42]:
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):

    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [43]:
skills = pd.read_csv('01-candidate_features.tsv')

In [44]:
skills.keys()

Index(['feature'], dtype='object')

In [45]:
skills
#vectorized[0].todense()

Unnamed: 0,feature
0,ability to
1,knowledge of
2,communication skills
3,of experience
4,able to
5,years of experience
6,on experience
7,hands on experience
8,and experience
9,with experience


In [46]:
stop_words = ['skill','skills','perform','deliver','ability','avail','experience','demonstrate'] 
#stop_words = ['available','availability']               
stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
vectorizer = StemmedCountVectorizer(min_df=10, max_df=0.5,stop_words = stop_words)
vectorized = vectorizer.fit_transform(skills.feature)

In [47]:
num_clusters = 35
km = KMeans(n_clusters = num_clusters, init = 'k-means++', n_init = 1,verbose=1)
clustered = km.fit(vectorized)
predict = km.predict(vectorized)

Initialization complete
Iteration  0, inertia 12335.000
Iteration  1, inertia 11891.922
Iteration  2, inertia 11663.774
Iteration  3, inertia 11374.216
Iteration  4, inertia 11365.081
Iteration  5, inertia 11358.987
Iteration  6, inertia 11356.781
Iteration  7, inertia 11356.373
Converged at iteration 7: center shift 0.000000e+00 within tolerance 4.679250e-07


In [48]:
vectorizer.get_feature_names()

['3d',
 'abl',
 'academ',
 'acceler',
 'accept',
 'accommod',
 'achiev',
 'action',
 'adapt',
 'addit',
 'advanc',
 'ai',
 'algorithm',
 'altern',
 'analysi',
 'analyt',
 'analyz',
 'api',
 'appli',
 'applic',
 'approach',
 'architectur',
 'area',
 'assist',
 'attitud',
 'autom',
 'autonom',
 'avail',
 'aw',
 'award',
 'bachelor',
 'background',
 'base',
 'basic',
 'benefit',
 'bring',
 'bs',
 'build',
 'busi',
 'caff',
 'candid',
 'capabl',
 'certain',
 'challeng',
 'clear',
 'cloud',
 'code',
 'collabor',
 'comfort',
 'commerci',
 'communic',
 'compani',
 'compar',
 'complex',
 'compon',
 'comput',
 'concept',
 'concret',
 'connect',
 'continu',
 'contribut',
 'control',
 'core',
 'creat',
 'creativ',
 'critic',
 'cuda',
 'cultur',
 'current',
 'custom',
 'cv',
 'data',
 'debug',
 'deep',
 'degre',
 'deliv',
 'deliver',
 'deliveri',
 'demonstr',
 'depend',
 'deploy',
 'depth',
 'design',
 'desir',
 'desktop',
 'detect',
 'develop',
 'devic',
 'digit',
 'direct',
 'disabl',
 'distribu

In [49]:
vectorized.shape

(9629, 370)

In [50]:
len(skills.feature)

9629

In [51]:
skills['cluster'] = predict

In [52]:
skills['cluster'].unique()

array([ 1, 11, 24, 14, 29, 32,  4,  7, 12, 19, 10, 28, 26,  8, 31, 18,  9,
        2, 34, 20, 25, 22, 15, 13, 33,  5, 30,  3, 21, 17,  6, 27, 16,  0,
       23], dtype=int64)

In [53]:
skills.sort_values('cluster').count()

feature    9629
cluster    9629
dtype: int64

In [54]:
gg = skills.groupby('cluster')

In [55]:
import collections, re, operator
from collections import Counter
def BagofWords(textlist):
    bagsofwords = [ collections.Counter(re.findall(r'\w+', txt))
            for txt in textlist]
    words=Counter()
    for bbb in bagsofwords:
        for bb,v in bbb.items():  
            words[bb]+=v
    sorted_words = sorted(words.items(), key=operator.itemgetter(1),reverse=True)
    return sorted_words

In [56]:
from collections import defaultdict
d=defaultdict()
key=set(predict)
for k in key:
    d[k]=[]
for x,y in gg:
    d[x]+=list(y.feature.values)
    #print y.feature.values

In [57]:
print(d.keys())

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])


In [58]:
key=5
words = BagofWords(d[5])
print(len(d))
print(words)

35
[('knowledge', 30), ('apply', 12), ('applied', 9), ('applying', 9), ('of', 7), ('in', 6), ('this', 3), ('their', 3), ('ois', 3), ('your', 3), ('fundamental', 3), ('python', 3), ('career', 1), ('to', 1), ('geometry', 1), ('clouds', 1), ('machine', 1), ('etc', 1), ('learning', 1), ('on', 1), ('software', 1), ('from', 1), ('deep', 1), ('as', 1), ('instructions', 1), ('following', 1), ('creatively', 1)]


In [59]:
#skills.sort_values('cluster',ascending=False).to_csv("./candidate_features_cluster.tsv", encoding='utf8', index=False, sep='\t')

In [98]:
skills[skills['cluster'] == 34]

Unnamed: 0,feature,cluster
107,applicable programming,34
138,applications experience,34
163,with applicable,34
170,applicable federal state,34
189,applicable law,34
191,applicable federal,34
199,one applicable programming,34
201,one applicable,34
230,under applicable,34
272,applicable federal state or,34


In [99]:
cluster_type={}

In [100]:
cluster_type[0] = 'skills'
cluster_type[1] = 'experience'
cluster_type[2] = 'task'
cluster_type[3] = 'knowledge'
cluster_type[4] = 'experience'
cluster_type[5] = 'skills'
cluster_type[6] = 'education'
cluster_type[7] = 'knowledge'
cluster_type[8] = 'knowledge'
cluster_type[9] = 'ability'
cluster_type[10] = 'delivery'
cluster_type[11] = 'knowledge'
cluster_type[12] = 'code'
cluster_type[13] = 'knowledge'
cluster_type[14] = 'other skills'
cluster_type[15] = 'education'
cluster_type[16] = 'education'
cluster_type[17] = 'education'
cluster_type[18] = 'education'
cluster_type[19] = 'skills'
cluster_type[20] = 'applicable'
cluster_type[21] = 'availability'
cluster_type[22] = 'skills'
cluster_type[23] = 'other'
cluster_type[24] = 'skills'
cluster_type[25] = 'knowledge'
cluster_type[26] = 'other'
cluster_type[27] = 'education'
cluster_type[28] = 'work experience'
cluster_type[29] = 'skills'
cluster_type[30] = 'education'
cluster_type[31] = 'other'
cluster_type[32] = 'other'
cluster_type[33] = 'requirement'
cluster_type[34] = 'previous experience'

In [101]:
CT = pd.DataFrame.from_dict(cluster_type,orient='index')

In [102]:
CT = CT.reset_index()

In [103]:
CT.columns = ['cluster','type']

In [104]:
CT

Unnamed: 0,cluster,type
0,0,skills
1,1,experience
2,2,task
3,3,knowledge
4,4,experience
5,5,skills
6,6,education
7,7,knowledge
8,8,knowledge
9,9,ability


In [105]:
new_skills = skills.merge(CT)

In [106]:
new_skills.sort_values('cluster',ascending=False).to_csv("./02-candidate_features_cluster.tsv", encoding='utf8', index=False, sep='\t')

In [107]:
new_skills.groupby('type').count()

Unnamed: 0_level_0,feature,cluster
type,Unnamed: 1_level_1,Unnamed: 2_level_1
ability,128,128
applicable,86,86
availability,42,42
code,136,136
delivery,94,94
education,525,525
experience,4636,4636
knowledge,2253,2253
other,320,320
other skills,52,52


In [67]:
skill_df = new_skills[new_skills['type']=='skills']

In [74]:
def Clustering(df,num_clusters,stop_words):
                 
    stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
    vectorizer = StemmedCountVectorizer(min_df=10, max_df=0.3,stop_words = stop_words)
    vectorized = vectorizer.fit_transform(df.feature)
    
    km = KMeans(n_clusters = num_clusters, init = 'k-means++', n_init = 1,verbose=1)
    clustered = km.fit(vectorized)
    predict = km.predict(vectorized)
    return predict
    

In [75]:
stop_words = ['skills','basic','required','desired','desirable']  
num_clusters = 10
cl_num = Clustering(skill_df, num_clusters,stop_words)

Initialization complete
Iteration  0, inertia 6713.000
Iteration  1, inertia 6587.388
Iteration  2, inertia 6397.741
Iteration  3, inertia 6182.928
Iteration  4, inertia 6180.572
Iteration  5, inertia 6175.451
Iteration  6, inertia 6175.347
Converged at iteration 6: center shift 0.000000e+00 within tolerance 5.947484e-07


In [76]:
cl_num

array([9, 0, 0, ..., 0, 0, 0])

In [77]:
set(cl_num)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [78]:
skill_df['cluster_2'] = cl_num

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [79]:
skill_df

Unnamed: 0,feature,cluster,type,cluster_2
0,ability to,0,skills,9
1,communication skills,0,skills,0
2,of experience,0,skills,0
3,on experience,0,skills,0
4,hands on experience,0,skills,0
5,and experience,0,skills,0
6,with experience,0,skills,0
7,programming skills,0,skills,0
8,qualifications experience,0,skills,0
9,industry experience,0,skills,0


In [80]:
skill_df[skill_df['cluster_2'] == 0]

Unnamed: 0,feature,cluster,type,cluster_2
1,communication skills,0,skills,0
2,of experience,0,skills,0
3,on experience,0,skills,0
4,hands on experience,0,skills,0
5,and experience,0,skills,0
6,with experience,0,skills,0
7,programming skills,0,skills,0
8,qualifications experience,0,skills,0
9,industry experience,0,skills,0
10,vision experience,0,skills,0


In [81]:
def get_features(gg,predict):
    d=defaultdict()
    key=set(predict)
    for k in key:
        d[k]=[]
    for x,y in gg:
        d[x]+=list(y.feature.values)
    return d

In [82]:
gg = skill_df.groupby('cluster_2')
d = get_features(gg,predict)

for key in range(num_clusters):
    words = BagofWords(d[key])
#print len(d[key])
    print(words[0],words[1],len(d[key]))
#cluster 3 is huge & undetermined.

('experience', 1821) ('skills', 558) 3365
('demonstrated', 116) ('demonstrable', 79) 262
('knowledge', 1211) ('of', 412) 1245
('engineering', 34) ('experience', 27) 49
('data', 91) ('experience', 42) 91
('analysis', 32) ('perform', 16) 32
('comfortable', 138) ('with', 42) 138
('capable', 77) ('of', 42) 77
('deliver', 171) ('delivering', 65) 264
('ability', 262) ('to', 228) 265


In [83]:
print(len(skill_df))

5788


In [84]:
cl_num_new = Clustering(skill_df[skill_df['cluster_2']==0],num_clusters,stop_words)

Initialization complete
Iteration  0, inertia 3385.000
Iteration  1, inertia 3311.534
Iteration  2, inertia 3273.502
Iteration  3, inertia 3164.282
Iteration  4, inertia 3163.245
Converged at iteration 4: center shift 0.000000e+00 within tolerance 6.796753e-07


In [85]:
skill_df['cluster_3_type'] = ['unknown']*len(skill_df)

#skill_df[skill_df['cluster_2']==3]['cluster_3']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [86]:
gg = skill_df.groupby('cluster_2')

from collections import defaultdict
def get_features(gg,predict):
    d=defaultdict()
    key=set(predict)
    for k in key:
        d[k]=[]
    for x,y in gg:
        d[x]+=list(y.feature.values)
    return d

In [87]:
skill_df.keys()

Index(['feature', 'cluster', 'type', 'cluster_2', 'cluster_3_type'], dtype='object')

In [88]:
skill_df

Unnamed: 0,feature,cluster,type,cluster_2,cluster_3_type
0,ability to,0,skills,9,unknown
1,communication skills,0,skills,0,unknown
2,of experience,0,skills,0,unknown
3,on experience,0,skills,0,unknown
4,hands on experience,0,skills,0,unknown
5,and experience,0,skills,0,unknown
6,with experience,0,skills,0,unknown
7,programming skills,0,skills,0,unknown
8,qualifications experience,0,skills,0,unknown
9,industry experience,0,skills,0,unknown
