In [1]:
import configparser
import gensim 
from gensim.models import Word2Vec
import logging
from os import listdir
from gensim.models.phrases import Phrases, Phraser

## Detecting similar job skills using word2vec
We have a file with thousands of software engineer and similar job descriptions. We'd like to figure out from these job descriptions what job skills are related to each other. We'd like a job searcher with a given skill to be able to answer the questions:
- What other skills are hiring managers looking for, in addition to the one that I have?
- What kinds of jobs can I apply for? Are there job descriptions that say they want skill X, but the skill I have is a good enough substitute for that?

Being able to answer these questions will help a candidate leverage their time building skills better, and apply to a wider range of jobs.

My implementation is in word2vec and divides the job descriptions into sentences based on "lines".
The listings have had product and company names replaced with random letter strings, to anonymize them. 

In [2]:
input_file = '20180619.txt'
with open(input_file, 'r') as f:
    pre_vocabulary = f.read()

In [3]:
token_lists = []
lines = pre_vocabulary.split("\n")
for line in lines:
    sentence_list = gensim.utils.simple_preprocess(line, deacc=False, min_len=2, max_len=30)
    token_lists.append(sentence_list)

In [4]:
model = gensim.models.Word2Vec(token_lists, iter=10, min_count=1, size=300, workers=4)
model.save('word2vec.model')
vocab_size = len(model.wv.vocab)

In [5]:
similar_phrase = 'python'
similar_vec = similar_phrase.split()
model.wv.most_similar(positive=similar_vec, topn=10)

[('scala', 0.7532507181167603),
 ('ag_zr_pd_js_av_og_rc_remote', 0.7324556112289429),
 ('perl', 0.7139312028884888),
 ('ruby', 0.6984028816223145),
 ('php', 0.6938693523406982),
 ('django', 0.6879407167434692),
 ('golang', 0.6731946468353271),
 ('clojure', 0.6319807767868042),
 ('nodejs', 0.6215530037879944),
 ('java', 0.6183958649635315)]

#### Is "similar" good enough?
Word2vec comes up with mostly sensible results for "similar" skills, but it is only giving us skills that are _similar_. That could mean these skills are used similarly in job descriptions -- but it doesn't necessarily mean that a skill that counts as "similar" to the given one are always required together in the same job description, or that it could act as a substitute. So there are a few improvements we'd want to make -- we want to be able to tell
- which skills are often asked for together ("you must know skill X AND skill Y")
- which skills act as substitutes for eachother ("you have experience with skill A OR skill B")
- whether a skill is a "required"/"minimum qualification" skill, or just a bonus (which we won't get to in this notebook).
- whether the above is different based on seniority (which we also won't get to).

## Detecting skill combinations or substitutions

#### Just parsing lines
The simplest way to look for skills that asked for in combination is just to look for places in job descriptions that have `skill X` and `and` relatively close to each other. Same with skills that are substitutions -- we want `skill X` and `or` to be close to each other. Here, we can approximate that by looking for the two words (skill and connector) on the same line.

In [6]:
key_skill = 'python'

In [7]:
and_lines = []
or_lines = []

for line in token_lists:
    if key_skill in line and 'and' in line:
        and_lines.append(line)
    elif key_skill in line and 'or' in line:
        or_lines.append(line)
    else:
       continue      

In [8]:
and_lines_flattened = [item for sublist in and_lines for item in sublist]

stop_words = ["a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such", "any", "other", "similar",
"that", "the", "their", "then", "there", "these", "looking", "working",
"they", "this", "to", "was", "will", "with", "we", "you", "our", "like", 
              "using", "from", "have", "work", "years", "experience", key_skill]

and_lines_flattened = [x for x in and_lines_flattened if x not in stop_words]

from collections import Counter
most_common = list(Counter(and_lines_flattened).most_common(10))
for (i,x) in most_common:
    print(i, ':', round(x * 100/len(and_lines)),'%')

development : 30 %
java : 29 %
software : 28 %
javascript : 21 %
languages : 19 %
web : 19 %
team : 18 %
developer : 16 %
django : 15 %
programming : 15 %


Above, we can see that the results also look reasonable, returning skills and general experience areas that look related. This helps answer the question,"if I know X, what else do I need to know?"

In [9]:
or_lines_flattened = [item for sublist in or_lines for item in sublist]
or_lines_flattened = [x for x in or_lines_flattened if x not in stop_words]
from collections import Counter
Counter(or_lines_flattened).most_common(10)
most_common = list(Counter(or_lines_flattened).most_common(10))
for (i,x) in most_common:
    print(i, ':', round(x * 100/len(or_lines)),'%')

java : 42 %
ruby : 30 %
languages : 27 %
perl : 24 %
scripting : 21 %
programming : 17 %
one : 17 %
language : 16 %
php : 14 %
javascript : 14 %


The results for this also look reasonable -- maybe a description for a job asks for Python but will accept Java or Ruby instead. We'd probably expect this to be the case more with junior positions than with senior ones, but we don't have a way of telling that at the moment.

#### Using Phrases and bigrams

In [10]:
key_skill = 'rails'

In [11]:
bigram = Phrases()
bigram.add_vocab(token_lists)

In [12]:
bigram_counter = Counter()
for key in bigram.vocab.keys():
    split_key = str(key, 'utf-8').split("_")
    if len(split_key) > 1:
        if str(split_key[0]) not in stop_words and str(split_key[1]) not in stop_words:
            bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(10000):
    if key_skill in str(key):
        print('{0: <20} {1}'.format(str(key), counts))

b'rails_developer'   62
b'rails_developers'  25
b'rails_javascript'  20
b'ruby_rails'        17
b'rails_engineer'    17
b'groovy_grails'     14
b'rails_software'    12
b'rails_development' 9
b'rails_react'       8


#### Back to word2vec
As another way of trying to detect clustered skills, I returned to word2vec. I started with a given skill, and scrubbed input lines mentioning that skill to just the few words before and after the skill, to try to tighten the similarities.

In [13]:
key_skill = 'python'

In [14]:
token_lists_scrubbed = []
n_words = 15
idces = []

linesx = pre_vocabulary.split("\n")
for linex in linesx:
    sentence_list_scrubbed = gensim.utils.simple_preprocess(linex, deacc=False, min_len=2, max_len=30)
    try:
        idx = sentence_list_scrubbed.index(key_skill)
        idces.append(idx)
        start_idx = max(0, idx - n_words)
        end_idx = min(idx + n_words, len(sentence_list_scrubbed) - 1)
        sentence_list_scrubbed = sentence_list_scrubbed[start_idx:end_idx]
    except ValueError:
        pass
    token_lists_scrubbed.append(sentence_list_scrubbed)
    
model2 = gensim.models.Word2Vec(token_lists_scrubbed, iter=10, min_count=1, size=300, workers=4)

In [15]:
model2.wv.most_similar(positive=[key_skill], topn=10)

[('perl', 0.7397568225860596),
 ('scala', 0.7276171445846558),
 ('php', 0.7149808406829834),
 ('ruby', 0.7096331119537354),
 ('django', 0.7038027048110962),
 ('rails', 0.6401203274726868),
 ('golang', 0.6296257972717285),
 ('clojure', 0.6255342364311218),
 ('nodejs', 0.621862530708313),
 ('java', 0.6135293245315552)]

This seems to result in slightly more tools as opposed to general experience areas, which might make it more useful for a job seeker, but it loses the combination / substitution capacity of the above methods. 