In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_pickle("./responsibilities_df_2.pickle")
df.head()

Unnamed: 0,responsibilities,title
2908496770,:|•\tDesign and implement ML methods on propri...,Data Scientist
2911267267,:|• Provide consultative support as and when r...,Data Scientist
2912844894,:|Support Data and Analytics team through deve...,Data Scientist
2911205495,|Build agent-based simulations of smart contra...,Data Scientist
2912480226,"|Productionize, launch, and monitor predictive...",Data Scientist


In [4]:
df.shape

(4295, 2)

In [5]:
docs = df.responsibilities

In [6]:
vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=0.8)

In [7]:
doc_term = vec.fit_transform(docs)

In [8]:
doc_term

<4295x207797 sparse matrix of type '<class 'numpy.float64'>'
	with 932757 stored elements in Compressed Sparse Row format>

_The below way of displaying vectorisation results as they relate to a specific document was taken from course materials_

In [9]:
i = 200

doc = pd.Series(name=docs[i],
                data=doc_term.toarray()[i],
                index=vec.get_feature_names()) \
   .sort_values(ascending=False)

doc[:20]

learning optimization       0.169742
distributed computing       0.157927
optimization                0.133840
computing                   0.124378
distributed                 0.120052
development performs        0.115446
software contributes        0.115446
contributes product         0.115446
works big                   0.115446
platforms develops          0.112732
data distributed            0.112732
develops                    0.112142
analyses works              0.110438
solutions prepares          0.110438
interpretations results     0.108450
protocols quantitative      0.108450
engineering focus           0.108450
fpl                         0.108450
comprehensive documented    0.108450
documented observations     0.108450
Name: |This position is responsible for developing algorithms, modeling techniques, and optimization methods that support many aspects of NextEra and FPL business. Employees in this role use knowledge of machine learning, optimization, statistics, and applied 

_The function below was adapted from an answer on Stack Overflow_

In [10]:
def get_top_n_words(bag_of_words, n=20):
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [11]:
get_top_n_words(doc_term, 100)

[('business', 107.43669720918231),
 ('learning', 84.7229440790238),
 ('models', 79.36568134916759),
 ('team', 75.29335165509197),
 ('machine', 74.07024563712949),
 ('machine learning', 73.28802829713275),
 ('work', 70.90615675652647),
 ('science', 67.79368691466766),
 ('develop', 64.63237097276576),
 ('analysis', 64.30917162054838),
 ('product', 61.74604715970079),
 ('data science', 60.9549583635833),
 ('solutions', 58.23415889559621),
 ('analytics', 57.2781233688693),
 ('experience', 55.606531899700514),
 ('new', 53.75699709944742),
 ('design', 53.47915699870511),
 ('development', 52.441042942207176),
 ('build', 50.79381329230613),
 ('model', 50.58463840706466),
 ('teams', 48.89249166218554),
 ('technical', 48.88263231756333),
 ('insights', 48.7494983485417),
 ('support', 47.48230144884882),
 ('research', 46.86498740687121),
 ('statistical', 46.1328184533729),
 ('techniques', 43.36484183893956),
 ('ml', 43.317962393165665),
 ('tools', 42.28033388516265),
 ('engineering', 42.2152456818

In [12]:
doc_term.shape

(4295, 207797)

In [13]:
feature_names = vec.get_feature_names()
feature_names

['00',
 '00 actual',
 '000',
 '000 145',
 '000 155',
 '000 156',
 '000 160',
 '000 500',
 '000 57',
 '000 68',
 '000 90',
 '000 actual',
 '000 additional',
 '000 amazon',
 '000 annually',
 '000 asia',
 '000 begin',
 '000 bonus',
 '000 companies',
 '000 compensation',
 '000 computer',
 '000 data',
 '000 dependent',
 '000 depending',
 '000 employees',
 '000 end',
 '000 families',
 '000 homesite',
 '000 independent',
 '000 individuals',
 '000 nmg',
 '000 passionate',
 '000 patients',
 '000 people',
 '000 pre',
 '000 products',
 '000 professionals',
 '000 retail',
 '000 small',
 '000 software',
 '000 stores',
 '000 strong',
 '000 team',
 '000 technicians',
 '000 usd',
 '000 year',
 '00am',
 '00am 30pm',
 '01',
 '01 2022',
 '01 24',
 '0161',
 '0161 694',
 '02',
 '02 18',
 '04',
 '04 01',
 '04 15',
 '05',
 '05 06',
 '05 2021',
 '06',
 '06 17',
 '06 2022',
 '07',
 '07 01',
 '07 15',
 '07704',
 '07704 152',
 '09',
 '09 01',
 '10',
 '10 000',
 '10 10',
 '10 13',
 '10 15',
 '10 20',
 '10 2021',


In [14]:
nmf_5 = NMF(n_components=5)
doc_topic_5 = nmf_5.fit_transform(doc_term)

In [15]:
nmf_5.components_.shape

(5, 207797)

_The function below was adapted from a function in the course materials_

In [16]:
def get_top_terms(topic, n_terms, nmf=nmf_5, terms=feature_names):
    # get the topic components (i.e., term weights)
    components = nmf.components_[topic, :]
    # print(components.argsort()[-n_terms:])
    # print(components.argsort()[:-n_terms - 1:-1])

    # get term indices, sorted (descending) by topic weights
    top_term_indices = components.argsort()[:-n_terms - 1:-1]

    # use the `terms` array to get the actual top terms
    top_terms = np.array(terms)[top_term_indices]

    return top_terms.tolist()

In [17]:
topics_5 = [get_top_terms(i, 5) for i in range(5)]
topics_5

[['learning', 'machine', 'machine learning', 'ml', 'models'],
 ['business', 'analytics', 'analysis', 'science', 'insights'],
 ['life', 'balance', 'culture', 'culture inclusion', 'team'],
 ['business', 'supports', 'model', 'analytical', 'testing'],
 ['accuracy', 'model', 'monitor', 'analyze', 'outcomes develop']]

In [18]:
nmf_20 = NMF(n_components=20)
nmf_20.fit_transform(doc_term)
topics_20 = [get_top_terms(i, 5, nmf_20) for i in range(20)]
topics_20

[['learning', 'machine', 'machine learning', 'ml', 'ai'],
 ['business', 'analysis', 'analytics', 'support', 'statistical'],
 ['life', 'balance', 'culture', 'culture inclusion', 'career growth'],
 ['business', 'supports', 'model', 'analytical', 'testing'],
 ['accuracy', 'monitor', 'outcomes develop', 'outcomes', 'custom data'],
 ['product', 'teams', 'metrics', 'drive', 'insights'],
 ['marketing', 'connect', 'walmart', 'media mix', 'media'],
 ['cognitive', 'decision makers', 'makers', 'analytics cognitive', 'clients'],
 ['business partners', 'community', 'business', 'partners', 'objectives'],
 ['vaccine', 'chase', 'jpmorgan', 'jpmorgan chase', 'social distancing'],
 ['background',
  'ensure exceptional',
  'open ambiguous',
  'help minimum',
  'solving open'],
 ['structured unstructured',
  'information',
  'unstructured data',
  'unstructured',
  'structured'],
 ['technical', 'project', 'business', 'data science', 'science'],
 ['structuring downstream',
  'downstream processing',
  'des

In [19]:
nmf_10 = NMF(n_components=10)
nmf_10.fit_transform(doc_term)
topics_10 = [get_top_terms(i, 5, nmf_10) for i in range(10)]
topics_10

[['learning', 'machine', 'machine learning', 'ml', 'models'],
 ['business', 'analysis', 'science', 'data science', 'analytics'],
 ['life', 'balance', 'culture', 'culture inclusion', 'team'],
 ['business', 'supports', 'model', 'analytical', 'testing'],
 ['accuracy', 'model', 'monitor', 'analyze', 'outcomes develop'],
 ['product', 'business', 'teams', 'insights', 'drive'],
 ['marketing', 'connect', 'walmart', 'media mix', 'modeling'],
 ['clients', 'analytics', 'cognitive', 'decision', 'decision makers'],
 ['partners', 'business partners', 'external', 'community', 'business'],
 ['vaccine', 'chase', 'jpmorgan chase', 'jpmorgan', 'card firm']]

MVP Comment: I think the above topic modelling shows promise for the purpose of creating a job listing recommender. This modelling could do with some further iterating in order to optimise the utility of the topics. The question of how many topics would be best for the recommender remains an open one, and depends in part on the specifics of the recommender.