In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
import pickle

In [77]:
import warnings
warnings.filterwarnings('ignore')

In [78]:
with open('responsibilities.pickle', 'rb') as handle:
    responsibilities = pickle.load(handle)

In [79]:
vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2), token_pattern="\\b[a-z][a-z]+\\b", max_df=0.8)

In [80]:
doc_term = vec.fit_transform(responsibilities)

In [81]:
doc_term

<3829x162441 sparse matrix of type '<class 'numpy.float64'>'
	with 790636 stored elements in Compressed Sparse Row format>

_The below way of displaying vectorisation results as they relate to a specific document was taken from course materials_

In [82]:
i = 200

doc = pd.Series(name=responsibilities[i],
                data=doc_term.toarray()[i],
                index=vec.get_feature_names()) \
   .sort_values(ascending=False)

doc[:20]

making mentor            0.108676
analytic                 0.105266
programming access       0.104841
execute deep             0.101866
forage data              0.101866
diagnose data            0.101866
leader participate       0.101866
query perform            0.101866
access transform         0.101866
assumption forage        0.101866
extensive programming    0.101866
modeling lead            0.101866
error document           0.101866
transform prepare        0.101866
community prepare        0.101866
perform extensive        0.101866
forage                   0.101866
gap engage               0.101866
dive diagnostic          0.101866
inconsistency error      0.101866
Name:   develop complex query perform extensive programming access transform prepare datum statistical modeling lead execute deep dive diagnostic predictive prescriptive analytic support data drive business decision making mentor develop junior data scientist analyst identify diagnose data inconsistency error document 

_The function below was adapted from an answer on Stack Overflow_

In [83]:
def get_top_n_words(bag_of_words, n=20):
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [84]:
get_top_n_words(doc_term, 20)

[('data', 154.86847985694112),
 ('datum', 119.17217813610506),
 ('model', 102.75457574131033),
 ('business', 102.41453171460711),
 ('team', 93.45905592651582),
 ('work', 81.75520471365722),
 ('learning', 81.31332713884308),
 ('develop', 75.11564441290079),
 ('machine', 72.1533437507641),
 ('machine learning', 70.92838613092705),
 ('product', 70.87290302980456),
 ('analysis', 69.7824886735079),
 ('solution', 62.37997201198718),
 ('build', 61.83343405928745),
 ('experience', 59.477981546610344),
 ('analytic', 56.74883599866434),
 ('design', 55.759993256474786),
 ('science', 54.565372821317276),
 ('customer', 54.387273179623165),
 ('support', 53.632936252233655)]

In [85]:
doc_term.shape

(3829, 162441)

In [86]:
feature_names = vec.get_feature_names()

In [87]:
nmf_5 = NMF(n_components=5)
doc_topic_5 = nmf_5.fit_transform(doc_term)

In [88]:
nmf_5.components_.shape

(5, 162441)

_The function below was adapted from a function in the course materials_

In [89]:
def get_top_terms(topic, n_terms, nmf=nmf_5, terms=feature_names):
    # get the topic components (i.e., term weights)
    components = nmf.components_[topic, :]
    # print(components.argsort()[-n_terms:])
    # print(components.argsort()[:-n_terms - 1:-1])

    # get term indices, sorted (descending) by topic weights
    top_term_indices = components.argsort()[:-n_terms - 1:-1]

    # use the `terms` array to get the actual top terms
    top_terms = np.array(terms)[top_term_indices]

    return top_terms.tolist()

In [90]:
topics_5 = [get_top_terms(i, 5) for i in range(5)]
topics_5

[['data', 'datum', 'business', 'analysis', 'analytic'],
 ['learning', 'machine', 'machine learning', 'model', 'ml'],
 ['life', 'balance', 'culture', 'culture inclusion', 'employee'],
 ['business', 'model', 'datum', 'analytical model', 'support'],
 ['model', 'accuracy', 'outcome develop', 'data', 'monitor']]

In [105]:
nmf_20 = NMF(n_components=20)
nmf_20.fit_transform(doc_term)

['business project technical statistical analytical solution analytic problem business partner business problem',
 'learning machine machine learning ml model engineer learning model ai build pipeline',
 'life balance culture culture inclusion career growth employee inclusion career team member',
 'business model datum analytical model support example testing problem technique appropriate',
 'outcome develop accuracy model monitor outcome data tool monitor effectiveness accuracy gathering technique accuracy new',
 'product drive team metric decision insight inform define strategy product team',
 'data science data science team scientist data scientist solution work business project',
 'experience ability skill computer degree field strong language work knowledge',
 'marketing connect term modeling mix brand partner optimization build enhance media mix',
 'clinical research datum health patient development study analysis support scientific',
 'client consulting service help client profe

In [None]:
topics_20 = [' '.join(get_top_terms(i, 10, nmf_20)) for i in range(20)]
topics_20

In [94]:
nmf_10 = NMF(n_components=10)
doc_topic_10 = nmf_10.fit_transform(doc_term)

In [120]:
[print(get_top_terms(i, 9, nmf_10)) for i in range(10)]

['datum', 'analysis', 'data', 'statistical', 'business', 'model', 'develop', 'analytic', 'process']
['learning', 'machine', 'machine learning', 'model', 'ml', 'engineer', 'learning model', 'ai', 'build']
['life', 'balance', 'culture', 'culture inclusion', 'career growth', 'employee', 'inclusion', 'career', 'team']
['business', 'model', 'analytical model', 'datum', 'support', 'problem', 'example', 'testing', 'business problem']
['accuracy', 'outcome develop', 'model', 'data', 'monitor', 'outcome', 'tool monitor', 'effectiveness accuracy', 'gathering technique']
['product', 'drive', 'team', 'metric', 'decision', 'insight', 'business', 'build', 'strategy']
['data', 'science', 'data science', 'business', 'team', 'solution', 'project', 'data scientist', 'scientist']
['experience', 'ability', 'skill', 'computer', 'degree', 'strong', 'work', 'field', 'language']
['marketing', 'connect', 'term', 'modeling', 'mix', 'brand', 'partner', 'optimization', 'build enhance']
['client', 'consulting', 's

[None, None, None, None, None, None, None, None, None, None]

In [106]:
topics_10 = [' '.join(get_top_terms(i, 10, nmf_10)) for i in range(10)]
topics_10

['datum analysis data statistical business model develop analytic process support',
 'learning machine machine learning model ml engineer learning model ai build pipeline',
 'life balance culture culture inclusion career growth employee inclusion career team member',
 'business model analytical model datum support problem example testing business problem technique',
 'accuracy outcome develop model data monitor outcome tool monitor effectiveness accuracy gathering technique accuracy new',
 'product drive team metric decision insight business build strategy define',
 'data science data science business team solution project data scientist scientist technical',
 'experience ability skill computer degree strong work field language knowledge',
 'marketing connect term modeling mix brand partner optimization build enhance media mix',
 'client consulting service help client professional technology consultant market mission help']

In [108]:
doc_topic_10

array([[0.03485058, 0.03592652, 0.00092143, ..., 0.00053528, 0.        ,
        0.        ],
       [0.03391614, 0.        , 0.        , ..., 0.0487905 , 0.00414699,
        0.03039746],
       [0.01914835, 0.00250696, 0.00767719, ..., 0.00694049, 0.        ,
        0.00545855],
       ...,
       [0.0440359 , 0.        , 0.        , ..., 0.        , 0.00997153,
        0.07137434],
       [0.0440359 , 0.        , 0.        , ..., 0.        , 0.00997153,
        0.07137434],
       [0.        , 0.08136391, 0.00413426, ..., 0.00884508, 0.00159724,
        0.00394441]])

In [113]:
doc_topic_new_5 = np.delete(doc_topic_10, [3,6,7,8,9], 1)
doc_topic_new_5

array([[0.03485058, 0.03592652, 0.00092143, 0.        , 0.00472712],
       [0.03391614, 0.        , 0.        , 0.00122443, 0.00982022],
       [0.01914835, 0.00250696, 0.00767719, 0.        , 0.        ],
       ...,
       [0.0440359 , 0.        , 0.        , 0.        , 0.        ],
       [0.0440359 , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.08136391, 0.00413426, 0.        , 0.02907686]])

In [115]:
doc_topic_new_5.max(axis=0)

array([0.12677527, 0.16401473, 0.39026006, 0.44803592, 0.25063744])

In [114]:
import pickle

with open('doc_topic_new_5.pickle', 'wb') as handle:
   pickle.dump(doc_topic_new_5, handle)

In [None]:
type(doc_topic_5)

In [None]:
df_doc_topic = pd.DataFrame(data=doc_topic_5, index=df.index)

In [None]:
df_doc_topic.describe()

In [None]:
doc_topic_5[0]

In [None]:
target = doc_topic_5[3].copy()

In [None]:
target[0] = 10

In [None]:

myvals = np.dot(doc_topic_5, target)
max_index = np.argmax(myvals)

In [None]:
max_index

In [None]:
max_similarity = myvals[max_index]
max_similarity

In [None]:
from scipy.spatial import distance

In [None]:
target

In [None]:
# def index_closest(target, doc_topic):
#     distances = distance.cdist([target], doc_topic, "cosine")[0]
#     return np.argmin(distances)

In [None]:
def index_closest(target, doc_topic):
    return distance.cdist([target], doc_topic, "cosine")[0]


In [None]:
distances = index_closest(target, doc_topic_5)

In [None]:
np.argmin(distances)

In [None]:
distances.argsort()[:5]

In [None]:
index_closest([100,100,0,0,0], doc_topic_5)

In [None]:
distances.argsort()[:3]

In [None]:
type(df_doc_topic.iloc[3138])

In [None]:
df_doc_topic.iloc[1]

In [None]:
df.iloc[1012]['responsibilities']