# Import Libraries

In [1]:
import re
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import spacy
import multiprocessing
from spacy.tokenizer import Tokenizer

In [5]:
courses = pd.read_excel('Courses.xlsx')
courses['Outcome Description'] = courses['jobFamily'] + " " \
                                + courses['Marketing Name'] + " " \
                                + courses['courseName'] + " " \
                                + courses ['moduleName'] + " " \
                                + courses['courseDesc'] + " " \
                                + courses['Outcome Description'] + " " \
                                + courses['competencyUnitDesc']
courses = courses[['productId', 'Marketing Name', 'Outcome Description', 'jobFamily', 'competencyLevel']]
courses

Unnamed: 0,productId,Marketing Name,Outcome Description,jobFamily,competencyLevel
0,1,Introduction to ERP Systems (SAP),Software Design & Development Introduction to ...,Software Design & Development,4 - Specialist Level
1,101,Express ERP - HCM,Software Design & Development Express ERP - HC...,Software Design & Development,4 - Specialist Level
2,102,Express ERP - SD,Software Design & Development Express ERP - SD...,Software Design & Development,4 - Specialist Level
3,103,Express ERP - FI,Software Design & Development Express ERP - FI...,Software Design & Development,4 - Specialist Level
4,104,Express ERP - MM,Software Design & Development Express ERP - MM...,Software Design & Development,4 - Specialist Level
...,...,...,...,...,...
1175,16002,Artificial Intelligence,Infrastructure Architecture Artificial Intell...,Infrastructure Architecture,4 - Specialist Level
1176,16003,New Hire-IT System Administrator,Cloud Computing New Hire-IT System Administrat...,Cloud Computing,4 - Specialist Level
1177,16004,New Hire and Redeployment – Infocomm Sales and...,Sales and Marketing New Hire and Redeployment ...,Sales and Marketing,4 - Specialist Level
1178,16005,Digital Fluency for All​,Strategy and Architecture Digital Fluency for ...,Strategy and Architecture,4 - Specialist Level


In [6]:
multiprocessing.cpu_count()

8

# Data Preprocessing

In [7]:
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab)

def preprocess(txt):
    txt = txt.lower()
    txt = re.sub("'", "", txt) # Remove apostrophe
    txt = re.sub("’", "", txt) # Remove apostrophe
    txt = re.sub('<.*?>', '', txt)
    txt = re.sub('::', ' ', txt)
    txt = re.sub('&nbsp', '', txt)
    txt = re.sub('\\t|\\n', ' ', txt)
    txt = re.sub('[^a-zA-Z]', ' ', txt)
    txt = re.sub(' +', ' ', txt)
    return txt

def tagcol_paragraph_embeddings_features(train_data):

    # Expects a dataframe with a 'Outcome Description' column
    train_data_values = train_data['Outcome Description'].values
    
    # Remember to use token.text to get the raw string, otherwise doc2vec cannot build vocabulary
    columns = [TaggedDocument([token.text for token in nlp(text) if token is not token.is_stop] , [i]) for i, text in enumerate(train_data_values)]
    
    return columns

In [12]:
courses['Outcome Description'] = courses['Outcome Description'].astype(str)
courses['Outcome Description'] = courses['Outcome Description'].apply(preprocess)

# Train Doc2Vec Model

In [15]:
corpus = tagcol_paragraph_embeddings_features(courses)

In [16]:
model = Doc2Vec(dm=0, vector_size=50, workers=multiprocessing.cpu_count(), min_count=2, epochs=100, hs=1, negative=0)

In [17]:
model.build_vocab(corpus)

In [18]:
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
vector = model.infer_vector(
    ['marketing',
     'microsoft office',
     'microsoft word',
     'negotiation',
     'planning',
     'process improvement',
     'procurement',
     'project management',
     'purchasing',
     'quality assurance',
     'quality control'])

In [20]:
res = model.dv.most_similar([vector], topn=10)
res

[(756, 0.4596649408340454),
 (822, 0.4580027163028717),
 (57, 0.45430517196655273),
 (244, 0.45420798659324646),
 (731, 0.42308980226516724),
 (115, 0.41906335949897766),
 (94, 0.4159999489784241),
 (124, 0.41428330540657043),
 (62, 0.4098603427410126),
 (730, 0.40189942717552185)]

In [22]:
course_unique = set()
course_list = []
for i, prob in res:
    if courses.loc[i, 'Marketing Name'] not in course_unique:
        course_unique.add(courses.loc[i, 'Marketing Name'])
        course_list.append((courses.loc[i, 'productId'], courses.loc[i, 'Marketing Name'], courses.loc[i, 'competencyLevel']))


In [23]:
course_list

[(8801, 'Omni Channel Commerce(OCC)', '3 - Entrant Level'),
 (10110, 'Omnicom Sales & Marketing  ', '3 - Entrant Level'),
 (318, 'ERP Application Functional (SAP MM)', '4 - Specialist Level'),
 (7507, 'Intermediate SAP - Materials Management', '4 - Specialist Level'),
 (502, 'Materials Management (SAP MM)', '4 - Specialist Level'),
 (409, 'Express ERP - MM ', '4 - Specialist Level'),
 (7506, 'Basic SAP - Materials Management', '4 - Specialist Level')]