# This is my job family prediction model

In [1]:
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role

# Static S3 locations used by this notebook.
bucket = 'testtestbill'  # Replace with your s3 bucket name
prefix = 'sagemaker/jobfamily'  # Used as part of the path in the bucket where you store data

# Execution role returned by the SageMaker SDK for this session.
role = get_execution_role()

# Region of the default boto3 session; it is interpolated into the bucket URL.
region = boto3.Session().region_name

# The URL to access the bucket
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

In [2]:
import pandas as pd
import numpy as np

# Access data from the lake and filter down to the target population

In [3]:
# Load the job dataset; later cells read its 'Position' (text) and 'JF' (label) columns.
df = pd.read_csv(filepath_or_buffer='jfoutput.csv')

In [5]:
# Encode each distinct 'JF' label as an integer code, numbered in order of first appearance.
df['category_id'] = pd.factorize(df['JF'])[0]

In [6]:
from io import StringIO
# One row per (label, code) pair, ordered by the integer code.
label_code_pairs = df[['JF', 'category_id']].drop_duplicates()
category_id_df = label_code_pairs.sort_values('category_id')

# Forward lookup: 'JF' label -> integer code.
category_to_id = dict(category_id_df.values)
# Reverse lookup: integer code -> 'JF' label (the mapping is one-to-one, so inverting is safe).
id_to_category = {code: label for label, code in category_to_id.items()}

# Train the model

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of the position titles. Terms that appear in
# fewer than 5 documents are dropped, and English stop words are removed.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# NOTE(review): .toarray() turns the sparse matrix into a dense array, which is
# large at the shape shown below. Confirm a dense array is really needed downstream.
features = tfidf.fit_transform(df['Position']).toarray()
labels = df['category_id']
features.shape

(53573, 3890)

In [8]:
from sklearn.feature_selection import chi2
import numpy as np

# For every job family, print the N unigrams and N bigrams whose TF-IDF columns
# have the highest chi-squared score against a one-vs-rest indicator of that family.
N = 2

# The vocabulary is the same for every category, so build it once outside the loop.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version provides it, and fall back otherwise.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.array(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

for JF, category_id in sorted(category_to_id.items()):
    # chi2 returns (scores, p-values); only the scores are used for ranking.
    features_chi2 = chi2(features, labels == category_id)
    # argsort is ascending, so the most correlated terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(JF))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

# 'AD_Administration':
  . Most correlated unigrams:
       . assistant
       . administrator
  . Most correlated bigrams:
       . executive assistant
       . vacation worker
# 'AS_Asset Strategy':
  . Most correlated unigrams:
       . strategic
       . asset
  . Most correlated bigrams:
       . practice lead
       . manager maintenance
# 'AU_Governance Audit & Assurance':
  . Most correlated unigrams:
       . internal
       . audit
  . Most correlated bigrams:
       . audit assurance
       . internal audit
# 'BA_Business Analysis':
  . Most correlated unigrams:
       . analysis
       . ba
  . Most correlated bigrams:
       . business analysis
       . advisor ba
# 'BD_Business Development':
  . Most correlated unigrams:
       . investor
       . ventures
  . Most correlated bigrams:
       . gm business
       . business development
# 'BI_Business Improvement':
  . Most correlated unigrams:
       . black
       . improvement
  . Most correlated bigrams:
       . black 

# 'LD_Learning & Development':
  . Most correlated unigrams:
       . trainer
       . training
  . Most correlated bigrams:
       . training officer
       . officer training
# 'LE_Legal':
  . Most correlated unigrams:
       . corporate
       . counsel
  . Most correlated bigrams:
       . chief counsel
       . corporate counsel
# 'LN_Land':
  . Most correlated unigrams:
       . permitting
       . approvals
  . Most correlated bigrams:
       . land mineral
       . mineral rights
# 'MC_Media & Communications':
  . Most correlated unigrams:
       . communications
       . translator
  . Most correlated bigrams:
       . advisor media
       . advisor communications
# 'ME_Mechanical':
  . Most correlated unigrams:
       . graduate
       . engineer
  . Most correlated bigrams:
       . mechanical engineer
       . engineer mechanical
# 'MG_Mine Geology':
  . Most correlated unigrams:
       . geology
       . geologist
  . Most correlated bigrams:
       . geologist underground

# 'SU_UG Surveying':
  . Most correlated unigrams:
       . survey
       . surveyor
  . Most correlated bigrams:
       . senior surveyor
       . technician surveyor
# 'TA_Talent':
  . Most correlated unigrams:
       . graduates
       . talent
  . Most correlated bigrams:
       . manager performance
       . adviser talent
# 'TL_Transportation & Logistics':
  . Most correlated unigrams:
       . locomotive
       . supply
  . Most correlated bigrams:
       . locomotive driver
       . officer supply
# 'TM_Tailings Management':
  . Most correlated unigrams:
       . dams
       . tailings
  . Most correlated bigrams:
       . tailings dams
       . engineer tailings
# 'TR_Treasury':
  . Most correlated unigrams:
       . treasury
       . structured
  . Most correlated bigrams:
       . global markets
       . corporate finance
# 'TX_Tax':
  . Most correlated unigrams:
       . indirect
       . tax
  . Most correlated bigrams:
       . advisor tax
       . tax risk
# 'VE_Ventilat

The train/test split is set up in the cell below.

For the SageMaker scikit-learn estimator, see this example notebook: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_iris/Scikit-learn%20Estimator%20Example%20With%20Batch%20Transform.ipynb



In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold out part of the raw titles / job-family labels (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df['Position'], df['JF'], random_state=0
)

# Two-stage text featurisation: raw token counts, then TF-IDF weighting.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Naive Bayes baseline fitted on the TF-IDF weighted training features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [10]:
# clf was fitted on TF-IDF weighted features, so a query has to go through the
# same two steps (counts -> TF-IDF) before predict. Passing raw counts gives the
# model inputs on a different scale from the one it was trained on.
print(clf.predict(tfidf_transformer.transform(count_vect.transform(["HRBP"]))))

['HO_HR Business Partnering']


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score


# Cross-validate each candidate model on the TF-IDF features and collect one
# (model name, fold index, accuracy) row per fold.
models = [
    LinearSVC(),
]
CV = 5  # number of cross-validation folds

# The original pre-allocated an empty cv_df here that was immediately overwritten
# below; that dead assignment has been removed.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])



In [12]:
import seaborn as sns
import matplotlib.pyplot as plt

# Per-model distribution of fold accuracies.
sns.boxplot(
    x='model_name',
    y='accuracy',
    data=cv_df,
)
# Overlay the individual fold scores on top of the boxes.
sns.stripplot(
    x='model_name',
    y='accuracy',
    data=cv_df,
    size=8,
    jitter=True,
    edgecolor="gray",
    linewidth=2,
)
plt.show()

<Figure size 640x480 with 1 Axes>

In [13]:
# Mean cross-validated accuracy per model.
cv_df.groupby('model_name')['accuracy'].mean()

model_name
LinearSVC    0.720458
Name: accuracy, dtype: float64

In [14]:
from sklearn.model_selection import train_test_split

# Fit a fresh LinearSVC on a 67/33 split of the TF-IDF features.
# This rebinds X_train / X_test / y_train / y_test, which earlier held the
# raw-text split used for the Naive Bayes baseline.
model = LinearSVC()

(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df.index,
    test_size=0.33,
    random_state=0,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [15]:
# Refit on every row (train and test). The coefficients inspected in the next
# cell and the model pickled at the end come from this fit, not the split one.
model.fit(X=features, y=labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [16]:
from sklearn.feature_selection import chi2

# For every job family, print the N unigrams and N bigrams with the largest
# LinearSVC coefficients in that family's row of model.coef_.
N = 2

# The vocabulary is the same for every category, so build it once outside the loop.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version provides it, and fall back otherwise.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.array(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

for JF, category_id in sorted(category_to_id.items()):
    # NOTE(review): this indexes coef_ by category_id directly, which assumes the
    # model's class order is exactly 0..K-1 — TODO confirm against model.classes_.
    indices = np.argsort(model.coef_[category_id])
    feature_names = vocabulary[indices]
    # argsort is ascending, so walk the ranking backwards to get the largest weights first.
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(JF))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

# 'AD_Administration':
  . Top unigrams:
       . secretary
       . receptionist
  . Top bigrams:
       . vacation worker
       . training support
# 'AS_Asset Strategy':
  . Top unigrams:
       . sme
       . growth
  . Top bigrams:
       . chief adv
       . practice lead
# 'AU_Governance Audit & Assurance':
  . Top unigrams:
       . audit
       . internal
  . Top bigrams:
       . audit assurance
       . internal audit
# 'BA_Business Analysis':
  . Top unigrams:
       . ba
       . economics
  . Top bigrams:
       . business analysis
       . business evaluation
# 'BD_Business Development':
  . Top unigrams:
       . ventures
       . bd
  . Top bigrams:
       . business development
       . gm business
# 'BI_Business Improvement':
  . Top unigrams:
       . lean
       . client
  . Top bigrams:
       . superintendent business
       . readiness lead
# 'BP_Business Strategy Planning':
  . Top unigrams:
       . ot
       . bus
  . Top bigrams:
       . business process
  

# 'PE_Project':
  . Top unigrams:
       . cmp
       . projets
  . Top bigrams:
       . coordinator construction
       . data systems
# 'PM_Project Management':
  . Top unigrams:
       . gtp
       . non
  . Top bigrams:
       . gm project
       . principal studies
# 'PO_Processing Operations':
  . Top unigrams:
       . controllers
       . aluminum
  . Top bigrams:
       . driver mobile
       . operator plant
# 'PP_Project Procurement':
  . Top unigrams:
       . contracts
       . proj
  . Top bigrams:
       . procurement contracts
       . manager contracts
# 'PS_Asset Planning & Scheduling':
  . Top unigrams:
       . planner
       . scheduler
  . Top bigrams:
       . master data
       . coordinator shutdown
# 'PS_Proc. Planning & Scheduling':
  . Top unigrams:
       . metal
       . blue
  . Top bigrams:
       . operations scheduler
       . production advisor
# 'PS_Procurement Services':
  . Top unigrams:
       . buy
       . p2p
  . Top bigrams:
       . mining m

In [17]:
import pickle

In [18]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))