In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

In [3]:
# Load the tab-separated HR file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the text fields are kept as ordinary characters.
dataset = pd.read_csv('HR100.txt', sep='\t', quoting=3)

In [4]:
# Preview the raw data; head() shows the first five rows by default.
dataset.head()

Unnamed: 0,Name,KPIs,InPreferredDept,CurrDept,Count
0,Tarial,Performed VSS on CISCO catalyst switches and r...,0,Network,1
1,Anastasia,Conducts organizational assessments to priorit...,1,Security,1
2,Regis,Designed business analtytics and reporting tha...,0,Data,1
3,Brent,Developed and implemented complex Internet and...,0,WebDev,1
4,Saini,Provided job placement accomodations for emply...,0,HR,1


In [5]:
# Work on a copy without CurrDept and Count; `dataset` itself is left intact.
df = dataset.drop(columns=['CurrDept', 'Count'])

In [6]:
# Confirm the two columns are gone; head() shows the first five rows.
df.head()

Unnamed: 0,Name,KPIs,InPreferredDept
0,Tarial,Performed VSS on CISCO catalyst switches and r...,0
1,Anastasia,Conducts organizational assessments to priorit...,1
2,Regis,Designed business analtytics and reporting tha...,0
3,Brent,Developed and implemented complex Internet and...,0
4,Saini,Provided job placement accomodations for emply...,0


In [7]:
import re
import nltk 
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
# Display the NLTK English stop-word list that text_process filters against.
# NOTE(review): this presumably needs the NLTK 'stopwords' corpus to be
# installed locally (nltk.download('stopwords')) -- confirm on a fresh kernel.
stopwords.words('english')


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
def text_process(mess):
    """Tokenize a text into lower-cased, stemmed, non-stop-word tokens.

    Steps, in order:
      1. Delete every character listed in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Drop words whose lower-cased form is an NLTK English stop word.
      4. Porter-stem the lower-cased form of each surviving word.

    Parameters
    ----------
    mess : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates are kept).
    """
    ps = PorterStemmer()
    # Build the stop-word set once per call. The previous version evaluated
    # stopwords.words('english') inside the comprehension condition, i.e. it
    # rebuilt and linearly scanned the whole list for every single word.
    stop_words = set(stopwords.words('english'))
    # Strip all punctuation characters in one pass.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))
    # Lower-case each word once, then filter stop words and stem the rest.
    lowered = (word.lower() for word in nopunc.split())
    return [ps.stem(word) for word in lowered if word not in stop_words]

In [18]:
# Sanity-check the tokenizer on the first five KPI texts (head() default).
df['KPIs'].head().apply(text_process)

0      [perform, vss, cisco, catalyst, switch, router]
1    [conduct, organiz, assess, priorit, secur, mat...
2    [design, busi, analtyt, report, doubl, product...
3    [develop, implement, complex, internet, intran...
4    [provid, job, placement, accomod, emply, medic...
Name: KPIs, dtype: object

In [19]:
# The preview above did not assign back, so df still holds the raw KPI text.
df.head()

Unnamed: 0,Name,KPIs,InPreferredDept
0,Tarial,Performed VSS on CISCO catalyst switches and r...,0
1,Anastasia,Conducts organizational assessments to priorit...,1
2,Regis,Designed business analtytics and reporting tha...,0
3,Brent,Developed and implemented complex Internet and...,0
4,Saini,Provided job placement accomodations for emply...,0


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Bag-of-words vectorizer that delegates all tokenization to text_process.
bow_transformer = CountVectorizer(analyzer=text_process)
# fit() learns the vocabulary from the KPI column and returns the same object.
bow_transformer.fit(df['KPIs'])

# Number of distinct tokens in the learned vocabulary.
print(len(bow_transformer.vocabulary_))

392


In [22]:
# Encode every KPI text as token counts over the vocabulary fitted above.
KPI_bow = bow_transformer.transform(df['KPIs'])

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the inverse-document-frequency weights from the count matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(KPI_bow)

In [24]:
# Re-weight the raw counts with the fitted TF-IDF transformer.
KPI_tfidf = tfidf_transformer.transform(KPI_bow)

In [25]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes trained on all TF-IDF rows, with InPreferredDept
# as the target label.
KPI_model = MultinomialNB()
KPI_model.fit(KPI_tfidf, df['InPreferredDept'])

In [26]:
# NOTE: KPI_tfidf is the very matrix KPI_model was fitted on, so these are
# in-sample predictions, not predictions on unseen data.
all_predictions = KPI_model.predict(KPI_tfidf)
print(all_predictions)

[0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1
 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1
 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1]


In [27]:
from sklearn.metrics import classification_report, confusion_matrix
# NOTE: all_predictions were computed on the training rows, so the scores
# below measure how well the model fits its own training data, not how it
# generalizes. Hold out a test split before trusting these numbers.
print(confusion_matrix(df['InPreferredDept'], all_predictions))
print(classification_report(df['InPreferredDept'], all_predictions))

[[78  0]
 [ 0 22]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        22

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [28]:
# Free-text KPI description to classify with the trained model.
pred_kpi = "Conducts organizational assessments to prioritize security maturity level. Conducting malware analysis of attacker tooks by providing indicatiors for defensive measures."

In [29]:
# transform() expects a collection of documents, hence the one-element list.
bow_pred_kpi = bow_transformer.transform([pred_kpi])


In [30]:
# Apply the same TF-IDF weighting that was fitted on the training counts.
tfidf_pred_kpi = tfidf_transformer.transform(bow_pred_kpi)


In [31]:
# Only one document was transformed, so [0] is its predicted label.
KPI_model.predict(tfidf_pred_kpi)[0]

1

In [34]:
# Score another KPI text with the same three steps used for pred_kpi:
# bag-of-words counts -> TF-IDF weights -> naive Bayes prediction.
pred_kpi1 = "Monitored organization's networks for security breaches and investigated violations."
bow_pred_kpi1 = bow_transformer.transform([pred_kpi1])
tfidf_pred_kpi1 = tfidf_transformer.transform(bow_pred_kpi1)
# Single input document, so [0] is its predicted label.
print(KPI_model.predict(tfidf_pred_kpi1)[0])

1


In [36]:
# Third sample KPI, scored with the already-fitted transformers and model:
# bag-of-words counts -> TF-IDF weights -> naive Bayes prediction.
pred_kpi2 = "Built a customer attrition statistical model that improved customer retention for clients."
bow_pred_kpi2 = bow_transformer.transform([pred_kpi2])
tfidf_pred_kpi2 = tfidf_transformer.transform(bow_pred_kpi2)
# Single input document, so [0] is its predicted label.
print(KPI_model.predict(tfidf_pred_kpi2)[0])

0
