In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
import re
import nltk 
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [3]:
import spacy
import textacy

In [4]:
dataset = pd.read_csv('HR100.txt', delimiter = '\t', quoting = 3)
df = dataset.drop(['CurrDept', 'Count'], axis=1)

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
corpus = []

for i in range(0, df.shape[0]):
    individual_corpus = []
    details = re.sub('[^a-zA-Z0-9.,]', ' ', dataset['KPIs'][i])
    details = details.lower()
    details = details.split()
    details.append(" ")
    #new_stopwords = ['class', '10th', '12th']
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    #all_stopwords.extend(new_stopwords)
    all_stopwords.remove('not')
    details = [word for word in details if not word in set(all_stopwords)]
    details = ' '.join(details)
    individual_corpus.append(details)
    #tored_corpus.append(individual_corpus)
    #print(individual_corpus)
    corpus.append(details)
    corpus.append(details)

In [7]:
text = ''.join(corpus)
doc = nlp(text)
sentences = list(doc.sents)

In [8]:
verbs = [token for token in doc if token.pos_ == "VERB"]
list_of_verbs = [x.text for x in verbs]

In [9]:
adj = [token for token in doc if token.pos_ == "ADJ"]
list_of_adj = [x.text for x in adj]

In [10]:
adv = [token for token in doc if token.pos_ == "ADV"]
list_of_adv = [x.text for x in adv]

In [11]:
surplus_words = list_of_adj + list_of_adv + list_of_verbs

In [12]:
def preprocessing(mess):
    nopunc = [char.lower() for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    details = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    details = [word for word in details if not word in set(surplus_words)]
    details = ' '.join(details)
    fin = []
    fin.append(details)
    fin = list(set(fin))
    return fin
    

In [13]:
df['KPIs'].apply(preprocessing)

0                 [vss cisco catalyst switches routers]
1                 [assessments security maturity level]
2     [business analtytics reporting productivity bu...
3            [internet intranet applications platforms]
4             [job placement accomodations restriction]
                            ...                        
95                    [security technologies like siem]
96    [stability client server architecture networks...
97    [analysis data extraction machine analysis bus...
98                               [custom searches apis]
99                  [security events siem tools alerts]
Name: KPIs, Length: 100, dtype: object

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
import tensorflow as tf

In [16]:
ann = tf.keras.models.Sequential()

In [17]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [18]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [19]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [20]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [21]:
cv = CountVectorizer()
Z = []
for i in range(0, df.shape[0]):
    s = ''.join(preprocessing(df['KPIs'][i]))
    Z.append(s)
X = cv.fit_transform(Z)
y = df['InPreferredDept']

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.30,random_state=52)

In [23]:
X_train = X_train.toarray()
type(X_test)

scipy.sparse.csr.csr_matrix

In [24]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='loss', mode='max', verbose=1, patience=15)

In [25]:
ann.fit(X_train, y_train, batch_size = 16, epochs = 50, verbose = 1, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 16: early stopping


<keras.callbacks.History at 0x2445cc357f0>

In [26]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)

def predict__class(text):
    fin_text = preprocessing(text)
    xo = cv.fit_transform(fin_text)
    fin_xo = convert_sparse_matrix_to_sparse_tensor(xo)
    ans = ann.predict(tf.sparse.reorder(fin_xo))*10
    return ans

X_test = convert_sparse_matrix_to_sparse_tensor(X_test)

In [39]:
y_pred = ann.predict(tf.sparse.reorder(X_test))
y_pred
y_pred = (y_pred > 0.46)



In [40]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[21  3]
 [ 2  4]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.89        24
           1       0.57      0.67      0.62         6

    accuracy                           0.83        30
   macro avg       0.74      0.77      0.75        30
weighted avg       0.84      0.83      0.84        30



In [41]:
def predict_class(text):
    fin_text = preprocessing(text)
    xo = cv.fit_transform(fin_text)
    fin_xo = convert_sparse_matrix_to_sparse_tensor(xo)
    ans = ann.predict(tf.sparse.reorder(fin_xo))
    return ans

In [44]:
text = "Represented company's technical security interests to OWASP, following best practices in information security. Reviewed violations of computer security procedures and developed mitigation plans."
ans = predict_class(text)



In [45]:
ans

array([[0.5951981]], dtype=float32)

In [49]:
(ans > 0.46)

array([[ True]])

In [46]:
text1 = "Built a customer attrition statistical model that improved customer retention for clients."
ans1 = predict_class(text1)



In [47]:
ans1

array([[0.44523966]], dtype=float32)

In [50]:
(ans1 > 0.46)

array([[False]])