Importing required packages for processing data

In [1]:
from os import listdir
from os.path import isfile, join
import json
import pandas as pd
import os

Navigating to input files directory

In [2]:
# Input locations, resolved against the directory the notebook is run from.
path = '/'.join((os.getcwd(), 'data'))
json_dir = '/'.join((path, 'docs'))

Reading and importing data containing job descriptions

In [3]:
# Collect the regular files in json_dir; sorted so the load order does not
# depend on the arbitrary order returned by listdir.
json_files = sorted(f for f in listdir(json_dir) if isfile(join(json_dir, f)))

# One [document id, description text] record per file.
input_data = []
for file_name in json_files:
    # Explicit UTF-8: the default text encoding of open() is platform-dependent.
    with open(join(json_dir, file_name), encoding='utf-8') as json_file:
        data = json.load(json_file)
    input_data.append([data["_id"], data["jd_information"]["description"]])

# Passing the column names to the constructor also works when no files were found.
json_data = pd.DataFrame(input_data, columns=['Document ID', 'JD'])
json_data['Document ID'] = json_data['Document ID'].astype('int64')
# NOTE(review): astype(str) turns a missing description (None) into the literal
# string 'None', which a later "!= ''" filter will not remove.
json_data['JD'] = json_data['JD'].astype(str)

Checking that text strings are processed correctly

In [4]:
# Preview the first five rows to eyeball the parsed IDs and description text.
json_data.head(n=5)

Unnamed: 0,Document ID,JD
0,8126421,Hiring 3D Designer for Exhibitions and Stall D...
1,8260214,
2,8136071,&nbsp;&nbsp;Intrested candidates can drop thei...
3,8337941,AL- HAMD CONSULTANT SERVICEBHAJANPURA NEW DELH...
4,8370930,


Importing file containing job classes

In [5]:
# Table mapping each Document ID to its Department label.
departments_csv = path + '/document_departments.csv'
depts = pd.read_csv(departments_csv)

Merging job descriptions to their respective classifications

In [6]:
# Left join keeps every job description; a document with no match in depts
# ends up with a missing (NaN) Department.
full_data = json_data.merge(depts, how='left', on='Document ID')

Viewing a snapshot of all data

In [7]:
# Preview the first five merged rows (ID, description, department).
full_data.head(n=5)

Unnamed: 0,Document ID,JD,Department
0,8126421,Hiring 3D Designer for Exhibitions and Stall D...,Marketing
1,8260214,,Sales
2,8136071,&nbsp;&nbsp;Intrested candidates can drop thei...,Ticketing
3,8337941,AL- HAMD CONSULTANT SERVICEBHAJANPURA NEW DELH...,IT
4,8370930,,Analytics


Cleaning data: removing data points that have an empty job description

In [8]:
# Keep only the rows whose description text is not the empty string.
has_description = full_data['JD'].ne('')
full_data = full_data[has_description]

Randomizing the row order of the dataset before separating it into test and train sets

In [9]:
# Shuffle all rows. The seed is fixed so that a fresh "Restart & Run All"
# produces the same row order, and therefore the same positional
# train/validation/test split and comparable metrics.
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

Importing TensorFlow/Keras, NumPy and NLTK, and downloading the NLTK tokenizer and stopword data

In [10]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import nltk
# Fetch the NLTK data used by the preprocessing cell below:
# 'punkt' is the tokenizer data for word_tokenize and 'stopwords' is the
# corpus read by stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

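Tokenizing the job descriptions: lowercasing, stripping punctuation, dropping non-alphabetic tokens and removing stopwords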

In [12]:
train_data = list(full_data['JD'].values)

# Loop-invariant helpers: built once here instead of once per document.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))


def clean_tokens(text):
    """Turn one job description into a list of cleaned word tokens.

    Steps, in order: split into tokens, lowercase, delete punctuation
    characters from each token, keep only purely alphabetic tokens, and
    drop English stop words.
    """
    lowered = (w.lower() for w in word_tokenize(text))
    stripped = (w.translate(table) for w in lowered)
    return [w for w in stripped if w.isalpha() and w not in stop_words]


# One token list per job description, in the same order as train_data.
train_data2 = [clean_tokens(doc) for doc in train_data]

Converting job description strings: converting words to numeric indexes (to match the input format of neural network)

In [13]:
# Learn the word -> integer index vocabulary from the cleaned token lists.
t = keras.preprocessing.text.Tokenizer(num_words=10000)
t.fit_on_texts(train_data2)

# One slot more than the number of distinct words seen.
vocab_size = 1 + len(t.word_index)

# Replace every token with its index, then right-pad each document with
# zeros to a common length (longest document + 1).
encoded_data = t.texts_to_sequences(train_data2)
max_doc_len = 1 + max(len(seq) for seq in encoded_data)
padded_data = keras.preprocessing.sequence.pad_sequences(
    encoded_data, padding='post', maxlen=max_doc_len)

Converting job classification labels from strings to numeric indexes 

In [14]:
labels = list(full_data['Department'].values)

# Distinct labels in a stable order. Sorting (by string form, so a missing
# value does not break the comparison) keeps each class id the same across
# runs, unlike plain set iteration order.
unique_labels = sorted(set(labels), key=str)

# index -> label, kept for decoding model outputs back to department names.
label_indexes = dict(enumerate(unique_labels))

# label -> index, so each row is encoded with a single dictionary lookup
# instead of scanning every class. Every row gets exactly one id, which keeps
# encoded_labels the same length as (and aligned with) the feature rows.
label_to_index = {label: index for index, label in label_indexes.items()}
encoded_labels = [label_to_index[label] for label in labels]

Import GloVe mappings

In [15]:
glove_file_path = os.getcwd()

# word -> float32 coefficient vector, parsed from the GloVe text file in which
# each line is a word followed by its whitespace-separated coefficients.
embeddings_index = dict()

# The context manager closes the file even if a line fails to parse.
with open(glove_file_path + '/' + 'glove.6B.50d.txt', encoding = 'utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


Create a weight matrix for words in training docs

In [16]:
# Row k holds the 50-value GloVe vector of the word whose tokenizer index is k.
# Words with no GloVe entry keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 50))
for token, row in t.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

Building a basic neural network

In [17]:
# Embedding layer initialised from the pre-built weight matrix and frozen
# (trainable=False), so only the dense layers are trained.
e = keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=50,
    weights=[embedding_matrix],
    input_length=max_doc_len,
    trainable=False,
)

# Embedding -> flatten -> one hidden dense layer -> softmax over the classes.
model = keras.Sequential([
    e,
    keras.layers.Flatten(),
    keras.layers.Dense(512, activation=tf.nn.relu),
    keras.layers.Dense(len(unique_labels), activation=tf.nn.softmax),
])

Instructions for updating:
Colocations handled automatically by placer.


Compiling the model

In [18]:
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Summarizing the model

In [19]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 557, 50)           309100    
_________________________________________________________________
flatten (Flatten)            (None, 27850)             0         
_________________________________________________________________
dense (Dense)                (None, 512)               14259712  
_________________________________________________________________
dense_1 (Dense)              (None, 27)                13851     
Total params: 14,582,663
Trainable params: 14,273,563
Non-trainable params: 309,100
_________________________________________________________________
None


Separating training, validation and test data

In [20]:
# Split sizes for the (already shuffled) rows: the first TEST_SIZE rows are
# held out for the final evaluation, the next TRAIN_SIZE rows are used for
# training, and everything that remains is used for validation.
TEST_SIZE = 300
TRAIN_SIZE = 300
train_end = TEST_SIZE + TRAIN_SIZE

# Features and labels are sliced by position, so they must line up row for row,
# and there must be rows left over for the validation split.
assert len(padded_data) == len(encoded_labels), "features and labels are misaligned"
assert len(padded_data) > train_end, "not enough rows to leave a validation split"

partial_test_data = padded_data[:TEST_SIZE]
partial_test_labels = encoded_labels[:TEST_SIZE]

partial_train_data = padded_data[TEST_SIZE:train_end]
partial_train_labels = encoded_labels[TEST_SIZE:train_end]

x_val = padded_data[train_end:]
y_val = encoded_labels[train_end:]

Fitting the model to the training data, using the validation data to observe whether the model over- or under-fits

In [22]:
# Train for 20 epochs, reporting loss/accuracy on the validation split after
# each epoch; the returned history object records those per-epoch values.
fit_options = dict(
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
    verbose=1,
)
history = model.fit(partial_train_data, partial_train_labels, **fit_options)

Train on 300 samples, validate on 145 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Evaluating the model accuracy on the test data

In [23]:
# Score the held-out rows. With a single 'accuracy' metric compiled in,
# results holds the loss first and the accuracy second.
results = model.evaluate(
    partial_test_data,
    partial_test_labels,
    verbose=0,
)
print("Accuracy achieved: ", results[1])

Accuracy achieved:  0.5366667


Done.