In [0]:
import os,json

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import re

from string import punctuation
import string

import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import *
from nltk.corpus import stopwords


from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer


from keras.layers import Embedding
from keras.preprocessing import sequence
from keras import utils


from imblearn.over_sampling import SMOTE


from sklearn.model_selection import train_test_split
from keras.layers import Input , Dense , LSTM,GlobalAveragePooling1D,GlobalMaxPooling1D,Bidirectional,LSTM,Conv1D
from keras.models import Sequential, Model, load_model
from sklearn.metrics import classification_report,confusion_matrix


In [0]:
!unzip data.zip

In [None]:
import os,json


Reading the JSON files and extracting  ***jd_information*** and ***description*** and storing it in a dataframe

In [0]:
json_files_path = 'data/docs/'
json_files = [pos_json for pos_json in os.listdir(json_files_path) if pos_json.endswith('.json')]

job_desc = pd.DataFrame(columns=['Description', 'Document ID'])
for index, js in enumerate(json_files):
    with open(os.path.join(json_files_path, js)) as json_file:
        json_data = json.load(json_file)        
        desc = json_data['jd_information']['description']
        id= json_data['_id']
        job_desc.loc[index] = [desc ,id]
job_desc.head()

Reading Job Department file

In [253]:
job_dept=pd.read_csv('data/document_departments.csv')
job_dept.head()
job_dept["Document ID"]=job_dept["Document ID"].astype(str)
job_dept.shape


(1162, 2)

In [254]:
print(job_dept.columns)


Index(['Document ID', 'Department'], dtype='object')


Joining the two dataframe on Document ID

In [0]:
job_final = job_desc.merge(job_dept, on="Document ID", how = 'inner')
job_final.head()


Exploratory Data Analysis ( EDA ) of the Dataset




In [0]:

diff_words = job_final.groupby('Department').filter(lambda x: len(x) > 15)
words = diff_words['Department'].value_counts().index.tolist()
fig, ax = plt.subplots(figsize = (20, 10))
sns.countplot(x = diff_words['Department'], order = words, ax = ax)
plt.xticks(rotation = 90)
plt.show()



Checking if description column has some text or it is empty

In [0]:
job_final['Description'] == ""

Dropping empty description rows

In [258]:
job_final = job_final[job_final.Description != '']
job_final.shape

(745, 3)

Preprocessing the Text


Removing contractions

In [0]:
def remove_contraction(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase



In [0]:
final = job_final.apply(lambda row: remove_contraction(row['Description']), axis=1)
job_final['Description'] = final
job_final.head()

Removing numbers

In [0]:

punctuations=string.punctuation
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text
job_final['Description']=job_final['Description'].apply(lambda x:remove_punct(x))
job_final.head()

Tokenizing the description and storing it in tokenized_description column

In [0]:
job_final['tokenized_description']=job_final.apply(lambda row:nltk.word_tokenize(row['Description']),axis=1 )
job_final.head()

Removing stopwords

In [0]:
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text
job_final['tokenized_description']=job_final['tokenized_description'].apply(lambda x: remove_stopwords(x))

Stemming

In [0]:
ps = nltk.PorterStemmer()
def text_stemmer(text):
    text = [ps.stem(word) for word in text]
    return text

job_final['tokenized_description']=job_final['tokenized_description'].apply(lambda x: text_stemmer(x))

In [0]:
job_final.head()

checking for class imbalance

In [0]:
def class_pie_chart(labels):
    class_map={}
    for i in labels:
        if str(i) not in class_map:
            class_map[str(i)]=1
        else:
            class_map[str(i)]+=1
    return class_map

pie=class_pie_chart(job_final['Department'].values)
print(pie)
q=[i for i in pie.keys()]

sizes = [i for i in pie.values()]
colors = ['red', 'blue', 'yellow', 'green','purple']
plt.pie(sizes, labels=q, colors=colors,
         autopct='%1.1f%%',shadow=True)
plt.show()

In [0]:
class_map=pie
for item in pie.keys():
    if pie[item]<=5:
        tempdf = job_final.loc[job_final['Department'] == item]
        job_final = job_final.append(tempdf)
        print(tempdf)

Converting words to vectors

In [0]:
embedding_size=50
input_length=100


embeddings_index = {}
file = open('glove.6B.50d.txt')
for line in file:
    values = line.split()
    word = values[0]
    coef = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coef
f.close()

In [0]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(job_final.tokenized_description)
X_data=tokenizer.texts_to_sequences(job_final.tokenized_description)

In [270]:
word_index = tokenizer.word_index
print('Total length of tokens are ' +str( len(word_index)))

Total length of tokens are 7183


Encoding the target class 

In [0]:
lbEnc=LabelEncoder()
job_final['Department']=lbEnc.fit_transform(job_final['Department'])
job_final['Department'].head()



LSTM Architecture

In [0]:
max_features=200000
max_sentence_len=75
max_sentence_num=4
embedding_size=50

embedding_layer = Embedding(len(word_index)+1,embedding_size,input_length=max_sentence_len,trainable=False)


In [273]:
X = list(sequence.pad_sequences(X_data, maxlen=max_sentence_len))
X=np.array(X)

print(X.shape)


Y=utils.np_utils.to_categorical(job_final.Department)
print(Y.shape)

(778, 75)
(778, 27)


Using SMOTE for over sampling

In [0]:
def class_balancer(dataset,labels):
    smote_X,smote_Y=SMOTE(k_neighbors=1).fit_resample(dataset,labels)
    return smote_X,smote_Y

In [277]:
smote_X,smote_Y=class_balancer(X,Y)

print(smote_X.shape)
print(smote_Y.shape)


(6210, 75)
(6210, 27)


Splitting the data into test-train

In [0]:
X_train,X_test,Y_train,Y_test=train_test_split(smote_X,smote_Y,test_size=0.30)

In [0]:
def classifier():
    inp = Input(shape=(max_sentence_len,), dtype='int32')
    x = embedding_layer(inp)
    x = Bidirectional(LSTM(128, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))(x)
    outp = Dense(27, activation="softmax")(x)
    BiLSTM = Model(inp, outp)
    BiLSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    print(BiLSTM.summary())
    return BiLSTM



In [282]:
model=classifier()
history=model.fit(X_train,Y_train,batch_size=64,epochs=20,validation_data=[X_test,Y_test])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 75, 50)            359200    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               183296    
_________________________________________________________________
dense_23 (Dense)             (None, 27)                6939      
Total params: 549,435
Trainable params: 190,235
Non-trainable params: 359,200
_________________________________________________________________
None
Train on 4968 samples, validate on 1242 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Ep

In [285]:
trained_model=np.argmax(model.predict(X_test),axis=1)
trained_model=trained_model+1
test=np.argmax(Y_test,axis=1)
test=test+1

print(classification_report(test, trained_model))

              precision    recall  f1-score   support

           1       0.49      0.55      0.52        49
           2       0.93      1.00      0.96        54
           3       1.00      1.00      1.00        41
           4       0.39      0.23      0.29        47
           5       1.00      1.00      1.00        42
           6       0.29      0.04      0.07        48
           7       1.00      1.00      1.00        44
           8       0.71      0.57      0.63        42
           9       1.00      1.00      1.00        49
          10       0.24      0.34      0.28        41
          11       0.98      1.00      0.99        46
          12       0.39      0.67      0.49        48
          13       0.98      1.00      0.99        48
          14       0.98      1.00      0.99        49
          15       0.64      0.78      0.70        50
          16       0.76      1.00      0.87        39
          17       0.91      1.00      0.95        53
          18       1.00    