In [None]:

import nltk
nltk.download('punkt_tab')
import ssl
import re
import matplotlib.pyplot as plt
import numpy as np
import gensim
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd

import pandas as pd, numpy as np, gensim.downloader as api
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, classification_report)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, Dropout, Conv1D, GlobalMaxPooling1D)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Upload the dataset (expected file: "data (1).csv") through google.colab's upload helper.
# NOTE(review): the recorded output shows this upload was saved as "data (1) (2).csv",
# presumably because a file with the original name already existed in the session.
from google.colab import files
uploaded = files.upload()

Saving data (1).csv to data (1) (2).csv


In [None]:
#get data
import io

# Path used when nothing was uploaded in this session.
DATA_PATH = 'data (1).csv'

# Prefer the bytes returned by the upload cell. Colab renames a re-uploaded file
# (the recorded output shows "data (1) (2).csv"), so reading only the fixed path
# can silently load a stale copy left over from an earlier upload.
uploaded_files = globals().get('uploaded') or {}
if uploaded_files:
  # First uploaded file is taken to be the dataset.
  data = pd.read_csv(io.BytesIO(next(iter(uploaded_files.values()))))
else:
  data = pd.read_csv(DATA_PATH)

# Column holding the class label to predict.
TARGET = "job_category"
# Free-text columns that are concatenated into one document per row.
TEXT_COLUMNS = [
    'career_objective', 'skills', 'educational_institution_name', 'degree_names',
    'passing_years', 'major_field_of_studies', 'professional_company_names',
    'start_dates', 'end_dates', 'positions', 'locations', 'responsibilities',
    'certification_providers', 'certification_skills', 'job_position_name',
]

# Fail early, with a readable message, if the file does not have the expected schema.
missing_columns = [col for col in TEXT_COLUMNS + [TARGET] if col not in data.columns]
if missing_columns:
  raise KeyError(f"Dataset is missing expected columns: {missing_columns}")



In [None]:
def merge_text(row):
  """Join the non-null TEXT_COLUMNS values of one row into a single space-separated string."""
  parts = []
  for col in TEXT_COLUMNS:
    value = row[col]
    # Missing cells are skipped so they do not end up as the literal text "nan".
    if pd.isnull(value):
      continue
    parts.append(str(value))
  return ' '.join(parts)

# Build one combined text field per row by applying merge_text row-wise (axis=1).
data['text'] = data.apply(merge_text, axis=1)
print(data['text'])

0       ['R', 'Python', 'Tableau', 'Power BI', 'SQL', ...
1       ['streamline', 'approach', 'balance sheet', 'b...
2       A curiosity-driven data scientist, eager to le...
3       To pursue excellence in the dynamic business w...
4       ['QA Engineering software expertise - Design, ...
                              ...                        
8176    ['Budget development', 'Exceptional interperso...
8177    Deep Learning fresher who is looking to join a...
8178    ['QA Engineering software expertise - Design, ...
8179    Seeking a Position of Engineering Technician. ...
8180    Post Graduate Analyst experienced in Business ...
Name: text, Length: 8181, dtype: object


In [None]:
#tokenize
# Tokenization uses gensim's simple_preprocess with deacc=True; an earlier
# nltk.word_tokenize variant is no longer used.
def _tokenize(text):
  """Tokenize one merged text string with simple_preprocess(deacc=True)."""
  return simple_preprocess(text, deacc=True)

data['tokens'] = data['text'].map(_tokenize)
print(data['tokens'])

0       [python, tableau, power, bi, sql, sas, deep, l...
1       [streamline, approach, balance, sheet, bonds, ...
2       [curiosity, driven, data, scientist, eager, to...
3       [to, pursue, excellence, in, the, dynamic, bus...
4       [qa, engineering, software, expertise, design,...
                              ...                        
8176    [budget, development, exceptional, interperson...
8177    [deep, learning, fresher, who, is, looking, to...
8178    [qa, engineering, software, expertise, design,...
8179    [seeking, position, of, engineering, technicia...
8180    [post, graduate, analyst, experienced, in, bus...
Name: tokens, Length: 8181, dtype: object


In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
word2vec = api.load('word2vec-google-news-300')
# Embedding size; used for the zero-vector fallback in row_vector and for model input shapes.
dimension = 300

def row_vector(tokens):
  """Average the word2vec embeddings of the in-vocabulary tokens.

  Returns a zero vector of length `dimension` when no token is in the vocabulary.
  """
  known = [word2vec[tok] for tok in tokens if tok in word2vec]
  if not known:
    return np.zeros(dimension)
  return np.mean(known, axis=0)

# One averaged embedding per row, stacked row-wise into a 2-D feature matrix.
data['vectors'] = data['tokens'].apply(row_vector)
x = np.vstack(data['vectors'].values)

# Encode the TARGET column's class labels as integers; `le` is reused later for class names.
le = LabelEncoder()
y = le.fit_transform(data[TARGET])

In [None]:
#train/test split (80% train, 20% test, fixed random_state)
#stratify=y preserves the class distribution in both partitions
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
#outline common metrics
def metrics(y_true, prediction, name):
  """Print a labelled evaluation report for one classifier.

  Args:
    y_true: true encoded labels.
    prediction: predicted encoded labels, aligned with y_true.
    name: display name of the model, printed as the report header.

  Prints accuracy, weighted precision/recall/F1 and the per-class report.
  Class names are taken from the global LabelEncoder `le`. Returns None.
  """
  # The header makes it clear which model each report belongs to.
  print(f"=== {name} ===")
  print("Accuracy:", accuracy_score(y_true, prediction))
  # zero_division=0 scores classes with no predicted samples as 0 instead of warning.
  print("Precision:", precision_score(y_true, prediction, average='weighted', zero_division=0))
  print("Recall:", recall_score(y_true, prediction, average='weighted', zero_division=0))
  print("F1 Score:", f1_score(y_true, prediction, average='weighted', zero_division=0))
  print(classification_report(y_true, prediction, target_names = le.classes_, zero_division=0))


1. STAT: NAIVE BAYES CLASSIFIER

In [None]:
# Fit Gaussian Naive Bayes on the averaged embeddings and report test-set metrics.
naive_bayes = GaussianNB()
naive_bayes.fit(x_train, y_train)
nb_prediction = naive_bayes.predict(x_test)
metrics(y_test, nb_prediction, "Naive Bayes")

Accuracy: 0.6548564447159438
Precision: 0.6716427943542969
Recall: 0.6548564447159438
F1 Score: 0.6575599558161979
                                  precision    recall  f1-score   support

             Business Management       0.49      0.67      0.57       273
Civil and Mechanical Engineering       0.72      0.53      0.61       342
                  HR & Marketing       0.77      0.73      0.75       205
                   Software & IT       0.75      0.71      0.73       477
                Tech Engineering       0.60      0.65      0.62       340

                        accuracy                           0.65      1637
                       macro avg       0.67      0.66      0.66      1637
                    weighted avg       0.67      0.65      0.66      1637



2. STAT: SVM

In [None]:
# Linear-kernel SVM with balanced class weights, evaluated on the held-out split.
svm = SVC(
    kernel='linear',
    C=1.0,
    class_weight='balanced',
)
svm.fit(x_train, y_train)
svm_prediction = svm.predict(x_test)
metrics(y_test, svm_prediction, "SVM")

Accuracy: 0.9590714722052535
Precision: 0.9593147733840452
Recall: 0.9590714722052535
F1 Score: 0.9591017660479283
                                  precision    recall  f1-score   support

             Business Management       0.93      0.96      0.95       273
Civil and Mechanical Engineering       0.96      0.94      0.95       342
                  HR & Marketing       0.99      0.98      0.98       205
                   Software & IT       0.95      0.97      0.96       477
                Tech Engineering       0.98      0.96      0.97       340

                        accuracy                           0.96      1637
                       macro avg       0.96      0.96      0.96      1637
                    weighted avg       0.96      0.96      0.96      1637



NEURAL CLASSIFICATION

In [None]:
#specify class weights for neural models
# 'balanced' weights are computed from the training labels only.
class_weight = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train,
)
# Map positional class index -> weight, the dict form passed to fit(class_weight=...).
class_weight_dict = dict(enumerate(class_weight))


3. NEURAL: FEEDFORWARD NEURAL NETWORK

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable across runs (the data split already uses random_state=42).
tf.keras.utils.set_random_seed(42)

# Feed-forward classifier over the averaged document embeddings.
# An explicit Input object replaces the `input_shape=` kwarg on the first Dense
# layer, which triggers the Keras warning visible in this cell's recorded output.
ffnn = Sequential([
    tf.keras.Input(shape=(dimension,)),
    Dense(256, activation='relu'),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(len(le.classes_), activation='softmax')
])

# Integer-encoded targets -> sparse categorical cross-entropy.
ffnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

ffnn.fit(x_train, y_train, epochs=5, batch_size=32, class_weight=class_weight_dict, verbose=0)

# Predicted class = index of the highest softmax probability.
ffnn_probs = ffnn.predict(x_test)
prediction = ffnn_probs.argmax(axis=1)
metrics(y_test, prediction, "FFNN")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Accuracy: 0.9584605986560782
Precision: 0.9615414122773325
Recall: 0.9584605986560782
F1 Score: 0.9587852653969889
                                  precision    recall  f1-score   support

             Business Management       0.87      0.99      0.92       273
Civil and Mechanical Engineering       0.97      0.91      0.94       342
                  HR & Marketing       0.95      1.00      0.97       205
                   Software & IT       1.00      0.94      0.97       477
                Tech Engineering       0.98      0.98      0.98       340

                        accuracy                           0.96      1637
                       macro avg       0.95      0.96      0.96      1637
                    weighted avg       0.96      0.96      0.96      1637



4. NEURAL: CNN

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Maximum number of token embeddings kept per document.
max_length = 100

def tokens_to_sequence(tokens):
  """Return the embeddings of the first `max_length` in-vocabulary tokens.

  Out-of-vocabulary tokens are skipped. Lookup stops as soon as `max_length`
  vectors have been collected, so long documents are not embedded in full
  only to be truncated afterwards.
  """
  vectors = []
  for token in tokens:
    if len(vectors) >= max_length:
      break
    if token in word2vec:
      vectors.append(word2vec[token])
  return vectors

# Build one (max_length, dimension) float32 matrix per document, zero-padded at the end.
# Filling a pre-allocated zero matrix keeps the shape fixed even when a document has
# no in-vocabulary tokens; padding an empty list with pad_sequences instead yields
# shape (max_length,), which makes the stacked array ragged.
seqs = []
for tokens in data['tokens']:
  vectors = tokens_to_sequence(tokens)
  padded = np.zeros((max_length, dimension), dtype='float32')
  if vectors:
    padded[:len(vectors)] = vectors
  seqs.append(padded)

x_seq = np.array(seqs)
# Same split parameters as the averaged-embedding split, so both use the same partition.
x_train_seq, x_test_seq, y_train_seq, y_test_seq = train_test_split(x_seq, y, test_size=0.2, random_state=42, stratify=y)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# 1-D CNN over the padded embedding sequences.
# An explicit Input object replaces the `input_shape=` kwarg on Conv1D, which
# triggers the Keras warning visible in this cell's recorded output.
cnn = Sequential([
    tf.keras.Input(shape=(max_length, dimension)),
    Conv1D(256, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

# Integer-encoded targets -> sparse categorical cross-entropy.
cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split=0.2 holds out the last 20% of the training data for validation.
cnn.fit(x_train_seq, y_train_seq, epochs=10, batch_size=32, validation_split=0.2, class_weight=class_weight_dict, verbose=0)

# Predicted class = index of the highest softmax probability.
cnn_probs = cnn.predict(x_test_seq)
prediction = cnn_probs.argmax(axis=1)
metrics(y_test_seq, prediction, "CNN")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

# Defined before first use: these were previously assigned only after the
# sequence-building code, so this cell worked only when an earlier cell had
# already set max_length.
max_length = 100
dimension = word2vec.vector_size  # Usually 300

def tokens_to_sequence(tokens):
    """Return the embeddings of the first `max_length` in-vocabulary tokens."""
    return [word2vec[token] for token in tokens if token in word2vec][:max_length]

# Build one (max_length, dimension) float32 matrix per document, zero-padded at the end.
# Filling a pre-allocated zero matrix keeps the shape fixed even when a document has
# no in-vocabulary tokens; padding an empty list with pad_sequences instead yields
# shape (max_length,), which makes the stacked array ragged.
seqs = []
for tokens in data['tokens']:
    vectors = tokens_to_sequence(tokens)
    padded = np.zeros((max_length, dimension), dtype='float32')
    if vectors:
        padded[:len(vectors)] = vectors
    seqs.append(padded)

x_seq = np.array(seqs)
x_train_seq, x_test_seq, y_train_seq, y_test_seq = train_test_split(x_seq, y, test_size=0.2, random_state=42, stratify=y)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# LSTM classifier over the padded embedding sequences.
# An explicit Input object replaces the `input_shape=` kwarg on the LSTM layer,
# which triggers the Keras warning visible in this cell's recorded output.
# NOTE(review): sequences are zero-padded at the end and fed in without masking,
# so the LSTM also steps over the padding; consider a Masking layer.
rnn = Sequential([
    tf.keras.Input(shape=(max_length, dimension)),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

# Integer-encoded targets -> sparse categorical cross-entropy.
rnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The recorded run shows validation accuracy collapsing from ~0.78 to ~0.22 at
# epoch 6. Stop once val_loss has not improved for 3 epochs and restore the best
# weights, so the evaluated model is not a degraded late-epoch state.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Balanced class weights; verbose=1 prints per-epoch progress.
rnn.fit(x_train_seq, y_train_seq,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        class_weight=class_weight_dict,
        callbacks=[early_stop],
        verbose=1)

# Predicted class = index of the highest softmax probability.
rnn_probs = rnn.predict(x_test_seq)
prediction = rnn_probs.argmax(axis=1)

metrics(y_test_seq, prediction, "RNN")


  super().__init__(**kwargs)


Epoch 1/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 197ms/step - accuracy: 0.3694 - loss: 1.4545 - val_accuracy: 0.6692 - val_loss: 0.8222
Epoch 2/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 198ms/step - accuracy: 0.6820 - loss: 0.8099 - val_accuracy: 0.7746 - val_loss: 0.4888
Epoch 3/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 185ms/step - accuracy: 0.7732 - loss: 0.5050 - val_accuracy: 0.7785 - val_loss: 0.4621
Epoch 4/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 202ms/step - accuracy: 0.7649 - loss: 0.4905 - val_accuracy: 0.7785 - val_loss: 0.4435
Epoch 5/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 185ms/step - accuracy: 0.7823 - loss: 0.4619 - val_accuracy: 0.7800 - val_loss: 0.4527
Epoch 6/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 186ms/step - accuracy: 0.7512 - loss: 0.5900 - val_accuracy: 0.2170 - val_loss: 1.5994
Epoch 7/10