# Imports

In [93]:
import pandas as pd
import numpy as np
import glob
import re
import string
from bs4 import BeautifulSoup
import pickle
import time

from textblob import TextBlob

# nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [15]:
# change the working directory 
import os
os.chdir('C:\\Users\\yhadad\\Desktop\\DS\\NLP JT Classifier')
print(os.getcwd())

C:\Users\yhadad\Desktop\DS\NLP JT Classifier


In [16]:
# load data

df = pd.read_parquet('data_acquisition/dataset/corpus.parquet')
print(df.shape)

(31427, 5)


In [17]:
df.head()

Unnamed: 0,JobId,JobTitleId,ExtJobTitleText,JobTitle,Description
0,40282074,427,Business Development Center Agent,Medical Scientist,"For generations, the Landers family has been i..."
1,44324647,400725,Technician I,Automotive Technician,"<div class=""earcu_posdescriptionContainer""><di..."
2,46373521,401285,"RN or LPN Clinic Nurse, Urology",Nurse Practitioner (NP),<p><strong>Position:</strong> Licensed Practic...
3,48612525,400725,Service Technician,Automotive Technician,Service Technician <br/> Every employee with ...
4,50240652,400725,Mechanic Tech A,Automotive Technician,"<div class=""earcu_posdescriptionContainer""><di..."


In [18]:
# drop duplicates

df.drop_duplicates(inplace=True)
print(df.shape)

(31427, 5)


In [19]:
# remove nulls

df.dropna(inplace=True)
print(df.shape)

(31426, 5)


In [21]:
# filter categories with only one example

job_title_count = df.JobTitle.value_counts().to_frame()
job_title_count = job_title_count[job_title_count.JobTitle>1]
df = df[df.JobTitle.isin(job_title_count.index)]
print(df.shape)

(30689, 5)


In [22]:
df[df.JobTitleId<0]

Unnamed: 0,JobId,JobTitleId,ExtJobTitleText,JobTitle,Description
13,84279773,-1,Independent Operator,0000,"<div class=""earcu_posdescriptionContainer""><di..."
47,106345694,-1,Behavioral Health Activity Therapist,0000,"<p style=""margin: 0in 0in 8pt;""><span style=""f..."
255,112523983,-1,Family Medicine NP/PA - Locum Tenens - Souther...,0000,"<p><span style=""font-size:14px"">Locums FP NP/P..."
261,112524203,-1,Physician Only - Maternal Fetal Medicine Locum...,0000,<p><strong><u>Alumni Healthcare Staffing - </u...
262,112524210,-1,Full Time ER and Outpatient C/A Psychiatry Loc...,0000,<p> </p>\n\n<ul>\n\t<li>Immediate start date</...
...,...,...,...,...,...
31126,492956814,-1,Senior Designer,0000,<br> </p> Our client is a rapidly-growing agen...
31148,492993582,-1,Warehouse Clerical Support,0000,"<p>Top Job</p> <p>Located in Saint Cloud, MN</..."
31197,493028900,-1,Learning Disabilities Teacher-Consultant - Sho...,0000,"<p><i style=""font-size: 10pt; font-family: Cam..."
31357,493031025,-1,Test Job 12,0000,<BR><BR> Internal ID: a0x3r00000K65b2AAB


In [23]:
# remove unmapped Job-Titles = (-1) 

df.drop(df[df.JobTitleId<0].index, inplace=True)
print(df.shape)

(29940, 5)


In [6]:
# pip install nltk

In [54]:
# Apply a first round of text cleaning techniques

def clean_text_round1(text):
    """
    1. make text lowercase
    2. remove html tags
    3. remove punctuation 
    4. remove words containing numbers
    5. remove stopwords 
    6. return str of clean text words
    """  
    text = text.lower()
    text = re.sub('<.*?>', '', text)
    text = BeautifulSoup(text).get_text()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    tokens_without_sw = [word for word in text.split() if word not in stopwords.words('english')] 
    filtered_sentence = (" ").join(tokens_without_sw)
    
    return filtered_sentence

round1 = lambda x: clean_text_round1(x)

In [77]:
# Let's take a look at the updated text

start = time.time()
data_clean = pd.DataFrame(df.Description.apply(round1))
end = time.time() 
print('execution time in minutes: ', (end - start)/60) 

In [87]:
# Apply a second round of cleaning
def noun_only(text):
    noun_list = ['NN','NNS','NNP','NNPS']
    l1 = []
    for w in TextBlob(text).tags:
        if w[1] in noun_list:
            l1.append(w[0])
    text = " ".join([w for w in l1])
    
    return text 

In [89]:
start = time.time()
data_clean['noun_only'] = pd.DataFrame(data_clean.Description.apply(noun_only))
end = time.time() 
print('execution time in minutes: ', (end - start)/60) 

execution time in minutes:  5.4468150695165


In [None]:
data_clean['job_title'] = df.JobTitle

In [104]:
data_clean.rename(columns={'Description':'description'},inplace=True)

In [158]:
data_clean['desc_words_count'] = data_clean['description'].apply(lambda word: len([word for word in word.split()]))
data_clean['noun_only_words_count'] = data_clean['noun_only'].apply(lambda word: len([word for word in word.split()]))

In [163]:
print('avg words count in the description :',data_clean['desc_words_count'].mean())
print('avg words count in the noun only :',data_clean['noun_only_words_count'].mean())

avg words count in the description : 168.01169004676018
avg words count in the noun only : 92.71579826319305


In [159]:
data_clean.head()

Unnamed: 0,description,noun_only,job_title,desc_words_count,noun_only_words_count
0,generations landers family car businesssteve l...,generations landers family car businesssteve d...,Medical Scientist,244,133
1,avis budget group actionpacked highenergy work...,budget group workplace leader travel services ...,Automotive Technician,351,197
2,position licensed practical nurse register nur...,position nurse register nurse department urolo...,Nurse Practitioner (NP),76,42
3,service technician every employee starling che...,service employee chevrolet successour growth e...,Automotive Technician,123,65
4,avis budget group actionpacked highenergy work...,budget group workplace leader travel services ...,Automotive Technician,350,197


In [199]:
data_clean.describe(percentiles=[0.7,0.8,0.9,0.95,0.99])

Unnamed: 0,desc_words_count,noun_only_words_count
count,29940.0,29940.0
mean,168.01169,92.715798
std,130.989274,74.337104
min,0.0,0.0
50%,147.0,80.0
70%,213.0,119.0
80%,254.0,143.0
90%,325.0,180.0
95%,407.0,231.0
99%,623.0,350.61


In [201]:
# sampling the description before and after the nouns reduction 
s = data_clean.sample()
print(s.job_title.iloc[0])
print('\n')
print(s.description.iloc[0])
print('\n')
print(s.noun_only.iloc[0])

Production Manager


location new beaver ave pittsburgh paeveryone welcome us unique that’s makes us amazing believe inclusiveness celebrating person’s individuality there’s power bringing people different points view life experiences together bring best ideas provide equal employment opportunities eeo applicants considered regardless race color religion national origin age sex marital status ancestry physical mental disability veteran status sexual orientation feel safe comfortable there’s limit achieve nutshell… production manager goodblend oversees production packaging operations works closely general manager operations ensure production needs met provides leadership guidance teams manages employee relations production reporting scheduling effectivenessefficiencies facilitates lean initiatives acquisitions new staff role goodblend one renowned retail brands parallel umbrella parallel one largest multistate cannabis companies world owning operating five markets florida surterra welln

In [106]:
# save processed data

data_clean.to_pickle('data_acquisition\dataset\data_clean.pkl')

In [127]:
data_clean_sample = data_clean.sample(10000)

In [130]:
# We are going to create a document-term matrix using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
data_cv = cv.fit_transform(data_clean.description)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
data_dtm.index = data_clean.index


In [132]:
data_dtm.shape

(29940, 58678)

In [131]:
data_dtm.head()

Unnamed: 0,aa,aaa,aaae,aaahc,aaalac,aaas,aabb,aac,aacat,aacc,...,étirements,études,évaluations,êtes,êtesretenu,être,órdenes,œprotect,œuvre,œyes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
cv.get_feature_names_out().tolist()

['aa',
 'aaeeo',
 'aagl',
 'aama',
 'ab',
 'aba',
 'abbey',
 'abemaobem',
 'abide',
 'abides',
 'abiertassalariogana',
 'abilities',
 'abilitieshave',
 'ability',
 'abilityjob',
 'abilitymentality',
 'able',
 'abm',
 'absence',
 'absencemanages',
 'absolutely',
 'abundance',
 'abuse',
 'ac',
 'academic',
 'academics',
 'academy',
 'acas',
 'acc',
 'accelerate',
 'accelerating',
 'accelerator',
 'accept',
 'acceptability',
 'acceptable',
 'acceptance',
 'accepted',
 'acceptedquality',
 'accepting',
 'access',
 'accessibility',
 'accessible',
 'accessories',
 'accident',
 'accidental',
 'accidents',
 'accommodate',
 'accommodated',
 'accommodating',
 'accommodation',
 'accommodationa',
 'accommodationamazon',
 'accommodationbase',
 'accommodationbasic',
 'accommodationlong',
 'accommodations',
 'accommodationseu',
 'accomplish',
 'accomplished',
 'accomplishments',
 'accomplishmentscontrols',
 'accordance',
 'accorded',
 'according',
 'accordingly',
 'account',
 'accountability',
 'accou

In [200]:
# Let's pickle it for later use
data_dtm.sample(1000).to_pickle("artifacts\dtm_1000.pkl")

In [141]:
pickle.dump(cv, open("artifacts\cv.pkl", "wb"))