# Imports

In [1]:
import pandas as pd
import numpy as np
import glob
import re
import string
from bs4 import BeautifulSoup

# nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data

# Forward slashes work on Windows, Linux and macOS alike; the previous
# 'datasets\job_...' literal relied on the invalid escape sequence '\j'
# and could only be resolved as a path on Windows.
df = pd.read_parquet('datasets/job_title_dataset.parquet')
print(df.shape)

(31427, 5)


In [3]:
# preview the first five rows of the raw data
df.head(5)

Unnamed: 0,JobId,JobTitleId,ExtJobTitleText,JobTitle,Description
0,40282074,427,Business Development Center Agent,Medical Scientist,"For generations, the Landers family has been i..."
1,44324647,400725,Technician I,Automotive Technician,"<div class=""earcu_posdescriptionContainer""><di..."
2,46373521,401285,"RN or LPN Clinic Nurse, Urology",Nurse Practitioner (NP),<p><strong>Position:</strong> Licensed Practic...
3,48612525,400725,Service Technician,Automotive Technician,Service Technician <br/> Every employee with ...
4,50240652,400725,Mechanic Tech A,Automotive Technician,"<div class=""earcu_posdescriptionContainer""><di..."


In [4]:
# drop exact duplicate rows (rebinding instead of mutating in place)

df = df.drop_duplicates()
print(df.shape)

(31427, 5)


In [5]:
# remove rows that contain a null in any column

df = df.dropna()
print(df.shape)

(31426, 5)


In [171]:
# sample at most 100 examples per category

# df_sample = df.groupby('JobTitle').apply(lambda s: s.sample(100, replace=True)).drop_duplicates()
# print(df_sample.shape)

In [6]:
# filter out categories that have only one example

# Filter on the counts Series itself rather than on a column of the
# `to_frame()` result: the name of that column differs between pandas
# versions ('JobTitle' before 2.0, 'count' from 2.0 on), so attribute
# access such as `job_title_count.JobTitle` is not portable.
title_counts = df['JobTitle'].value_counts()
job_title_count = title_counts[title_counts > 1].to_frame()  # index = job titles kept
df = df[df['JobTitle'].isin(job_title_count.index)]
print(df.shape)

(30689, 5)


In [7]:
# inspect rows whose JobTitleId is negative
df.loc[df['JobTitleId'] < 0]

Unnamed: 0,JobId,JobTitleId,ExtJobTitleText,JobTitle,Description
13,84279773,-1,Independent Operator,0000,"<div class=""earcu_posdescriptionContainer""><di..."
47,106345694,-1,Behavioral Health Activity Therapist,0000,"<p style=""margin: 0in 0in 8pt;""><span style=""f..."
255,112523983,-1,Family Medicine NP/PA - Locum Tenens - Souther...,0000,"<p><span style=""font-size:14px"">Locums FP NP/P..."
261,112524203,-1,Physician Only - Maternal Fetal Medicine Locum...,0000,<p><strong><u>Alumni Healthcare Staffing - </u...
262,112524210,-1,Full Time ER and Outpatient C/A Psychiatry Loc...,0000,<p> </p>\n\n<ul>\n\t<li>Immediate start date</...
...,...,...,...,...,...
31126,492956814,-1,Senior Designer,0000,<br> </p> Our client is a rapidly-growing agen...
31148,492993582,-1,Warehouse Clerical Support,0000,"<p>Top Job</p> <p>Located in Saint Cloud, MN</..."
31197,493028900,-1,Learning Disabilities Teacher-Consultant - Sho...,0000,"<p><i style=""font-size: 10pt; font-family: Cam..."
31357,493031025,-1,Test Job 12,0000,<BR><BR> Internal ID: a0x3r00000K65b2AAB


In [8]:
# remove unmapped job titles (JobTitleId == -1)

# Select rows with a boolean mask instead of dropping by index label:
# a label-based drop would also remove any other row that happens to share
# an index label with an unmapped row, and the mask form can be re-run
# safely because it does not mutate the frame in place.
df = df.loc[~(df['JobTitleId'] < 0)]
print(df.shape)

(29940, 5)


In [6]:
# pip install nltk

In [9]:
# Apply a first round of text cleaning techniques

# Stopword sets already loaded from the NLTK corpus, keyed by language.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return the NLTK stopwords for `language` as a frozenset, loading them once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def text_process(text):
    """
    Clean a raw (possibly HTML) text and split it into words.

    1. make text lowercase
    2. remove html tags and decode what BeautifulSoup extracts as text
    3. remove punctuation
    4. remove words containing numbers
    5. remove english stopwords
    6. return the list of remaining words, in their original order

    Parameters
    ----------
    text : str
        Raw text; may contain HTML markup.

    Returns
    -------
    list of str
        Cleaned, lowercased words (duplicates are kept).
    """
    text = text.lower()
    # Replace each tag with a space (not an empty string) so that words
    # separated only by markup, e.g. "business</p><p>strategy", are not
    # glued together into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Look the stopwords up once per call in a cached set instead of
    # reloading the corpus list for every single word.
    stop_words = _stopword_set('english')
    return [word for word in text.split() if word not in stop_words]

round1 = lambda x: text_process(x)

## splitting the data

In [101]:
# df = df_sample.copy()

In [10]:
# sanity check: current (rows, columns) of df
df.shape

(29940, 5)

In [11]:
# features: cleaned word list per description; labels: the job title
X = df['Description'].apply(text_process)
y = df.JobTitle

In [12]:
# combine the cleaned descriptions and their labels into one frame
processed_data = X.to_frame().assign(JobTitle=y)
processed_data.head()

Unnamed: 0,Description,JobTitle
0,"[generations, landers, family, car, businessst...",Medical Scientist
1,"[avis, budget, group, actionpacked, highenergy...",Automotive Technician
2,"[position, licensed, practical, nurse, registe...",Nurse Practitioner (NP)
3,"[service, technician, every, employee, starlin...",Automotive Technician
4,"[avis, budget, group, actionpacked, highenergy...",Automotive Technician


In [14]:
# save processed data

# Forward slashes keep the path portable; the previous 'datasets\processed_...'
# literal relied on the invalid escape sequence '\p' and only resolved to a
# file inside the datasets folder on Windows.
# NOTE(review): the Description column holds Python lists, which CSV stores
# as their string representation - confirm the downstream reader parses them.
processed_data.to_csv('datasets/processed_data.csv', index=False)