# Imports

In [1]:
import pandas as pd
import numpy as np
import glob
import re
import string
from bs4 import BeautifulSoup

# nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# read the raw job-title dataset from its parquet file

DATA_PATH = 'job_title_dataset.parquet'
df = pd.read_parquet(DATA_PATH)

In [3]:
# discard every row that has a missing value in any column

df = df.dropna()

In [4]:
# remove unmapped Job-Titles = (-1), i.e. every row whose JobTitleId is negative
# Filter with a boolean mask instead of dropping by index label: label-based
# drop() removes *all* rows sharing a label, so a non-unique index would
# silently discard valid rows as well.

df = df.loc[~(df.JobTitleId < 0)]

In [5]:
# keep only Job-Titles that occur more than 10 times
# NOTE: the comparison is strict, so a title with exactly 10 records is removed too

jt_gb = df.groupby('JobTitle').size()
job_titles = jt_gb.loc[jt_gb > 10].index
df = df.loc[df.JobTitle.isin(job_titles)]

In [6]:
# pip install nltk

In [24]:
# Apply a first round of text cleaning techniques

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(text, stop_words=None):
    """
    Clean a raw description and return the words that remain.

    Steps, in order:
    1. make text lowercase
    2. remove html tags (regex first, then BeautifulSoup on what is left)
    3. remove punctuation
    4. remove words containing numbers
    5. split on whitespace and drop stop words

    Parameters
    ----------
    text : str
        Raw description text.
    stop_words : collection of str, optional
        Words to discard in step 5. When omitted, the NLTK English stop-word
        list is used; it is loaded once and cached as a frozenset.

    Returns
    -------
    list of str
        Cleaned words in their original order (duplicates are kept).
    """
    if stop_words is None:
        # Loading the list once avoids re-reading the corpus for every word,
        # and the frozenset makes each membership test O(1).
        stop_words = _english_stopwords()

    text = text.lower()
    text = re.sub('<.*?>', '', text)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (results can differ between machines) and warns.
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Punctuation is deleted, not replaced by a space, so "high-energy"
    # becomes "highenergy".
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
    text = re.sub(r'\w*\d\w*', '', text)

    return [word for word in text.split() if word not in stop_words]

round1 = lambda x: text_process(x)

## splitting the data

In [8]:
# Inspect the (rows, columns) shape of df at this point in the notebook
df.shape

(26457, 5)

In [22]:
# trial run of the cleaner on the first 100 descriptions only
desc = [text_process(raw) for raw in df.Description.tolist()[:100]]

In [25]:
# X: one cleaned word list per description; y: the matching job title
X = df['Description'].map(text_process)
y = df.JobTitle

In [33]:
# combine the cleaned descriptions and their labels into a single frame
processed_data = X.to_frame().assign(JobTitle=y)
processed_data.head()

Unnamed: 0,Description,JobTitle
1,"[avis, budget, group, actionpacked, highenergy...",Automotive Technician
2,"[position, licensed, practical, nurse, registe...",Nurse Practitioner (NP)
3,"[service, technician, every, employee, starlin...",Automotive Technician
4,"[avis, budget, group, actionpacked, highenergy...",Automotive Technician
5,"[job, purpose, nurse, practitioner, provides, ...",Nurse Practitioner (NP)


In [34]:
# save processed data
# A forward slash is a valid separator on every OS. The earlier backslash
# form relied on an invalid string escape and, outside Windows, wrote a file
# whose name contained the backslash instead of writing into datasets/.
# NOTE(review): the Description column appears to hold Python lists, which
# CSV stores as their text repr; confirm that downstream readers parse them back.

processed_data.to_csv('datasets/processed_data.csv', index=False)