In [1]:
import pandas as pd
import re
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Fetch the NLTK resources used by this notebook (one download call per resource).
# - 'stopwords' backs nltk.corpus.stopwords, used to build the stop-word set
# - 'wordnet' is presumably the data behind WordNetLemmatizer -- TODO confirm
# - 'punkt_tab' is presumably the model behind word_tokenize -- TODO confirm
# - 'words' is not referenced in the visible cells -- NOTE(review): confirm it is still needed
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Load the raw job-postings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/JobDescriptions/JobDescription.csv')
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


In [3]:
# Narrow the frame to the posting id, its title and its description text.
df = df.loc[:, ['job_id', 'title', 'description']]
df.head()

Unnamed: 0,job_id,title,description
0,921716,Marketing Coordinator,Job descriptionA leading real estate firm in N...
1,1829192,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ..."
2,10998357,Assitant Restaurant Manager,The National Exemplar is accepting application...
3,23221523,Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...
4,35982263,Service Technician,Looking for HVAC service tech with experience ...


In [4]:
# Summarize dtypes and per-column non-null counts of the narrowed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   job_id       123849 non-null  int64 
 1   title        123849 non-null  object
 2   description  123842 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.8+ MB


In [5]:
# Count missing values per column.
df.isna().sum()

job_id         0
title          0
description    7
dtype: int64

In [6]:
# Keep only the rows whose description is present.
has_description = df['description'].notna()
df = df[has_description]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123842 entries, 0 to 123848
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   job_id       123842 non-null  int64 
 1   title        123842 non-null  object
 2   description  123842 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.8+ MB


In [7]:
def _word_count(value):
    """Number of whitespace-separated words in the string form of `value`."""
    return len(str(value).split())

# Keep only descriptions that are longer than 20 words.
description_lengths = df['description'].apply(_word_count)
df = df[description_lengths > 20]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123613 entries, 0 to 123848
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   job_id       123613 non-null  int64 
 1   title        123613 non-null  object
 2   description  123613 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.8+ MB


### PREPROCESSING

In [8]:
# Build the English stop-word collection once, as a set, for the membership test in preprocess_text
stop_words = set(nltk.corpus.stopwords.words('english'))

# Shared Porter stemmer instance; preprocess_text uses it when use_stemming is enabled
stemmer = PorterStemmer()

# Shared WordNet lemmatizer instance; preprocess_text uses it when use_lemmatization is enabled
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess_text(text, remove_stopword=True, use_lemmatization=True, use_stemming=False):
    """
    Normalize a raw text string into a single space-separated string of cleaned tokens.

    Steps, in order:
    - Converts to lowercase
    - Replaces punctuation with spaces
    - Replaces underscores with spaces
    - Collapses runs of whitespace into a single space
    - Tokenizes with word_tokenize
    - Drops tokens that contain any digit
    - Drops stopwords when remove_stopword is set
    - Applies lemmatization and/or stemming based on parameters
      (when both are enabled, the token is lemmatized first and then stemmed)

    Relies on the notebook-level `stop_words`, `lemmatizer` and `stemmer` objects.

    Args:
        text: The string to clean. Must support `.lower()`; missing values are
            expected to have been removed by the caller.
        remove_stopword: If True, tokens found in `stop_words` are discarded.
        use_lemmatization: If True, each kept token goes through
            `lemmatizer.lemmatize` (called without a POS argument, so the
            lemmatizer's default applies).
        use_stemming: If True, each kept token goes through `stemmer.stem`.

    Returns:
        The processed tokens joined with single spaces (an empty string when
        no token survives).
    """

    text = text.lower()                     # Convert to lowercase
    text = re.sub(r'[^\w\s]', ' ', text)    # Replace punctuation (anything that is not \w or whitespace)
    text = re.sub(r'_', ' ', text)          # Underscore is part of \w, so it has to be replaced separately
    # Raw string: a plain '\s+' literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    text = re.sub(r'\s+', ' ', text)        # Collapse multiple whitespace characters

    tokens = word_tokenize(text)            # Tokenize the text

    processed_tokens = []

    for token in tokens:
        # Remove words containing numbers
        if any(char.isdigit() for char in token):
            continue

        # Remove stopwords based on parameter (checked on the raw token,
        # i.e. before lemmatization/stemming can alter it)
        if remove_stopword and (token in stop_words):
            continue

        # Applies lemmatization and/or stemming based on parameters
        if use_lemmatization:
            token = lemmatizer.lemmatize(token)
        if use_stemming:
            token = stemmer.stem(token)

        processed_tokens.append(token)

    return ' '.join(processed_tokens)


In [10]:
def preprocess_text_with_progress(df, column_name, **kwargs):
    """
    Run preprocess_text over one column of a DataFrame while showing a progress bar.

    Any extra keyword arguments are forwarded unchanged to preprocess_text.
    Returns the result of applying it to every value of df[column_name].
    """
    # Registering tqdm with pandas so that progress_apply can be called below
    tqdm.pandas(desc="Processing Resumes")

    def _clean(value):
        return preprocess_text(value, **kwargs)

    column = df[column_name]
    return column.progress_apply(_clean)

In [11]:
# One entry per preprocessing variant to export; the keys match the keyword
# arguments of preprocess_text.
parameter_sets = [
    dict(remove_stopword=True, use_lemmatization=True, use_stemming=True),
    dict(remove_stopword=True, use_lemmatization=True, use_stemming=False),
    dict(remove_stopword=True, use_lemmatization=False, use_stemming=True),
    dict(remove_stopword=False, use_lemmatization=False, use_stemming=False),
    dict(remove_stopword=False, use_lemmatization=True, use_stemming=False),
]

In [12]:
output_folder = "processed_data"
os.makedirs(output_folder, exist_ok=True)

# Each enabled option contributes one suffix to the output file name, in this order.
suffix_for_option = (
    ("remove_stopword", '_stopword'),
    ("use_lemmatization", '_lemm'),
    ("use_stemming", '_stemm'),
)

for params in parameter_sets:
    # Work on a copy so `df` keeps the unprocessed descriptions for the next variant
    df_copy = df.copy()
    df_copy['description'] = preprocess_text_with_progress(df_copy, 'description', **params)

    # Assemble the file name from the options that are switched on
    suffixes = ''.join(suffix for option, suffix in suffix_for_option if params[option])
    file_name = 'JobDescription' + suffixes + '.csv'
    path = os.path.join(output_folder, file_name)

    # Write the processed frame into the output folder, without the index column
    df_copy.to_csv(path, index=False)

    print(f"Processed DataFrame saved as: {file_name}")

Processing Resumes:   0%|          | 0/123613 [00:00<?, ?it/s]

Processing Resumes: 100%|██████████| 123613/123613 [16:08<00:00, 127.60it/s]


Processed DataFrame saved as: JobDescription_stopword_lemm_stemm.csv


Processing Resumes: 100%|██████████| 123613/123613 [05:59<00:00, 344.19it/s]


Processed DataFrame saved as: JobDescription_stopword_lemm.csv


Processing Resumes: 100%|██████████| 123613/123613 [12:46<00:00, 161.18it/s]


Processed DataFrame saved as: JobDescription_stopword_stemm.csv


Processing Resumes: 100%|██████████| 123613/123613 [03:18<00:00, 621.93it/s]


Processed DataFrame saved as: JobDescription.csv


Processing Resumes: 100%|██████████| 123613/123613 [06:56<00:00, 297.09it/s]


Processed DataFrame saved as: JobDescription_lemm.csv
