In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import re
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from tqdm import tqdm

from wordcloud import WordCloud

# Fetch the NLTK resources this notebook relies on (already-installed packages are
# reported as up-to-date and left alone):
#   wordnet   -> data behind WordNetLemmatizer
#   stopwords -> English stop-word list
#   words     -> English vocabulary list
#   punkt_tab -> tokenizer data for word_tokenize; presumably supersedes the older
#                'punkt' package kept below for reference — TODO confirm for the
#                installed NLTK version
# nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Display the directories NLTK searches for downloaded resources (environment check).
nltk.data.path

['C:\\Users\\andre/nltk_data',
 'c:\\Users\\andre\\AppData\\Local\\Programs\\Python\\Python310\\nltk_data',
 'c:\\Users\\andre\\AppData\\Local\\Programs\\Python\\Python310\\share\\nltk_data',
 'c:\\Users\\andre\\AppData\\Local\\Programs\\Python\\Python310\\lib\\nltk_data',
 'C:\\Users\\andre\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']

In [3]:
# Load the raw job-postings table; the path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/postings/postings.csv')
# Preview the first rows.
df.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


In [4]:
# Narrow the frame to the job identifier and its text fields; all other columns are discarded.
df = df[['job_id', 'title', 'description', 'skills_desc']]
df.head()

Unnamed: 0,job_id,title,description,skills_desc
0,921716,Marketing Coordinator,Job descriptionA leading real estate firm in N...,Requirements: \n\nWe are seeking a College or ...
1,1829192,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",
2,10998357,Assitant Restaurant Manager,The National Exemplar is accepting application...,We are currently accepting resumes for FOH - A...
3,23221523,Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,This position requires a baseline understandin...
4,35982263,Service Technician,Looking for HVAC service tech with experience ...,


In [5]:
# Inspect dtypes and non-null counts of the selected columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   job_id       123849 non-null  int64 
 1   title        123849 non-null  object
 2   description  123842 non-null  object
 3   skills_desc  2439 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.8+ MB


In [6]:
# Count missing values per column.
df.isnull().sum()

job_id              0
title               0
description         7
skills_desc    121410
dtype: int64

In [7]:
# Drop rows whose description is missing (other columns may still contain NaN).
df = df.dropna(subset=['description'])
# Re-check the row count and null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123842 entries, 0 to 123848
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   job_id       123842 non-null  int64 
 1   title        123842 non-null  object
 2   description  123842 non-null  object
 3   skills_desc  2439 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.7+ MB


In [8]:
# Keep only postings whose description length, measured in whitespace-separated
# words, lies strictly between the two bounds below.
MIN_WORDS = 300  # exclusive lower bound
MAX_WORDS = 600  # exclusive upper bound

# 1. Count the words of every description.
#    `assign` returns a new frame, so the column is never written into a frame
#    that was itself derived from earlier slicing/filtering.
df = df.assign(word_count=df['description'].map(lambda text: len(str(text).split())))

# 2. Discard descriptions that are too short or too long.
df = df[(df['word_count'] > MIN_WORDS) & (df['word_count'] < MAX_WORDS)]

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49469 entries, 0 to 123848
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job_id       49469 non-null  int64 
 1   title        49469 non-null  object
 2   description  49469 non-null  object
 3   skills_desc  1137 non-null   object
 4   word_count   49469 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


### Preprocessing

In [9]:
# Shared NLP resources, built once at module level.

# Token normalisers: WordNet lemmatizer and Porter stemmer.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# English stop words, stored as a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Vocabulary of English words from the NLTK `words` corpus, also stored as a set.
english_words = set(words.words())

In [10]:
def preprocess_text(text, use_lemmatization=True, use_stemming=False):
    """
    Normalise a raw text into a space-separated string of cleaned tokens.

    Steps, in order:
    - convert to lowercase
    - replace every run of punctuation, underscores and whitespace with one space
    - tokenize with NLTK's ``word_tokenize``
    - drop tokens that contain a digit
    - drop English stop words
    - optionally lemmatize and/or stem each remaining token

    Parameters
    ----------
    text : str
        Raw text. A non-string value (e.g. None or the float NaN of a missing
        DataFrame cell) yields an empty string instead of raising.
    use_lemmatization : bool, default True
        Apply the module-level ``lemmatizer`` to each kept token.
    use_stemming : bool, default False
        Apply the module-level ``stemmer`` to each kept token. When both flags
        are set, stemming runs on the already lemmatized token.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' when nothing survives).
    """
    # Missing cells arrive as NaN/None; there is no text to process.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # One pass replaces the former three substitutions: \W covers punctuation and
    # whitespace, and '_' is listed explicitly because \w treats it as a word
    # character. The raw string avoids the invalid '\s' escape of the old code.
    text = re.sub(r'[\W_]+', ' ', text)

    processed_tokens = []

    for token in word_tokenize(text):
        # Skip tokens containing any digit (e.g. "401k", "2024").
        if any(char.isdigit() for char in token):
            continue

        # Skip stop words.
        if token in stop_words:
            continue

        if use_lemmatization:
            token = lemmatizer.lemmatize(token)
        if use_stemming:
            token = stemmer.stem(token)

        processed_tokens.append(token)

    return ' '.join(processed_tokens)


In [11]:
def preprocess_text_with_progress(df, column_name, **kwargs):
    """
    Run `preprocess_text` over one DataFrame column while showing a tqdm progress bar.

    Args:
        df: DataFrame holding the text column.
        column_name: Name of the column to preprocess.
        **kwargs: Forwarded unchanged to `preprocess_text` for every value.

    Returns:
        The Series returned by `progress_apply`, one processed value per row.
    """
    def _clean(value):
        return preprocess_text(value, **kwargs)

    # Register `progress_apply` on pandas objects, labelled with the text below.
    tqdm.pandas(desc="Processing Resumes")
    text_column = df[column_name]
    return text_column.progress_apply(_clean)

In [12]:
# Preprocessing variants to run; each entry is unpacked as keyword arguments
# for the preprocessing call. Only the lemmatization-only variant is enabled.
# Disabled alternatives:
#   dict(use_lemmatization=True, use_stemming=True)
#   dict(use_lemmatization=False, use_stemming=True)
parameter_sets = [
    dict(use_lemmatization=True, use_stemming=False),
]

In [13]:
# Folder that receives the processed CSV files. It does not depend on the loop
# variable, so it is created once, before the slow preprocessing starts.
output_folder = "processed_data"
os.makedirs(output_folder, exist_ok=True)

for params in parameter_sets:
    # Keys missing from a parameter set fall back to preprocess_text's own
    # defaults, so the file name always reflects what was actually applied.
    use_lemmatization = params.get("use_lemmatization", True)
    use_stemming = params.get("use_stemming", False)

    if use_lemmatization and use_stemming:
        suffix = "lemm_stem"
    elif use_lemmatization:
        suffix = "lemm"
    elif use_stemming:
        suffix = "stem"
    else:
        # Neither normaliser ran; this case used to be mislabelled as "stem".
        suffix = "plain"
    filename = os.path.join(output_folder, f"Job_proc_{suffix}.csv")

    # Work on a copy so the source frame keeps its raw descriptions.
    df_copy = df.copy()

    # Apply preprocessing with the current parameter combination.
    df_copy['description'] = preprocess_text_with_progress(df_copy, 'description', **params)

    # Persist the processed frame without the index column.
    df_copy.to_csv(filename, index=False)

    print(f"Processed DataFrame saved as: {filename}")

Processing Resumes: 100%|██████████| 49469/49469 [15:12<00:00, 54.21it/s] 


Processed DataFrame saved as: processed_data\Job_proc_lemm.csv
