In [1]:
import pandas as pd
import re
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Download the NLTK resources this notebook relies on:
#   wordnet   -> data for WordNetLemmatizer
#   stopwords -> English stop-word list
#   punkt_tab -> tokenizer models used by word_tokenize
#   words     -> not referenced in the cells shown here; presumably needed elsewhere (TODO confirm)
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Show the directories NLTK searches for downloaded data packages
nltk.data.path

['C:\\Users\\filip/nltk_data',
 'c:\\Users\\filip\\anaconda3\\envs\\DMML\\nltk_data',
 'c:\\Users\\filip\\anaconda3\\envs\\DMML\\share\\nltk_data',
 'c:\\Users\\filip\\anaconda3\\envs\\DMML\\lib\\nltk_data',
 'C:\\Users\\filip\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']

In [3]:
# Load the resume dataset; the path is relative to the notebook's working directory
df = pd.read_csv('../dataset/Resumes/data/Resume.csv')
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


We can see that the dataset consists of four columns: ID, Resume_str, Resume_html, and Category. For our purposes, the column containing the HTML markup (Resume_html) is not needed, because the plain-text version (Resume_str) is sufficient. Therefore, we will remove it.

In [4]:
# Drop the raw-HTML column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['Resume_html'], errors='ignore')
df.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,33176873,HR DIRECTOR Summary Over 2...,HR
3,27018550,HR SPECIALIST Summary Dedica...,HR
4,17812897,HR MANAGER Skill Highlights ...,HR


In [5]:
# Inspect column dtypes and non-null counts of the remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2484 non-null   int64 
 1   Resume_str  2484 non-null   object
 2   Category    2484 non-null   object
dtypes: int64(1), object(2)
memory usage: 58.3+ KB


In [6]:
# The data visualization notebook (Visualization.ipynb) revealed an empty resume in the dataset.
# Count, per text column, how many values are blank once surrounding whitespace is stripped.
(
    df.select_dtypes(include='object')
      .apply(lambda text_col: text_col.str.strip().eq(''))
      .sum()
)

Resume_str    1
Category      0
dtype: int64

In [7]:
# Keep only the rows whose resume text is non-blank after stripping whitespace
df = df.loc[df['Resume_str'].str.strip().ne('')]

In [8]:
# Re-inspect the frame after filtering out blank resumes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2483 entries, 0 to 2483
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2483 non-null   int64 
 1   Resume_str  2483 non-null   object
 2   Category    2483 non-null   object
dtypes: int64(1), object(2)
memory usage: 77.6+ KB


In [9]:
# Number of whitespace-separated words in each resume
word_count = df['Resume_str'].map(lambda resume: len(str(resume).split()))

desc = word_count.describe()
q1, q3 = word_count.quantile(0.25), word_count.quantile(0.75)
iqr = q3 - q1

# Emit the whole report with a single print call, one statistic per line
print("\n".join([
    "Statistics of Word Count in Resumes:",
    f"Minimum      : {desc['min']}",
    f"1st Quartile : {q1}",
    f"Median       : {desc['50%']}",
    f"3rd Quartile : {q3}",
    f"Maximum      : {desc['max']}",
    f"Mean         : {desc['mean']:.2f}",
    f"IQR (Q3 - Q1): {iqr}",
]))

Statistics of Word Count in Resumes:
Minimum      : 113.0
1st Quartile : 651.0
Median       : 757.0
3rd Quartile : 933.0
Maximum      : 5190.0
Mean         : 811.65
IQR (Q3 - Q1): 282.0


### PREPROCESSING

In [10]:
# Build the set of English stop words (a set, so membership tests in preprocess_text are fast)
stop_words = set(nltk.corpus.stopwords.words('english'))

# Initialize the stemmer (applied by preprocess_text when use_stemming=True)
stemmer = PorterStemmer()

# Initialize the lemmatizer (applied by preprocess_text when use_lemmatization=True)
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess_text(text, remove_stopword=True, use_lemmatization=True, use_stemming=False):
    """
    Normalize a raw text into a single space-separated string of cleaned tokens.

    Steps, in order:
    - Converts to lowercase
    - Replaces punctuation and underscores with spaces
    - Collapses runs of whitespace into one space
    - Tokenizes with word_tokenize
    - Removes tokens containing a digit
    - Removes stopwords (when remove_stopword is True)
    - Lemmatizes (when use_lemmatization is True), then stems (when
      use_stemming is True); with both flags set, the stemmer is applied to
      the already-lemmatized token

    Relies on the notebook-level stop_words, lemmatizer and stemmer objects.

    Parameters
    ----------
    text : str
        Text to clean. None and float NaN are treated as empty text; any
        other non-string value is converted with str().
    remove_stopword : bool, default True
        Drop tokens found in stop_words.
    use_lemmatization : bool, default True
        Apply lemmatizer.lemmatize to each kept token.
    use_stemming : bool, default False
        Apply stemmer.stem to each kept token (after lemmatization, if enabled).

    Returns
    -------
    str
        The kept tokens joined by single spaces ('' when nothing is left).
    """

    # Missing values (None / NaN) have no .lower(); treat them as empty text
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()                  # Convert to lowercase
    # Replace punctuation and underscores with a space. \w matches Unicode word
    # characters (letters, digits, underscore), so "_" must be listed explicitly.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Raw string on purpose: '\s' in a plain literal is an invalid escape sequence
    text = re.sub(r'\s+', ' ', text)          # Collapse multiple whitespace characters

    tokens = word_tokenize(text)              # Tokenize the text

    processed_tokens = []

    for token in tokens:
        # Remove words containing numbers
        if any(char.isdigit() for char in token):
            continue

        # Remove stopwords based on parameter
        if remove_stopword and (token in stop_words):
            continue

        # Apply lemmatization and/or stemming based on parameters
        # (lemmatization first, so stemming sees the lemmatized form)
        if use_lemmatization:
            token = lemmatizer.lemmatize(token)
        if use_stemming:
            token = stemmer.stem(token)

        processed_tokens.append(token)

    return ' '.join(processed_tokens)


In [12]:
def preprocess_text_with_progress(df, column_name, **kwargs):
    """
    Run preprocess_text over one text column while showing a tqdm progress bar.

    Extra keyword arguments are forwarded unchanged to preprocess_text.
    Returns the Series produced by progress_apply on df[column_name].
    """

    def _clean(value):
        # Per-row worker: forwards the caller's options to preprocess_text
        return preprocess_text(value, **kwargs)

    # Registers progress_apply on pandas objects, labelled with the given description
    tqdm.pandas(desc="Processing Resumes")
    target_column = df[column_name]
    return target_column.progress_apply(_clean)

In [13]:
# One keyword-argument dict per preprocessing configuration to export.
# Each tuple below is (remove_stopword, use_lemmatization, use_stemming).
parameter_sets = [
    {"remove_stopword": stop, "use_lemmatization": lemm, "use_stemming": stem}
    for stop, lemm, stem in (
        (True, True, True),
        (True, True, False),
        (True, False, True),
        (False, False, False),
        (False, True, False),
    )
]

In [14]:
# Root directory that receives one sub-folder per preprocessing configuration
output_folder = "processed_data"
os.makedirs(output_folder, exist_ok=True)

# Stratified 80/20 split, done once so every configuration shares the same rows
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Category'],
)

for params in parameter_sets:
    # Work on copies so the unprocessed split is reused by the next configuration
    df_train_copy = df_train.copy()
    df_test_copy = df_test.copy()
    df_train_copy['Resume_str'] = preprocess_text_with_progress(df_train_copy, 'Resume_str', **params)
    df_test_copy['Resume_str'] = preprocess_text_with_progress(df_test_copy, 'Resume_str', **params)

    # Folder name = 'Resume' plus one suffix per enabled option, in a fixed order
    file_name = 'Resume' + ''.join(
        suffix
        for flag, suffix in (
            ("remove_stopword", '_removeStopword'),
            ("use_lemmatization", '_useLemm'),
            ("use_stemming", '_useStemm'),
        )
        if params[flag]
    )

    folder_path = os.path.join(output_folder, file_name)
    os.makedirs(folder_path, exist_ok=True)

    # Write the train split first, then the test split, without the index column
    for csv_name, frame in (('train.csv', df_train_copy), ('test.csv', df_test_copy)):
        path = os.path.join(folder_path, csv_name)
        frame.to_csv(path, index=False)

    print(f"Processed DataFrame saved as: {file_name}")

Processing Resumes:   0%|          | 0/1986 [00:00<?, ?it/s]

Processing Resumes: 100%|██████████| 1986/1986 [00:36<00:00, 54.44it/s]
Processing Resumes: 100%|██████████| 497/497 [00:07<00:00, 67.76it/s]


Processed DataFrame saved as: Resume_removeStopword_useLemm_useStemm


Processing Resumes: 100%|██████████| 1986/1986 [00:11<00:00, 166.68it/s]
Processing Resumes: 100%|██████████| 497/497 [00:02<00:00, 186.52it/s]


Processed DataFrame saved as: Resume_removeStopword_useLemm


Processing Resumes: 100%|██████████| 1986/1986 [00:25<00:00, 79.33it/s]
Processing Resumes: 100%|██████████| 497/497 [00:07<00:00, 70.55it/s]


Processed DataFrame saved as: Resume_removeStopword_useStemm


Processing Resumes: 100%|██████████| 1986/1986 [00:07<00:00, 260.77it/s]
Processing Resumes: 100%|██████████| 497/497 [00:01<00:00, 277.47it/s]


Processed DataFrame saved as: Resume


Processing Resumes: 100%|██████████| 1986/1986 [00:16<00:00, 117.40it/s]
Processing Resumes: 100%|██████████| 497/497 [00:03<00:00, 125.58it/s]


Processed DataFrame saved as: Resume_useLemm
