In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import nltk
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words
from wordcloud import WordCloud
from tqdm import tqdm

# Download necessary NLTK packages
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
df = pd.read_csv('../dataset/DataSetKaggle/Resume/Resume.csv')
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [3]:
df.drop(columns = ['Resume_html'], inplace = True)
df.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,33176873,HR DIRECTOR Summary Over 2...,HR
3,27018550,HR SPECIALIST Summary Dedica...,HR
4,17812897,HR MANAGER Skill Highlights ...,HR


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2484 non-null   int64 
 1   Resume_str  2484 non-null   object
 2   Category    2484 non-null   object
dtypes: int64(1), object(2)
memory usage: 58.3+ KB


In [5]:
# From the data visualization, we noticed that there is an empty resume in the dataset
df.select_dtypes(include='object').apply(lambda col: col.str.strip() == '').sum()

Resume_str    1
Category      0
dtype: int64

In [6]:
df = df[df['Resume_str'].str.strip() != '']

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2483 entries, 0 to 2483
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2483 non-null   int64 
 1   Resume_str  2483 non-null   object
 2   Category    2483 non-null   object
dtypes: int64(1), object(2)
memory usage: 77.6+ KB


## Preprocessing

In [8]:
# Get the list of valid English words
english_words = set(words.words())

# Get a list of stop word
stop_words = set(nltk.corpus.stopwords.words('english'))

# Initialize the stemmer
stemmer = PorterStemmer()

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess_text(text, use_lemmatization=True, use_stemming=False):
    """
    Preprocesses the text: 
    - Converts to lowercase
    - Removes punctuation
    - Remove underscore
    - Tokenizes
    - Removes tokens with numbers
    - Removes stopwords
    - Applies lemmatization or stemming based on parameters
    """
    
    text = text.lower()                         # Convert to lowercase
    text = re.sub(r'[^\w\s]', ' ', text)        # Remove all punctuation  
    text = re.sub(r'_', ' ', text)              # Remove underscore
    text = re.sub('\s+', ' ', text)             # Remove multiple spaces
    
    tokens = word_tokenize(text) # Tokenize the text

    processed_tokens = []
    
    for token in tokens:
        # Remove words containing numbers
        if any(char.isdigit() for char in token):
            continue

        # Dictionary filtering
        # if token not in english_words:
        #    continue

        # Remove stopwords
        if token in stop_words:
            continue

        if use_lemmatization:
            token = lemmatizer.lemmatize(token)
        elif use_stemming:
            token = stemmer.stem(token)

        processed_tokens.append(token)
    
    return ' '.join(processed_tokens)


In [10]:
def preprocess_text_with_progress(df, column_name, **kwargs):

    tqdm.pandas(desc="Processing Resumes")  # Enable tqdm for Pandas
    return df[column_name].progress_apply(lambda x: preprocess_text(x, **kwargs))

In [11]:
parameter_sets = [
    {"use_lemmatization": True, "use_stemming": True},
    {"use_lemmatization": True, "use_stemming": False},
    {"use_lemmatization": False, "use_stemming": True}
]

In [12]:
for params in parameter_sets:
    # Create a copy of the original DataFrame
    df_copy = df.copy()

    # Apply preprocessing with the current parameter combination
    df_copy['Resume_str'] = preprocess_text_with_progress(df_copy, 'Resume_str', **params)

    # Define the folder where CSV files will be saved
    output_folder = "processed_data"
    
    # Ensure the folder exists (create it if necessary)
    os.makedirs(output_folder, exist_ok=True)

    # Generate the full path for saving the file
    if params["use_lemmatization"] and not params["use_stemming"]:
        filename = os.path.join(output_folder, "Resume_proc_lemm.csv")
    elif params["use_lemmatization"] and params["use_stemming"]: 
        filename = os.path.join(output_folder, "Resume_proc_lemm_stem.csv")
    else:
        filename = os.path.join(output_folder, "Resume_proc_stem.csv")
    
    # Save the DataFrame to the specified folder
    df_copy.to_csv(filename, index=False)

    print(f"Processed DataFrame saved as: {filename}")

Processing Resumes: 100%|██████████| 2483/2483 [00:25<00:00, 95.51it/s] 


Processed DataFrame saved as: processed_data\Resume_proc_lemm_stem.csv


Processing Resumes: 100%|██████████| 2483/2483 [00:22<00:00, 108.58it/s]


Processed DataFrame saved as: processed_data\Resume_proc_lemm.csv


Processing Resumes: 100%|██████████| 2483/2483 [00:47<00:00, 51.80it/s]


Processed DataFrame saved as: processed_data\Resume_proc_stem.csv


### Testing different type of preprocessing

In [None]:
# Get the list of all CSV files in the folder
input_folder = "processed_data"
csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

# Loop through the files and create a DataFrame for each one
for file in csv_files:

    print("\n\n")
    print(file)

    file_path = os.path.join(input_folder, file)
    
    # Load the CSV into a DataFrame
    df = pd.read_csv(file_path)

    count_vect = CountVectorizer(
        lowercase=True,
        binary = False,
        # stop_words = list(stop_words),      # TODO could be not useful, we already remove stop words in preprocess_text
        # ngram_range=(1,2),               # Considers unigrams and bigrams
        # max_df = 0.85,                    # Ignores words appearing in more than 85% of documents (too common)
        # min_df = 2,                       # Keeps words appearing in at least 2 documents (filters rare words)
    )
    
    countvectorizer_train = count_vect.fit_transform(df['Resume_str']).astype(float)


    


    print(len(count_vect.vocabulary_))

    tokens = count_vect.get_feature_names_out()
    print(tokens[:100])
    print(tokens[-100:])
    




Resume_proc_lemm.csv
33524
['avert' 'averted' 'averting' 'avery' 'aveta' 'avetis' 'avett' 'avg'
 'avian' 'aviara' 'aviatiaon' 'aviation' 'aviationaviation' 'aviator'
 'avid' 'avide' 'avila' 'avimark' 'avinashilingam' 'avionic' 'avionics'
 'avis' 'avisena' 'aviso' 'aviva' 'avl' 'avma' 'avnet' 'avo' 'avogadro'
 'avoid' 'avoidable' 'avoidance' 'avoided' 'avoiding' 'avon' 'avondale'
 'avp' 'avt' 'avum' 'awaiting' 'award' 'awarded' 'awardee' 'awardees'
 'awarding' 'awardminor' 'aware' 'awareness' 'away' 'awd' 'awe' 'aweber'
 'awesome' 'awf' 'awk' 'awkward' 'awol' 'awps' 'awr' 'aws' 'awse' 'awt'
 'awv' 'ax' 'axc' 'axial' 'axiom' 'axios' 'axis' 'axle' 'axsun' 'axtel'
 'axure' 'ayala' 'ayanniyi' 'ayoola' 'ayoub' 'ayp' 'ayrep' 'ayres'
 'ayroll' 'ayso' 'ayuda' 'az' 'azccr' 'azhar' 'azimuth' 'aziz' 'aznn'
 'azrouel' 'azteca' 'azure' 'azz' 'azzure' 'años' 'ba' 'baa' 'baan' 'bab']



Resume_proc_lemm_stem.csv
33524
['avert' 'averted' 'averting' 'avery' 'aveta' 'avetis' 'avett' 'avg'
 'avian' 'av