In [1]:
import pandas as pd
import numpy as np

import nltk

# Fetch every NLTK resource this notebook uses before any text processing runs:
#   - stopwords : stop-word list used by the tokenize/lemmatize helpers
#   - punkt     : tokenizer models used by word_tokenize on older NLTK releases
#   - punkt_tab : tokenizer tables required by word_tokenize on recent NLTK releases
#   - wordnet   : corpus used by WordNetLemmatizer
# nltk.download() reports "already up-to-date" for resources that are present,
# so re-running this cell is cheap.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

# for tokenization
from nltk.tokenize import word_tokenize, sent_tokenize

# for POS tagging
from nltk import pos_tag

# for lemmatization
from nltk.stem import WordNetLemmatizer

# for stop words
from nltk.corpus import stopwords

# for punctuations
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the cleaned postings and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier export -- confirm against the producer).
cleaned_df = pd.read_excel("Data/gerald_data_cleaned.xlsx").drop(columns='Unnamed: 0')

# Show the frame dimensions, then preview the first rows as the cell output.
print(cleaned_df.shape)
cleaned_df.head()

(17880, 18)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",marketing,,we are food52 and we have created a groundbrea...,food52 a fast growing james beard award winnin...,experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",success,,90 seconds the worlds cloud video production s...,organised focused vibrant awesome do you have ...,what we expect from you your key responsibilit...,what you will get from usthrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,valor services provides workforce solutions th...,our client located in houston is actively seek...,implement pre commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",sales,,our passion for improving quality of life thro...,the company esri environmental systems researc...,education bachelor s or master s in gis busine...,our culture is anything but corporate we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,spotsource solutions llc is a global human cap...,job title itemization review managerlocation f...,qualifications right now license in the state ...,full benefits offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
# Free-text columns that the tokenization / lemmatization steps operate on.
textual_columns = [
    "company_profile",
    "description",
    "requirements",
    "benefits",
]

In [4]:
# Load the English stopwords once; set membership keeps the per-token check cheap.
stop_words = set(stopwords.words("english"))


def tokenize_text(text):
    """Split ``text`` into word tokens and drop English stop words.

    Parameters
    ----------
    text : object
        Cell value to tokenize. Anything that is not a ``str`` (e.g. NaN for
        a missing value) yields an empty list.

    Returns
    -------
    list of str
        Tokens from ``word_tokenize`` whose lower-cased form is not in
        ``stop_words``. Tokens are returned with their original casing.

    Raises
    ------
    LookupError
        If a required NLTK resource is missing. This is a setup problem, not a
        data problem: swallowing it would turn every row into ``[]`` and
        silently empty the derived columns, so it is re-raised.
    """
    if not isinstance(text, str):
        return []  # Missing / non-text values have nothing to tokenize

    try:
        tokens = word_tokenize(text)
    except LookupError:
        # Missing tokenizer data affects every row; fail loudly instead of
        # producing a column full of empty lists.
        raise
    except Exception as e:
        # Best-effort for any other per-row failure: report it and move on.
        print(f"Error tokenizing: {e}")
        return []

    # Compare case-insensitively because the stop-word list is lower-case.
    return [token for token in tokens if token.lower() not in stop_words]


In [5]:
# Add one 'tokenized_<column>' column per free-text column.
for col in textual_columns:
    cleaned_df[f"tokenized_{col}"] = cleaned_df[col].apply(tokenize_text)

# Preview the first five rows, including the new token columns.
cleaned_df.head(5)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,employment_type,required_experience,required_education,industry,function,fraudulent,tokenized_company_profile,tokenized_description,tokenized_requirements,tokenized_benefits
0,1,Marketing Intern,"US, NY, New York",marketing,,we are food52 and we have created a groundbrea...,food52 a fast growing james beard award winnin...,experience with content management systems a m...,,0,...,Other,Internship,,,Marketing,0,"[food52, created, groundbreaking, award, winni...","[food52, fast, growing, james, beard, award, w...","[experience, content, management, systems, maj...",[]
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",success,,90 seconds the worlds cloud video production s...,organised focused vibrant awesome do you have ...,what we expect from you your key responsibilit...,what you will get from usthrough being part of...,0,...,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,"[90, seconds, worlds, cloud, video, production...","[organised, focused, vibrant, awesome, passion...","[expect, key, responsibility, communicate, cli...","[get, usthrough, part, 90, seconds, team, gain..."
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,valor services provides workforce solutions th...,our client located in houston is actively seek...,implement pre commissioning and commissioning ...,,0,...,,,,,,0,"[valor, services, provides, workforce, solutio...","[client, located, houston, actively, seeking, ...","[implement, pre, commissioning, commissioning,...",[]
3,4,Account Executive - Washington DC,"US, DC, Washington",sales,,our passion for improving quality of life thro...,the company esri environmental systems researc...,education bachelor s or master s in gis busine...,our culture is anything but corporate we have ...,0,...,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,"[passion, improving, quality, life, geography,...","[company, esri, environmental, systems, resear...","[education, bachelor, master, gis, business, a...","[culture, anything, corporate, collaborative, ..."
4,5,Bill Review Manager,"US, FL, Fort Worth",,,spotsource solutions llc is a global human cap...,job title itemization review managerlocation f...,qualifications right now license in the state ...,full benefits offered,0,...,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,"[spotsource, solutions, llc, global, human, ca...","[job, title, itemization, review, managerlocat...","[qualifications, right, license, state, texasd...","[full, benefits, offered]"


In [6]:
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what remains.

    Parameters
    ----------
    text : object
        Cell value to process. Anything that is not a ``str`` (e.g. NaN for
        a missing value) yields an empty list.

    Returns
    -------
    list of str
        ``lemmatizer.lemmatize(token)`` for every token whose lower-cased
        form is not in ``stop_words``. No part-of-speech tag is passed, so
        the lemmatizer's default POS applies to every token.

    Raises
    ------
    LookupError
        If a required NLTK resource (tokenizer data or the WordNet corpus) is
        missing. This is a setup problem that affects every row, so it is
        re-raised rather than converted into empty lists.
    """
    if not isinstance(text, str):
        return []  # Missing / non-text values have nothing to lemmatize

    try:
        tokens = word_tokenize(text)
        # Filter stop words first so they are never sent to the lemmatizer.
        return [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token.lower() not in stop_words
        ]
    except LookupError:
        # Missing NLTK data would otherwise silently empty the whole column.
        raise
    except Exception as e:
        # Best-effort for any other per-row failure: report it and move on.
        print(f"Error lemmatizing: {e}")
        return []


In [7]:
# Make sure the WordNet corpus is available for the lemmatizer, then add one
# 'lemmatized_<column>' column per free-text column.
nltk.download('wordnet')
for col in textual_columns:
    cleaned_df[f"lemmatized_{col}"] = cleaned_df[col].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
import re
# Replace missing values in the free-text columns with empty strings.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on recent pandas and
# does not modify the frame once Copy-on-Write is enabled.
cleaned_df[textual_columns] = cleaned_df[textual_columns].fillna("")

def transform_text(text):
    """Replace the underscore after ``phone``, ``email`` or ``url`` with a space.

    Every occurrence of ``phone_``, ``email_`` or ``url_`` in ``text`` becomes
    ``phone ``, ``email `` or ``url `` respectively; all other characters are
    returned unchanged.
    """
    placeholder_prefix = re.compile(r'(phone|email|url)_')
    # \1 re-inserts the matched keyword; only the trailing underscore changes.
    return placeholder_prefix.sub(r'\1 ', text)

# Rewrite the phone_/email_/url_ prefixes in the 'benefits' column;
# null entries are passed through unchanged.
cleaned_df['benefits'] = cleaned_df['benefits'].apply(
    lambda value: value if pd.isnull(value) else transform_text(value)
)


In [28]:
# Persist the enriched frame for the next stage. index=True is the pandas default
# and is spelled out here: the row index is written as an extra leading column.
cleaned_df.to_excel("Data/data_to_split.xlsx", index=True)