Imports

In [1]:
# import kagglehub
import joblib
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aamidmohsin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aamidmohsin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aamidmohsin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Read CSV

In [2]:
# Folder holding the raw input files, relative to this notebook.
path = "../data/raw/"
# Load the raw dataset into a DataFrame.
data = pd.read_csv(os.path.join(path, "DataSet.csv"))

Create Trained Model Directory

In [3]:
# Folder that receives the fitted models and vectorizers.
model_dir = "../docker/model_assets/"
# Create the folder (and missing parents); a no-op when it already exists.
os.makedirs(name=model_dir, exist_ok=True)

Cleaning Dataset

In [4]:
def htmlparser(text):
    """Strip HTML tags and unsupported characters from a single cell value.

    Args:
        text: Any scalar. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string, or ``'Unspecified'`` when the input is missing
        (``None`` or a float NaN) or when nothing is left after cleaning.
    """
    # Missing values would otherwise be stringified to the literal words
    # 'None' / 'nan' and leak into the cleaned data.
    if text is None or (isinstance(text, float) and text != text):
        return 'Unspecified'
    text = re.sub(r'<.*?>', '', str(text))  # remove html tags
    # Collapse every run of characters outside the allowed set into one space.
    # Whitespace (newlines included) is outside that set, so no separate
    # newline replacement is required.
    text = re.sub(r'[^A-Za-z0-9.,!?;:\'&-]+', ' ', text)
    text = text.strip()
    return text if text else 'Unspecified'

# Fill missing values with the "Unspecified" placeholder, then clean every cell.
# The filled Series is assigned back explicitly: `data[column].fillna(...,
# inplace=True)` modifies a temporary Series and leaves `data` untouched when
# pandas copy-on-write is active.
for column in data.columns:
    data[column] = data[column].fillna("Unspecified").apply(htmlparser)

Handling Categorical Columns

In [5]:
# Names of the columns treated as categorical.
categorical_columns = [
    'location',
    'department',
    'salary_range',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]


In [6]:
def clean_location(value):
    """Keep ``value`` when it begins with a "part, part, part" pattern.

    Anything else (including non-strings) becomes "Unspecified".
    """
    location_pattern = r'[a-zA-Z0-9]+,\s?[a-zA-Z0-9]*,\s?[a-zA-Z0-9. ]*'
    if not isinstance(value, str):
        return "Unspecified"
    if re.match(location_pattern, value) is None:
        return "Unspecified"
    return value

def clean_department(value):
    """Keep ``value`` when its first character is a word or whitespace character.

    Anything else (including non-strings and the empty string) becomes
    "Unspecified".
    """
    if isinstance(value, str):
        # re.match anchors at the start, so only the first character is tested.
        if re.match(r"[\w\s]", value):
            return value
    return "Unspecified"

def clean_salary_range(value):
    """Keep ``value`` when it is a salary figure or range, else "Unspecified".

    Accepted forms are a plain run of digits ("50000") or two runs of digits
    joined by a hyphen ("20000-30000").

    Args:
        value: The raw cell value; may be of any type.

    Returns:
        ``value`` unchanged when it is a string in an accepted form,
        otherwise the string "Unspecified".
    """
    # Both accepted forms are checked in one pattern, guarded by the type check.
    # The original `A and B or C` expression evaluated the second re.match even
    # for non-strings, which raised TypeError.
    if isinstance(value, str) and re.match(r'^\d+(?:-\d+)?$', value):
        return value
    return "Unspecified"

def clean_employment_type(value):
    """Keep ``value`` when it names a known employment type, else "Unspecified".

    The comparison ignores case and surrounding whitespace; the value itself
    is returned unmodified.
    """
    known_types = ['contract', 'full-time', 'part-time', 'temporary', 'other']
    if not isinstance(value, str):
        return "Unspecified"
    normalized = value.strip().lower()
    return value if normalized in known_types else "Unspecified"

def clean_required_experience(value):
    """Keep ``value`` when it names a known experience level, else "Unspecified".

    The comparison ignores case and surrounding whitespace; the value itself
    is returned unmodified.
    """
    known_levels = [
        'associate',
        'director',
        'entry level',
        'executive',
        'internship',
        'mid-senior level',
        'not applicable',
    ]
    if not isinstance(value, str):
        return "Unspecified"
    normalized = value.strip().lower()
    return value if normalized in known_levels else "Unspecified"

def clean_required_education(value):
    """Keep ``value`` when it starts with an ASCII capital letter, else "Unspecified"."""
    if not isinstance(value, str):
        return "Unspecified"
    starts_with_capital = re.match(r'^[A-Z]', value) is not None
    return value if starts_with_capital else "Unspecified"


# Run every categorical column that has a dedicated validator through it.
column_cleaners = {
    'location': clean_location,
    'department': clean_department,
    'salary_range': clean_salary_range,
    'employment_type': clean_employment_type,
    'required_experience': clean_required_experience,
    'required_education': clean_required_education,
}
for column_name, cleaner in column_cleaners.items():
    data[column_name] = data[column_name].apply(cleaner)

Handling Binary Columns

In [7]:
# binary columns
binary_columns = ['telecommuting', 'has_company_logo', 'has_questions', 'fraudulent', 'in_balanced_dataset']
for column in binary_columns:
    # Map the 't'/'f' flags to 1/0. Recent pandas versions no longer downcast
    # the object-dtype result of replace() automatically, so infer_objects()
    # is called explicitly to get an integer column when every value was
    # mapped. Columns that still hold other strings keep their dtype.
    data[column] = data[column].replace({'t': 1, 'f': 0}).infer_objects()

Handling Text Description Columns

In [8]:
# Names of the free-text columns that feed the text features.
text_columns = [
    'title',
    'company_profile',
    'description',
    'requirements',
    'benefits',
]

In [9]:
# Blank out cells that hold only the "Unspecified" placeholder so the
# placeholder does not end up as a token in the text features.
data[text_columns] = data[text_columns].replace("Unspecified", "")

# Shared resources for the text preprocessing: lemmatizer and English stop words.
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

def punctuation_remover(strings: List[str]) -> List[str]:
    """Return a new list with every non-word, non-whitespace character removed.

    Args:
        strings: The strings to clean.

    Returns:
        A list of the same length, each entry stripped of punctuation.
    """
    return [re.sub(r'[^\w\s]', '', item) for item in strings]


# Snapshot of the stop-word list as a set: membership tests are O(1) instead
# of scanning the whole list once for every token.
_STOP_WORD_SET = frozenset(stop_words)


def preprocess_text(text: str) -> str:
    """Normalise one text field before vectorisation.

    Lower-cases the text, strips punctuation, collapses whitespace, tokenizes,
    drops stop words and lemmatizes the remaining tokens.

    Args:
        text: The raw text of a single cell.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    text = text.lower()
    text = punctuation_remover([text])[0]
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    # NOTE: punctuation is removed before the stop-word check, so contractions
    # such as "we've" arrive here as "weve" and are not filtered out.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in _STOP_WORD_SET]
    return ' '.join(tokens)


# Normalise each free-text column.
for text_col in text_columns:
    data[text_col] = data[text_col].apply(preprocess_text)

# Join the text columns of every row into one space-separated feature.
data['combined_text'] = data[text_columns].apply(" ".join, axis=1)
print(data['combined_text'].head())

0    marketing intern food52 weve created groundbre...
1    customer service cloud video production 90 sec...
2    commissioning machinery assistant cma valor se...
3    account executive washington dc passion improv...
4    bill review manager spotsource solution llc gl...
Name: combined_text, dtype: object


# Feature Extraction - Bag of Words and TF-IDF Vector

In [10]:
# 1) Bag-of-Words: raw token counts, unigrams only.
bow_vectorizer = CountVectorizer(ngram_range=(1, 1))
X_bow = bow_vectorizer.fit_transform(data['combined_text'])

# 2) TF-IDF: weighted counts over unigrams and bigrams.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(data['combined_text'])


# Model Selection and Training

In [11]:
# Target labels.
y = data['fraudulent']

# The whole dataset is used for fitting, so no train/test split is made.
X_train_bow, y_train_bow = X_bow, y
# There is consequently no held-out set.
X_test_bow = y_test_bow = None

Model 1: Train Multinomial Naive Bayes Model (Bag of Words)

In [12]:
# Multinomial Naive Bayes fitted on the bag-of-words counts.
nb_model_bow = MultinomialNB(alpha=1.0)
nb_model_bow.fit(X=X_train_bow, y=y_train_bow)
# No prediction step: X_test_bow is None because no test split was made.

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [13]:
# Persist the fitted classifier.
joblib.dump(value=nb_model_bow, filename=os.path.join(model_dir, "nb_model_bow.joblib"))

# Persist the fitted bag-of-words vectorizer next to it.
joblib.dump(value=bow_vectorizer, filename=os.path.join(model_dir, "bow_vectorizer.joblib"))

['../docker/model_assets/bow_vectorizer.joblib']

Model 2: Train Logistic Regression Model (Bag of Words)

In [14]:
# Logistic regression fitted on the bag-of-words counts.
lr_model_bow = LogisticRegression(max_iter=1000)
lr_model_bow.fit(X=X_train_bow, y=y_train_bow)
# No prediction step: X_test_bow is None because no test split was made.

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [15]:
# Persist the fitted classifier. The bag-of-words vectorizer it relies on was
# already written when the Naive Bayes model was saved.
joblib.dump(value=lr_model_bow, filename=os.path.join(model_dir, "lr_model_bow.joblib"))

['../docker/model_assets/lr_model_bow.joblib']

# Training Models Using TF-IDF Feature Vectors

Model 1: Train Multinomial Naive Bayes Model (TF-IDF)

In [16]:
# The whole dataset is used for fitting, so no train/test split is made.
X_train_tfidf, y_train_tfidf = X_tfidf, y
# There is consequently no held-out set.
X_test_tfidf = y_test_tfidf = None

In [17]:
# Multinomial Naive Bayes fitted on the TF-IDF features.
nb_model_tfidf = MultinomialNB(alpha=1.0)
nb_model_tfidf.fit(X=X_train_tfidf, y=y_train_tfidf)
# No prediction step: X_test_tfidf is None because no test split was made.

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [18]:
# Persist the fitted classifier.
joblib.dump(value=nb_model_tfidf, filename=os.path.join(model_dir, "nb_model_tfidf.joblib"))

# Persist the fitted TF-IDF vectorizer next to it.
joblib.dump(value=tfidf_vectorizer, filename=os.path.join(model_dir, "tfidf_vectorizer.joblib"))

['../docker/model_assets/tfidf_vectorizer.joblib']

Model 2: Train Logistic Regression Model (TF-IDF)

In [19]:
# Logistic regression fitted on the TF-IDF features.
lr_model_tfidf = LogisticRegression(max_iter=1000)
lr_model_tfidf.fit(X=X_train_tfidf, y=y_train_tfidf)
# No prediction step: X_test_tfidf is None because no test split was made.

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [20]:
# Persist the fitted classifier. The TF-IDF vectorizer it relies on was
# already written when the Naive Bayes model was saved.
joblib.dump(value=lr_model_tfidf, filename=os.path.join(model_dir, "lr_model_tfidf.joblib"))

['../docker/model_assets/lr_model_tfidf.joblib']

# Let's try oversampling the minority (fraudulent) class now:

In [21]:
from imblearn.over_sampling import SMOTE

# SMOTE is the oversampler used here (RandomOverSampler was the alternative
# tried earlier); the fixed seed keeps the resampling reproducible.
smote = SMOTE(random_state=42)

# Oversample the minority (fraudulent) class in the bag-of-words training data.
X_train_bow_res, y_train_bow_res = smote.fit_resample(X=X_train_bow, y=y_train_bow)


Model 1: Train Multinomial Naive Bayes Model + Oversampling (Bag of Words)

In [22]:
# Multinomial Naive Bayes fitted on the oversampled bag-of-words data.
nb_model_bow_res = MultinomialNB(alpha=1.0)
nb_model_bow_res.fit(X=X_train_bow_res, y=y_train_bow_res)
# No prediction step: X_test_bow is None because no test split was made.

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [23]:
# Persist the fitted classifier. It shares the bag-of-words vectorizer that
# was already saved with the first Naive Bayes model.
joblib.dump(value=nb_model_bow_res, filename=os.path.join(model_dir, "nb_model_bow_res.joblib"))

['../docker/model_assets/nb_model_bow_res.joblib']

Model 2: Train Logistic Regression + Oversampling (Bag of Words)

In [24]:
# Logistic regression fitted on the oversampled bag-of-words data.
lr_model_bow_res = LogisticRegression(max_iter=1000)
lr_model_bow_res.fit(X=X_train_bow_res, y=y_train_bow_res)
# No prediction step: X_test_bow is None because no test split was made.

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [25]:
# Persist the fitted classifier. It shares the bag-of-words vectorizer that
# was already saved with the first Naive Bayes model.
joblib.dump(value=lr_model_bow_res, filename=os.path.join(model_dir, "lr_model_bow_res.joblib"))

['../docker/model_assets/lr_model_bow_res.joblib']

Model 1: Train Multinomial Naive Bayes Model + Oversampling (TF-IDF)

In [26]:
# Oversample the minority class in the TF-IDF training data.
X_train_tfidf_res, y_train_tfidf_res = smote.fit_resample(X=X_train_tfidf, y=y_train_tfidf)

# Multinomial Naive Bayes fitted on the oversampled TF-IDF data.
nb_model_tfidf_res = MultinomialNB(alpha=1.0)
nb_model_tfidf_res.fit(X=X_train_tfidf_res, y=y_train_tfidf_res)
# No prediction step: X_test_tfidf is None because no test split was made.

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [27]:
# Persist the fitted classifier. It shares the TF-IDF vectorizer that was
# already saved with the first TF-IDF Naive Bayes model.
joblib.dump(value=nb_model_tfidf_res, filename=os.path.join(model_dir, "nb_model_tfidf_res.joblib"))

['../docker/model_assets/nb_model_tfidf_res.joblib']

Model 2: Train Logistic Regression + Oversampling (TF-IDF)

In [28]:
# Logistic regression fitted on the oversampled TF-IDF data.
lr_model_tfidf_res = LogisticRegression(max_iter=1000)
lr_model_tfidf_res.fit(X=X_train_tfidf_res, y=y_train_tfidf_res)
# No prediction step: X_test_tfidf is None because no test split was made.

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [29]:
# Persist the fitted classifier. It shares the TF-IDF vectorizer that was
# already saved with the first TF-IDF Naive Bayes model.
joblib.dump(value=lr_model_tfidf_res, filename=os.path.join(model_dir, "lr_model_tfidf_res.joblib"))

['../docker/model_assets/lr_model_tfidf_res.joblib']