In [None]:
# Fake Job Postings Detection using PyTorch
# Ngan Cao
# Tim Mei 101268588

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Read the raw postings from disk.
print("Loading dataset...")
df = pd.read_csv('fake_job_postings.csv')

# Build a single free-text field per posting: title, description and
# requirements joined by single spaces. Missing values become empty strings
# first so that one absent field does not blank out the whole row.
df['text'] = df['title'].fillna('').str.cat(
    [df['description'].fillna(''), df['requirements'].fillna('')],
    sep=' ',
)

# Drop postings whose combined text is only whitespace, then renumber the
# rows from zero.
df = df.loc[df['text'].str.strip().ne('')].reset_index(drop=True)

# Report dataset size and class balance.
print(f"Total job posts: {len(df)}")
print(f"Fake jobs: {df['fraudulent'].sum()} ({df['fraudulent'].mean()*100:.1f}%)")



class JobDataset(Dataset):
    """Dataset that maps job-posting texts to fixed-length vocabulary index lists.

    Each item is a pair ``(indices, label)``: ``indices`` is a long tensor of
    exactly ``max_length`` vocabulary ids (truncated or right-padded), and
    ``label`` is a float tensor holding the corresponding entry of ``labels``.

    Args:
        texts: Indexable collection of strings, one per posting.
        labels: Indexable collection of numeric labels, parallel to ``texts``.
        vocab: Existing token -> index mapping to reuse (e.g. the training
            vocabulary when building a test set). When ``None``, a vocabulary
            is built from ``texts``.
        max_length: Number of token ids produced per text.
        max_vocab_size: Maximum number of words (excluding the two special
            tokens) kept when a vocabulary is built. Ignored when ``vocab`` is
            supplied. ``None`` removes the cap.
    """

    # Reserved vocabulary slots: padding filler and out-of-vocabulary marker.
    PAD_INDEX = 0
    UNK_INDEX = 1

    # Matches every character that is neither an ASCII letter nor whitespace.
    # Compiled once here instead of being looked up on every tokenize() call.
    _NON_LETTERS = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, texts, labels, vocab=None, max_length=150,
                 max_vocab_size=5000):
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

        # Only build a vocabulary when the caller did not hand one in, so a
        # test set can share the training vocabulary.
        if vocab is None:
            self.vocab = self.build_vocab(texts, max_size=max_vocab_size)
        else:
            self.vocab = vocab

    def tokenize(self, text):
        """Lowercase ``text``, delete non-letter characters, split on whitespace.

        Characters are deleted rather than replaced by a space, so tokens
        joined only by punctuation or digits merge ("A-b" -> "ab").
        """
        return self._NON_LETTERS.sub('', text.lower()).split()

    def build_vocab(self, texts, min_freq=2, max_size=5000):
        """Build a token -> index mapping from the most frequent words in ``texts``.

        Args:
            texts: Iterable of strings to count tokens over.
            min_freq: Minimum number of occurrences a word needs to be kept.
            max_size: Upper bound on the number of words considered, most
                frequent first. ``None`` considers every word.

        Returns:
            dict with ``'<PAD>'`` at 0, ``'<UNK>'`` at 1, followed by the kept
            words numbered from 2 in descending frequency order.
        """
        counter = Counter()
        for text in texts:
            counter.update(self.tokenize(text))

        vocab = {'<PAD>': self.PAD_INDEX, '<UNK>': self.UNK_INDEX}

        # most_common() yields counts in descending order, so the first word
        # below the threshold means every remaining word is below it too.
        for word, count in counter.most_common(max_size):
            if count < min_freq:
                break
            vocab[word] = len(vocab)

        return vocab

    def text_to_indices(self, text):
        """Convert ``text`` to a list of exactly ``max_length`` vocabulary ids.

        Tokens beyond ``max_length`` are dropped, unknown tokens map to the
        ``<UNK>`` id, and short sequences are right-padded with the ``<PAD>`` id.
        """
        tokens = self.tokenize(text)[:self.max_length]
        indices = [self.vocab.get(token, self.UNK_INDEX) for token in tokens]
        indices.extend([self.PAD_INDEX] * (self.max_length - len(indices)))
        return indices

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` tensors for the sample at position ``idx``."""
        return (
            # Explicit integer dtype: an empty index list would otherwise be
            # inferred as a float tensor.
            torch.tensor(self.text_to_indices(self.texts[idx]), dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float),
        )


# Pull the model inputs and targets out of the frame as plain Python lists.
texts = df['text'].to_list()
labels = df['fraudulent'].to_list()

# Hold out 20% of the postings for testing. Stratifying on the labels keeps
# the fake/real ratio the same in both parts; the fixed seed makes the split
# repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

# The training dataset builds the vocabulary from its own texts.
train_dataset = JobDataset(texts=train_texts, labels=train_labels)

# The test dataset reuses the training vocabulary so that token ids agree.
test_dataset = JobDataset(
    texts=test_texts,
    labels=test_labels,
    vocab=train_dataset.vocab,
)

# Batch both datasets; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

print(f"\nTraining samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print(f"Vocabulary size: {len(train_dataset.vocab)}")


Loading dataset...
Total job posts: 17880
Fake jobs: 866 (4.8%)

Training samples: 14304
Test samples: 3576
Vocabulary size: 5002
