In [None]:
# Fake Job Postings Detection using PyTorch
# Ngan Cao
# Tim Mei 101268588

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split   # kept because it's used

# Read the raw postings from disk
print("Loading dataset...")
df = pd.read_csv('fake_job_postings.csv')

# Build one free-text column per posting: title, description and requirements
# joined by single spaces, with missing values treated as empty strings
df['text'] = df['title'].fillna('').str.cat(
    [df['description'].fillna(''), df['requirements'].fillna('')],
    sep=' ',
)

# Drop postings whose combined text is blank, then renumber the rows from 0
df = df.loc[df['text'].str.strip().ne('')].reset_index(drop=True)
print(f"Total job posts: {len(df)}")
print(f"Fake jobs: {df['fraudulent'].sum()} ({df['fraudulent'].mean()*100:.1f}%)")


class JobDataset(Dataset):
    """Map-style dataset that encodes job-posting text as fixed-length id lists.

    Each item is an ``(indices, label)`` pair: ``indices`` is an integer tensor
    of exactly ``max_length`` vocabulary ids (truncated or right-padded) and
    ``label`` is a float tensor holding the matching entry of ``labels``.

    Args:
        texts: Sequence of strings, one per posting.
        labels: Sequence of numeric labels, indexed in step with ``texts``.
        vocab: Optional token -> id mapping to reuse (for example the training
            vocabulary when wrapping a test split). When ``None`` a vocabulary
            is built from ``texts``.
        max_length: Number of token ids every encoded text is cut or padded to.
        min_freq: Minimum token count for a word to enter a newly built
            vocabulary. Ignored when ``vocab`` is given.
        max_vocab_size: Maximum number of words (excluding the two special
            tokens) in a newly built vocabulary. Ignored when ``vocab`` is given.
    """

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'
    PAD_INDEX = 0  # id used to right-pad short texts
    UNK_INDEX = 1  # id used for tokens missing from the vocabulary

    # Compiled once at class creation instead of on every tokenize() call.
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, texts, labels, vocab=None, max_length=150,
                 min_freq=2, max_vocab_size=5000):
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

        # Build the vocabulary from these texts unless one is supplied.
        if vocab is None:
            self.vocab = self.build_vocab(
                texts, min_freq=min_freq, max_vocab_size=max_vocab_size
            )
        else:
            self.vocab = vocab

    def tokenize(self, text):
        """Lowercase ``text``, delete every non-letter/non-whitespace character,
        and split the result on whitespace.

        Characters are deleted rather than replaced by a space, so
        ``"full-time"`` becomes the single token ``"fulltime"``.
        """
        return self._NON_LETTER_RE.sub('', text.lower()).split()

    def build_vocab(self, texts, min_freq=2, max_vocab_size=5000):
        """Return a token -> id mapping built from the most frequent words.

        Args:
            texts: Iterable of strings to count tokens over.
            min_freq: Words seen fewer than this many times are left out.
            max_vocab_size: Only this many of the most common words are
                considered; ``None`` considers every word.

        Returns:
            dict mapping ``'<PAD>'`` to 0, ``'<UNK>'`` to 1 and each kept word
            to consecutive ids from 2, in descending frequency order.
        """
        counter = Counter()
        for text in texts:
            counter.update(self.tokenize(text))

        vocab = {self.PAD_TOKEN: self.PAD_INDEX, self.UNK_TOKEN: self.UNK_INDEX}
        for word, count in counter.most_common(max_vocab_size):
            if count < min_freq:
                # most_common() is sorted by descending count, so every
                # remaining word is also below the threshold.
                break
            vocab[word] = len(vocab)
        return vocab

    def text_to_indices(self, text):
        """Encode ``text`` as a list of exactly ``max_length`` vocabulary ids.

        Tokens beyond ``max_length`` are dropped, unknown tokens map to the
        ``<UNK>`` id (1) and short texts are right-padded with the ``<PAD>``
        id (0).
        """
        tokens = self.tokenize(text)[:self.max_length]
        indices = [self.vocab.get(token, self.UNK_INDEX) for token in tokens]
        indices.extend([self.PAD_INDEX] * (self.max_length - len(indices)))
        return indices

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` tensors for the sample at ``idx``."""
        return (
            # Explicit integer dtype: nn.Embedding needs integer ids, and dtype
            # inference would yield float32 for an empty list (max_length=0).
            torch.tensor(self.text_to_indices(self.texts[idx]), dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float),
        )


# Pull the targets (fraud flag) and model inputs (combined text) out of the frame
labels = df['fraudulent'].tolist()
texts = df['text'].tolist()

# Hold out 20% of the postings for testing; stratifying on the labels keeps the
# fake/real proportion the same in both parts (from the a4 code)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    stratify=labels,
    random_state=42,
    test_size=0.2,
)

# The training dataset builds the vocabulary from the training texts
train_dataset = JobDataset(texts=train_texts, labels=train_labels)

# The test dataset reuses that vocabulary instead of building its own
test_dataset = JobDataset(
    texts=test_texts,
    labels=test_labels,
    vocab=train_dataset.vocab,
)

# Batches of 32; only the training batches are reshuffled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

print(f"\nTraining samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print(f"Vocabulary size: {len(train_dataset.vocab)}")


class FakeJobDetector(nn.Module):
    """Bidirectional-LSTM binary classifier over sequences of token ids.

    Pipeline: embedding -> bidirectional LSTM -> dropout -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table. Id 0 is the padding
            id, whose embedding is not updated during training.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM per direction.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,
                            bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        # Forward and backward final states are concatenated, hence * 2.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per sequence in the batch.

        Args:
            x: Integer tensor of token ids, batch first (the LSTM is built
                with ``batch_first=True``).

        Returns:
            Tensor with one sigmoid probability per batch row. The result is
            always 1-D, including for a batch containing a single sequence.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # The last two entries of the final hidden state are the two
        # directions of the LSTM; join them per sample.
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        output = self.fc(self.dropout(hidden))
        # Squeeze only the trailing feature dimension. A bare squeeze() would
        # also drop the batch dimension when the batch has one sample, giving
        # a 0-dim tensor that no longer matches the label shape.
        return self.sigmoid(output).squeeze(-1)


def _percent(correct, total):
    """Return ``correct`` out of ``total`` as a percentage (NaN if ``total`` is 0)."""
    return 100 * correct / total if total else float('nan')


def _count_correct(outputs, labels):
    """Return how many predictions (probability > 0.5 means 1) equal ``labels``."""
    predicted = (outputs > 0.5).float()
    return (predicted == labels).sum().item()


def _evaluate(model, loader, device):
    """Return ``(correct, total)`` for ``model`` over ``loader`` without gradients."""
    model.eval()  # Evaluation mode (disables dropout)
    correct = 0
    total = 0
    with torch.no_grad():  # No gradients needed for evaluation (saves memory)
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Align output shape with the labels (see train_model).
            outputs = model(inputs).reshape(labels.shape)
            total += labels.size(0)
            correct += _count_correct(outputs, labels)
    return correct, total


def train_model(model, train_loader, test_loader, epochs=10, lr=0.001):
    """Train ``model`` with Adam and binary cross-entropy, reporting each epoch.

    After every epoch the average training loss, training accuracy and test
    accuracy are printed. Accuracy uses a 0.5 threshold on the model output.

    Args:
        model: Module whose output is one probability per sample
            (``nn.BCELoss`` requires values in [0, 1]).
        train_loader: Sized iterable of ``(inputs, labels)`` batches used for
            the optimisation steps.
        test_loader: Iterable of ``(inputs, labels)`` batches used only for
            evaluation.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate passed to Adam.

    Returns:
        The trained model, on the device used for training (CUDA when
        available, otherwise CPU). It is left in eval mode when at least one
        epoch has run.

    Note:
        If a loader yields no batches, the affected metric is reported as NaN
        rather than raising ``ZeroDivisionError``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)  # Move model to device
    print(f"Training on: {device}")
    # Measures how far predicted probabilities are from the true labels (0 or 1)
    criterion = nn.BCELoss()
    # Adam adjusts the weights to minimize the loss
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()  # Training mode (enables dropout)
        total_loss = 0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            # Force the output to the label shape so that a (batch, 1) or
            # 0-dim output cannot make the loss fail or the accuracy
            # comparison broadcast; reshape raises if element counts differ.
            outputs = model(inputs).reshape(labels.shape)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total += labels.size(0)
            correct += _count_correct(outputs, labels)

        train_accuracy = _percent(correct, total)

        # Evaluate on the test set after each epoch
        test_correct, test_total = _evaluate(model, test_loader, device)
        test_accuracy = _percent(test_correct, test_total)

        num_batches = len(train_loader)
        avg_loss = total_loss / num_batches if num_batches else float('nan')

        print(f"Epoch [{epoch+1:2d}/{epochs}] | "
              f"Loss: {avg_loss:.4f} | "
              f"Train Acc: {train_accuracy:.2f}% | "
              f"Test Acc: {test_accuracy:.2f}%")

    return model


Loading dataset...
Total job posts: 17880
Fake jobs: 866 (4.8%)

Training samples: 14304
Test samples: 3576
Vocabulary size: 5002
