In [None]:
import os
# Select the Keras backend through an environment variable. This runs before
# the `import keras` cell further down — presumably because Keras reads the
# variable at import time (TODO confirm).
os.environ['KERAS_BACKEND'] = 'tensorflow'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import keras

# Fix the random seed (42) through Keras' seeding helper.
# NOTE(review): KERAS_BACKEND is set to 'tensorflow' in the first cell, so this
# call may not seed PyTorch's RNG — confirm, and seed torch separately if the
# BERT fine-tuning below must be reproducible.
keras.utils.set_random_seed(42)

In [None]:
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive (Colab only: `drive` comes from
# google.colab). The CSV paths defined in the next cell live under this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Define file paths in Google Drive (one CSV per source of job postings).
# NOTE(review): the file names below look like placeholders — point them at
# the actual CSV files before running.
real_jobs_path = '/content/drive/My Drive/your_real_jobs_file.csv'
fake_jobs_path = '/content/drive/My Drive/your_fake_jobs_file.csv'

In [None]:

# Read both CSV files (decoded as UTF-8) into DataFrames, in the order
# (real, fake) given by the path tuple.
real_jobs, fake_jobs = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (real_jobs_path, fake_jobs_path)
)


In [None]:
# Preview the first rows of the fake-postings frame to check its columns.
fake_jobs.head()

Unnamed: 0,title,description,requirements,company_profile,location,salary_range,employment_type,industry,benefits,fraudulent
0,Mental health nurse,Arm drive court sure vote. Earn $5000/week! Im...,"Basic knowledge in live, no degree required. F...",Rivera and Sons - Established 2022.,West Jeffrey,$55016-$100476,Internship,IT,Free meals,1
1,Conference centre manager,Government whom its bed go tax tree black. Ear...,"Basic knowledge in seek, no degree required. F...","Davidson, Jones and Gomez - Established 2003.",Lake Meredithberg,$53438-$93138,Part-Time,Finance,Flexible hours,1
2,"Engineer, land",I member discuss follow way there nation. Earn...,"Basic knowledge in worker, no degree required....",Allen Ltd - Established 1998.,Lake Cathybury,$45584-$105229,Part-Time,IT,Free travel,1
3,Forest/woodland manager,House across wait approach face. Earn $5000/we...,"Basic knowledge in example, no degree required...",Forbes Ltd - Established 1990.,South Matthewstad,$66188-$139621,Full-Time,Education,Free travel,1
4,"Production designer, theatre/television/film",Case best environmental full finally leader me...,"Basic knowledge in smile, no degree required. ...","Jennings, Martin and Sanchez - Established 1975.",East Rhondafurt,$32183-$115012,Temporary,Retail,Flexible hours,1


In [None]:


# Step 1: Keep only the columns that both sources share
# (title, description, requirements, and the fraudulent flag).
shared_columns = ['title', 'description', 'requirements', 'fraudulent']
real_jobs_df = real_jobs[shared_columns].copy()
fake_jobs_df = fake_jobs[shared_columns].copy()

# Step 2: Stack real + fake postings and clean the result in a single chain:
#   - renumber the rows after stacking
#   - drop rows that lack a title or a description
#   - keep the first occurrence of each (title, description, requirements) triple
#   - expose the target column under the name 'label'
df_primary = (
    pd.concat([real_jobs_df, fake_jobs_df], axis=0)
    .reset_index(drop=True)
    .dropna(subset=['title', 'description'])
    .drop_duplicates(subset=['title', 'description', 'requirements'])
    .rename(columns={'fraudulent': 'label'})
)

# ============================
# STEP 2: Train/Validation/Test Split
# ============================

# Hold out 20% of all rows as the test set, stratified on the label so the
# class ratio is preserved.
feature_frame = df_primary[['title', 'description', 'requirements']]
label_series = df_primary['label']
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    feature_frame,
    label_series,
    test_size=0.2,
    random_state=42,
    stratify=label_series,
)

# Carve the validation set out of the remaining 80% with another 80/20 split
# (roughly 64% train / 16% validation / 20% test overall), stratified the same way.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts,
    train_val_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_val_labels,
)

# ============================
# STEP 3: Combine text fields
# ============================

def combine_fields(df):
    """Join the title, description and requirements columns into one text Series.

    Missing values in any of the three columns are treated as empty strings,
    and the pieces are separated by single spaces (so an empty piece still
    leaves its separator behind).

    Args:
        df: DataFrame with 'title', 'description' and 'requirements' columns.

    Returns:
        A Series aligned with ``df``'s index holding the combined text.
    """
    field_names = ('title', 'description', 'requirements')
    # Start from the first field, then append " <next field>" one at a time.
    combined = df[field_names[0]].fillna('')
    for field in field_names[1:]:
        combined = combined + ' ' + df[field].fillna('')
    return combined

# Build one combined text per posting for each split, as plain Python lists.
train_combined, val_combined, test_combined = (
    combine_fields(split_frame).tolist()
    for split_frame in (train_texts, val_texts, test_texts)
)

# Labels become plain lists as well, for model consumption.
train_labels, val_labels, test_labels = (
    split_series.tolist()
    for split_series in (train_labels, val_labels, test_labels)
)

# ============================
# STEP 4: Print class balance
# ============================

# Report how many examples of each class ended up in every split.
for heading, split_labels in (
    ("Train class distribution:", train_labels),
    ("Validation class distribution:", val_labels),
    ("Test class distribution:", test_labels),
):
    print(heading, pd.Series(split_labels).value_counts())

Train class distribution: 0    9766
1    6839
Name: count, dtype: int64
Validation class distribution: 0    2442
1    1710
Name: count, dtype: int64
Test class distribution: 0    3052
1    2138
Name: count, dtype: int64


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report


# ================================
# BASELINE MODEL
# ================================

# Step 1: Bag-of-words features. The vocabulary (English stop words removed,
# capped at 5000 terms) is fitted on the training texts only and then applied
# unchanged to the validation texts.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=5000,
)
X_train = vectorizer.fit_transform(train_combined)
X_val = vectorizer.transform(val_combined)

# Step 2: Fit a class-weighted logistic regression on the training features.
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
).fit(X_train, train_labels)

# Step 3: Hard predictions plus the second probability column (class 1),
# which the ROC-AUC computation in the next cell consumes.
lr_preds = lr_model.predict(X_val)
lr_probs = lr_model.predict_proba(X_val)[:, 1]


In [None]:
# Step 4: Evaluate the baseline on the validation split.
print("=== Baseline Logistic Regression Results ===")

# Threshold-based scores use the hard predictions; ROC-AUC uses the
# class-1 probabilities instead.
baseline_accuracy = accuracy_score(val_labels, lr_preds)
print(f"Accuracy: {baseline_accuracy:.4f}")
baseline_precision = precision_score(val_labels, lr_preds)
print(f"Precision: {baseline_precision:.4f}")
baseline_recall = recall_score(val_labels, lr_preds)
print(f"Recall: {baseline_recall:.4f}")
baseline_f1 = f1_score(val_labels, lr_preds)
print(f"F1-Score: {baseline_f1:.4f}")
baseline_auc = roc_auc_score(val_labels, lr_probs)
print(f"ROC-AUC: {baseline_auc:.4f}")

baseline_confusion = confusion_matrix(val_labels, lr_preds)
print("\nConfusion Matrix:")
print(baseline_confusion)

baseline_report = classification_report(val_labels, lr_preds)
print("\nClassification Report:")
print(baseline_report)


=== Baseline Logistic Regression Results ===
Accuracy: 0.9812
Precision: 0.9863
Recall: 0.9678
F1-Score: 0.9770
ROC-AUC: 0.9924

Confusion Matrix:
[[2419   23]
 [  55 1655]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2442
           1       0.99      0.97      0.98      1710

    accuracy                           0.98      4152
   macro avg       0.98      0.98      0.98      4152
weighted avg       0.98      0.98      0.98      4152



In [None]:
from transformers import BertTokenizer

# Load the pre-trained tokenizer for bert-base-uncased
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every split is tokenized with the same settings: truncate to at most
# 512 tokens and pad the sequences.
tokenize_options = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_combined, **tokenize_options)
val_encodings = tokenizer(val_combined, **tokenize_options)
test_encodings = tokenizer(test_combined, **tokenize_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

class JobDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()``); ``labels`` is an indexable sequence
    with one entry per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a JobDataset
# (order: train, validation, test).
train_dataset, val_dataset, test_dataset = (
    JobDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load BERT with a 2-class sequence-classification head.
# The head ('classifier.weight' / 'classifier.bias') is not part of the
# checkpoint and is initialised randomly when the model is built, so seed
# PyTorch's RNG right here: re-running this cell then always starts from the
# same head weights, independent of what ran before it.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training configuration for the fine-tuning run: 3 epochs, batch size 16 for
# both training and evaluation, 500 warm-up steps, weight decay 0.01, with an
# evaluation pass and a checkpoint at the end of every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Turn off external experiment trackers. Without this, training stops at
    # an interactive Weights & Biases API-key prompt, so the notebook cannot
    # be re-run unattended (Restart & Run All).
    report_to="none",
)

# The Trainer fine-tunes `model` on the training split and evaluates it on
# the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model with the settings configured on `trainer`, then run one
# more evaluation pass; as the cell's last expression its result is displayed.
trainer.train()
trainer.evaluate()  # Validate after training


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mreids[0m ([33mreids-massachusetts-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.0706,0.078166
2,0.0688,0.065617
3,0.0568,0.079309


{'eval_loss': 0.07930879294872284,
 'eval_runtime': 25.8005,
 'eval_samples_per_second': 160.927,
 'eval_steps_per_second': 10.077,
 'epoch': 3.0}

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Pick the split to score in ONE place: the model inputs and the ground truth
# below both come from this dataset object, so they cannot get out of sync.
scored_dataset = val_dataset  # use test_dataset here for the held-out test evaluation

# Run inference and extract the logits
predictions = trainer.predict(scored_dataset)
logits = predictions.predictions

# Softmax over the class axis turns the logits into class probabilities
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()

# Predicted class = most probable class; ground truth = the labels stored on
# the dataset that was just scored
y_pred = np.argmax(probs, axis=1)
y_true = np.array(scored_dataset.labels)

# Calculate metrics (ROC-AUC uses the class-1 probability, not the hard label)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, probs[:, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

# Full classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Accuracy: 0.9834
Precision: 0.9887
Recall: 0.9708
F1-Score: 0.9796
ROC-AUC: 0.9967

Confusion Matrix:
[[2423   19]
 [  50 1660]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2442
           1       0.99      0.97      0.98      1710

    accuracy                           0.98      4152
   macro avg       0.98      0.98      0.98      4152
weighted avg       0.98      0.98      0.98      4152

