
# AlexNet Fine-Tuning Classifier (Lightning)

This notebook fine-tunes **AlexNet (pretrained on ImageNet)** for image classification using **PyTorch Lightning**.

**Highlights**
- Safe dataset that skips unreadable/corrupt images
- `CrossEntropyLoss` + `AdamW`
- Callbacks: **EarlyStopping**, **ModelCheckpoint**, **LearningRateMonitor**
- Optional **feature-extraction** cell for 4096-D embeddings (fusion-ready)


In [2]:

import os, csv, math, random, json
from pathlib import Path
from typing import Optional, Any, Dict, Tuple, List

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import models, transforms

import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import CSVLogger

# ---- Hyperparameters ----
CSV_PATH = "Projects/FakeJob/fake_job_postings.csv"    # path to your csv with columns: image_path,label
IMG_ROOT = "Projects/FakeJob/images"                          # base folder joined with image_path if needed
NUM_CLASSES = 2                         # change if needed
BATCH_SIZE = 32
MAX_EPOCHS = 3
LR = 1e-4
NUM_WORKERS = 2
VAL_SPLIT = 0.2                         # split % for validation if no explicit split columns
SEED = 42

torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)

from sklearn.model_selection import StratifiedShuffleSplit
from collections import Counter

from sklearn.metrics import (roc_auc_score, average_precision_score, f1_score, precision_score, recall_score, matthews_corrcoef, balanced_accuracy_score)

Device: cpu


In [3]:

class JobImageDataset(Dataset):
    def __init__(self, df: pd.DataFrame, img_root: str = ".", tfm=None):
        self.df = df.reset_index(drop=True)
        self.img_root = img_root
        self.tfm = tfm

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        img_path = row['image_path']
        label = int(row['label'])

        # construct full path if needed
        full_path = img_path if os.path.isabs(img_path) else os.path.join(self.img_root, img_path)
        try:
            with Image.open(full_path) as im:
                im = im.convert('RGB')
            if self.tfm is not None:
                im = self.tfm(im)
            return im, label
        except Exception as e:
            # Return a None sample so collate can drop it
            return None

def collate_skip_none(batch):
    batch = [b for b in batch if b is not None]
    if len(batch) == 0:
        return None
    x, y = zip(*batch)
    return torch.stack(x, dim=0), torch.tensor(y, dtype=torch.long)


In [4]:

# ===== Data loading: ImageFolder preferred, CSV fallback =====
# Set this to True to use images/0 and images/1 structure directly.
USE_IMAGEFOLDER = True
IMAGE_ROOT = "Projects/FakeJob/images"   # expects subfolders '0', '1', ... for class labels

# Standard ImageNet mean/std and input size for AlexNet (224x224)
mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]

train_tfm = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
val_tfm = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

CLASS_WEIGHTS = None  # will be a list/np.array like [w0, w1, ...]

if USE_IMAGEFOLDER:
    # --- ImageFolder mode (no CSV needed) ---
    from torchvision import datasets

    base_ds = datasets.ImageFolder(root=IMAGE_ROOT, transform=train_tfm)
    print("Classes detected:", base_ds.classes)

    # Compute class counts and weights (inverse frequency)
    targets = [y for _, y in base_ds.samples]
    counts = Counter(targets)
    total = sum(counts.values())
    CLASS_WEIGHTS = [total / counts[i] for i in range(len(counts))]
    print("Class counts:", dict(counts))
    print("Class weights (inv freq):", CLASS_WEIGHTS)

    # Stratified split
    import numpy as np
    targets_np = np.array(targets)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_SPLIT, random_state=SEED)
    train_idx, val_idx = next(sss.split(np.zeros(len(targets_np)), targets_np))

    train_ds = torch.utils.data.Subset(base_ds, train_idx)
    val_ds   = torch.utils.data.Subset(base_ds, val_idx)
    # Ensure val uses val transforms
    val_ds.dataset.transform = val_tfm

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=NUM_WORKERS, pin_memory=True)
    val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False,
                              num_workers=NUM_WORKERS, pin_memory=True)

    print("Stratified Train/Val sizes:", len(train_ds), len(val_ds))
else:
    # --- CSV fallback (expects CSV with columns: image_path,label) ---
    df = pd.read_csv(CSV_PATH)
    assert {'image_path','label'}.issubset(set(df.columns)), "CSV must contain columns: image_path,label"

    # Compute class weights
    counts = Counter(df['label'].astype(int).tolist())
    total = sum(counts.values())
    CLASS_WEIGHTS = [total / counts[i] for i in sorted(counts.keys())]
    print("Class counts:", dict(counts))
    print("Class weights (inv freq):", CLASS_WEIGHTS)

    # Stratified split
    import numpy as np
    y = df['label'].astype(int).to_numpy()
    sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_SPLIT, random_state=SEED)
    train_idx, val_idx = next(sss.split(np.zeros(len(y)), y))

    df_train = df.iloc[train_idx].copy().reset_index(drop=True)
    df_val   = df.iloc[val_idx].copy().reset_index(drop=True)

    print("Stratified Train/Val sizes:", len(df_train), len(df_val))

    train_ds = JobImageDataset(df_train, img_root=IMG_ROOT, tfm=train_tfm)
    val_ds   = JobImageDataset(df_val,   img_root=IMG_ROOT, tfm=val_tfm)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_skip_none)
    val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False,
                              num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_skip_none)


Classes detected: ['0', '1']
Class counts: {0: 5579, 1: 856}
Class weights (inv freq): [1.1534325147875963, 7.517523364485982]
Stratified Train/Val sizes: 5148 1287


In [5]:

def build_alexnet_classifier(num_classes=2, pretrained=True):
    # For torchvision >= 0.13, use Weights enums
    weights = models.AlexNet_Weights.IMAGENET1K_V1 if pretrained else None
    model = models.alexnet(weights=weights)
    # Replace final classifier layer
    in_feats = model.classifier[-1].in_features
    model.classifier[-1] = nn.Linear(in_feats, num_classes)
    return model


In [6]:

from typing import Optional

class AlexNetLitModule(L.LightningModule):
    def __init__(self, num_classes: int = NUM_CLASSES, lr: float = LR, class_weights: Optional[list] = None):
        super().__init__()
        self.save_hyperparameters()
        self.model = build_alexnet_classifier(num_classes=num_classes, pretrained=True)

        # class weights buffer
        if class_weights is not None:
            cw = torch.as_tensor(class_weights, dtype=torch.float32)
            self.register_buffer("class_weights", cw)
        else:
            self.class_weights = None

        self.criterion = nn.CrossEntropyLoss(weight=self.class_weights)
        self.lr = lr

        # caches for validation metrics
        self._val_targets, self._val_probs, self._val_preds = [], [], []

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        if batch is None:
            return None
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y).float().mean()
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_acc",  acc,  on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def on_validation_epoch_start(self):
        self._val_targets, self._val_probs, self._val_preds = [], [], []

    def validation_step(self, batch, batch_idx):
        if batch is None:
            return None
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        probs = torch.softmax(logits, dim=1)[:, 1] if logits.size(1) > 1 else torch.sigmoid(logits).squeeze(-1)
        acc = (preds == y).float().mean()

        self._val_targets.extend(y.detach().cpu().tolist())
        self._val_preds.extend(preds.detach().cpu().tolist())
        self._val_probs.extend(probs.detach().cpu().tolist())

        self.log("val_loss", loss, on_epoch=True, prog_bar=True)
        self.log("val_acc",  acc,  on_epoch=True, prog_bar=True)
        return {"val_loss": loss, "val_acc": acc}

    def on_validation_epoch_end(self):
        if not self._val_targets:
            return
        import numpy as np
        y_true  = np.array(self._val_targets, dtype=int)
        y_pred  = np.array(self._val_preds,   dtype=int)
        y_score = np.array(self._val_probs,   dtype=float)

        # ranking metrics
        try:
            roc_auc = roc_auc_score(y_true, y_score)
        except Exception:
            roc_auc = float("nan")
        try:
            pr_auc  = average_precision_score(y_true, y_score)
        except Exception:
            pr_auc = float("nan")

        # thresholded metrics (fraud class = 1)
        try:
            f1   = f1_score(y_true, y_pred, pos_label=1)
            prec = precision_score(y_true, y_pred, pos_label=1)
            rec  = recall_score(y_true, y_pred, pos_label=1)
            mcc  = matthews_corrcoef(y_true, y_pred)
            bal_acc = balanced_accuracy_score(y_true, y_pred)
        except Exception:
            f1 = prec = rec = mcc = bal_acc = float("nan")

        self.log("PR_AUC",        pr_auc,   prog_bar=True,  on_epoch=True)
        self.log("ROC_AUC",       roc_auc,  prog_bar=False, on_epoch=True)
        self.log("F1_fraud",      f1,       prog_bar=True,  on_epoch=True)
        self.log("Prec_fraud",    prec,     prog_bar=False, on_epoch=True)
        self.log("Rec_fraud",     rec,      prog_bar=False, on_epoch=True)
        self.log("MCC",           mcc,      prog_bar=False, on_epoch=True)
        self.log("BalancedAcc",   bal_acc,  prog_bar=False, on_epoch=True)

    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=self.lr)


In [7]:

monitors_metric = "PR_AUC"

checkpoint_cb = ModelCheckpoint(
    monitor=monitors_metric,
    mode="max",
    save_last=True,
    save_top_k=1,
    filename="alexnet-{epoch:02d}-{val_acc:.4f}"
)
early_cb = EarlyStopping(
    monitor=monitors_metric, 
    mode="max", 
    patience=3
)
lrmon = LearningRateMonitor(logging_interval="epoch")

# precision/accelerator/devices logic
accelerator = "gpu" if torch.cuda.is_available() else "cpu"
devices = 1
precision = "16-mixed" if torch.cuda.is_available() else "32-true"

logger = CSVLogger(save_dir="logs", name="alexnet_finetune")

model = AlexNetLitModule(num_classes=NUM_CLASSES, lr=LR, class_weights=CLASS_WEIGHTS)

trainer = L.Trainer(
    max_epochs=MAX_EPOCHS,
    accelerator=accelerator,
    devices=devices,
    callbacks=[checkpoint_cb, early_cb, lrmon],
    log_every_n_steps=20,
    precision=precision,
    logger=logger
)

trainer.fit(model, train_loader, val_loader)


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/envs/myenv/lib/python3.13/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | AlexNet          | 57.0 M | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
57.0 M    Trainable params
0         Non-trainable params
57.0 M    Total params
228.048   Total estimated model params size (MB)
25        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/anaconda3/envs/myenv/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:428: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

/opt/anaconda3/envs/myenv/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:428: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0: 100%|██████████| 161/161 [01:17<00:00,  2.09it/s, v_num=4, train_loss_step=0.504, train_acc_step=0.786]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/41 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▏         | 1/41 [00:00<00:08,  4.46it/s][A
Validation DataLoader 0:   5%|▍         | 2/41 [00:00<00:08,  4.73it/s][A
Validation DataLoader 0:   7%|▋         | 3/41 [00:00<00:07,  4.84it/s][A
Validation DataLoader 0:  10%|▉         | 4/41 [00:00<00:07,  4.91it/s][A
Validation DataLoader 0:  12%|█▏        | 5/41 [00:01<00:07,  4.94it/s][A
Validation DataLoader 0:  15%|█▍        | 6/41 [00:01<00:07,  4.97it/s][A
Validation DataLoader 0:  17%|█▋        | 7/41 [00:01<00:06,  4.98it/s][A
Validation DataLoader 0:  20%|█▉        | 8/41 [00:01<00:06,  5.00it/s][A
Validation DataLoader 0:  22%|██▏       | 9/41 [00:01<00:06,  4.76it/s][A
Validation DataLoader 0:  24%|██▍       | 10/41



Epoch 1: 100%|██████████| 161/161 [01:27<00:00,  1.83it/s, v_num=4, train_loss_step=0.387, train_acc_step=0.929, val_loss=0.505, val_acc=0.693, PR_AUC=0.625, F1_fraud=0.428, train_loss_epoch=0.559, train_acc_epoch=0.766] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/41 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▏         | 1/41 [00:00<00:09,  4.33it/s][A
Validation DataLoader 0:   5%|▍         | 2/41 [00:00<00:08,  4.65it/s][A
Validation DataLoader 0:   7%|▋         | 3/41 [00:00<00:07,  4.79it/s][A
Validation DataLoader 0:  10%|▉         | 4/41 [00:00<00:07,  4.88it/s][A
Validation DataLoader 0:  12%|█▏        | 5/41 [00:01<00:07,  4.92it/s][A
Validation DataLoader 0:  15%|█▍        | 6/41 [00:01<00:07,  4.95it/s][A
Validation DataLoader 0:  17%|█▋        | 7/41 [00:01<00:06,  4.97it/s][A
Validation DataLoader 0:  20%|█▉        | 8/41 [00:01<00:06,  5.00it/s][A
Validation Da



Epoch 2: 100%|██████████| 161/161 [01:27<00:00,  1.84it/s, v_num=4, train_loss_step=0.293, train_acc_step=0.893, val_loss=0.448, val_acc=0.860, PR_AUC=0.671, F1_fraud=0.581, train_loss_epoch=0.403, train_acc_epoch=0.823] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/41 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▏         | 1/41 [00:00<00:09,  4.23it/s][A
Validation DataLoader 0:   5%|▍         | 2/41 [00:00<00:08,  4.64it/s][A
Validation DataLoader 0:   7%|▋         | 3/41 [00:00<00:07,  4.77it/s][A
Validation DataLoader 0:  10%|▉         | 4/41 [00:00<00:07,  4.84it/s][A
Validation DataLoader 0:  12%|█▏        | 5/41 [00:01<00:07,  4.88it/s][A
Validation DataLoader 0:  15%|█▍        | 6/41 [00:01<00:07,  4.92it/s][A
Validation DataLoader 0:  17%|█▋        | 7/41 [00:01<00:06,  4.94it/s][A
Validation DataLoader 0:  20%|█▉        | 8/41 [00:01<00:06,  4.96it/s][A
Validation Da

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 161/161 [01:50<00:00,  1.46it/s, v_num=4, train_loss_step=0.293, train_acc_step=0.893, val_loss=0.536, val_acc=0.765, PR_AUC=0.660, F1_fraud=0.486, train_loss_epoch=0.339, train_acc_epoch=0.848]


In [8]:

# --- Export validation predictions to CSV ---
model.eval()
preds_all = []
gts_all = []

with torch.no_grad():
    for batch in val_loader:
        if batch is None:
            continue
        x, y = batch
        logits = model(x.to(model.device))
        pred = torch.argmax(logits, dim=1).cpu().numpy().tolist()
        gt = y.numpy().tolist()
        preds_all.extend(pred)
        gts_all.extend(gt)

out_df = pd.DataFrame({"gt": gts_all, "pred": preds_all})
out_path = "val_predictions_alexnet.csv"
out_df.to_csv(out_path, index=False)
print(f"Saved {out_path} with {len(out_df)} rows")




Saved val_predictions_alexnet.csv with 1287 rows


## Optional: Extract 4096-D AlexNet features for multimodal fusion

In [10]:

# This cell turns AlexNet into a 4096-D feature extractor by dropping the last FC layer.
# It saves numpy arrays: features (N,4096) and labels (N,)

from torch.utils.data import DataLoader

def build_alexnet_feature_extractor(pretrained=True):
    weights = models.AlexNet_Weights.IMAGENET1K_V1 if pretrained else None
    model = models.alexnet(weights=weights)
    # Remove the last layer to get 4096-d features
    model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])
    return model.eval()

def extract_features(loader: DataLoader, device='cuda' if torch.cuda.is_available() else 'cpu'):
    fe = build_alexnet_feature_extractor(pretrained=True).to(device).eval()
    feats, labs = [], []
    with torch.no_grad():
        for batch in loader:
            if batch is None:
                continue
            x, y = batch
            x = x.to(device, non_blocking=True)
            v = fe(x).cpu().numpy()  # [B,4096]
            feats.append(v); labs.extend(y.numpy().tolist())
    feats = np.concatenate(feats, axis=0)
    labs = np.array(labs)
    return feats, labs

feat_dir = "alexnet_features"
os.makedirs(feat_dir, exist_ok=True)

train_feats, train_labels = extract_features(train_loader)
val_feats,   val_labels   = extract_features(val_loader)

np.save(os.path.join(feat_dir, "train_alexnet_feats.npy"), train_feats)
np.save(os.path.join(feat_dir, "train_labels.npy"),       train_labels)
np.save(os.path.join(feat_dir, "val_alexnet_feats.npy"),  val_feats)
np.save(os.path.join(feat_dir, "val_labels.npy"),         val_labels)

print("Saved feature arrays under:", feat_dir)
print("Train feats:", train_feats.shape, "Val feats:", val_feats.shape)




Saved feature arrays under: alexnet_features
Train feats: (5148, 4096) Val feats: (1287, 4096)
