<a href="https://colab.research.google.com/github/akhtarali34/AI-Resume-Screening-/blob/main/NakbaTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import numpy as np

from sklearn.metrics import (
    f1_score, accuracy_score,
    precision_score, recall_score,
    confusion_matrix
)

import matplotlib.pyplot as plt
from tqdm import tqdm


In [None]:
!pip install timm scikit-learn




In [None]:
# Pick the accelerator once; every later cell moves tensors/models to `device`.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Device:", device)


Device: cuda


In [None]:
# =======================
# Mount Google Drive
# =======================
from google.colab import drive
drive.mount("/content/drive")

# =======================
# Imports
# =======================
import os
import zipfile
import pandas as pd

# =======================
# Paths
# =======================
ZIP_PATH = "/content/drive/MyDrive/NakbaData/ImgClassDevPhasePublic.zip"
EXTRACT_ROOT = "/content/NakbaData"

# =======================
# Extract dataset
# =======================
os.makedirs(EXTRACT_ROOT, exist_ok=True)

with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(EXTRACT_ROOT)

# =======================
# Locate dataset root
# =======================
# The archive layout is not assumed: the dataset root is the first directory
# (in os.walk order) that contains both label spreadsheets.
DATA_ROOT = None
for root, _, files in os.walk(EXTRACT_ROOT):
    if {"Train.xlsx", "Val.xlsx"}.issubset(files):
        DATA_ROOT = root
        break

if DATA_ROOT is None:
    raise FileNotFoundError("Dataset root not found")

# =======================
# Directories
# =======================
TRAIN_IMG_DIR = os.path.join(DATA_ROOT, "Train")
VAL_IMG_DIR   = os.path.join(DATA_ROOT, "Val")

# =======================
# Load labels
# =======================
train_df = pd.read_excel(os.path.join(DATA_ROOT, "Train.xlsx"))
val_df   = pd.read_excel(os.path.join(DATA_ROOT, "Val.xlsx"))

# =======================
# Label encoding
# =======================
LABEL_MAP = {
    "not_destruction": 0,
    "destruction": 1
}

# Series.map() turns every value that is missing from LABEL_MAP into NaN (and
# the column into float). Such rows would silently corrupt training and the
# metrics, so fail loudly here and keep the encoded labels integer-typed.
for split_name, split_df in (("Train", train_df), ("Val", val_df)):
    encoded = split_df["label"].map(LABEL_MAP)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(split_df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"{split_name}.xlsx contains labels missing from LABEL_MAP: {unknown}"
        )
    # split_df is the same object as train_df / val_df, so this updates them.
    split_df["label"] = encoded.astype("int64")

print("Train samples:", len(train_df))
print("Val samples:", len(val_df))
print(train_df["label"].value_counts())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train samples: 1400
Val samples: 199
label
0    906
1    494
Name: count, dtype: int64


In [None]:
class NakbaDataset(Dataset):
    """Image dataset backed by a label DataFrame and a folder of image files.

    Args:
        df: DataFrame with a ``name`` column (image file name, joined onto
            ``img_dir``) and a ``label`` column.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned.
    """

    def __init__(self, df, img_dir, transform=None):
        # Re-index to 0..n-1 so the integer ``idx`` passed to __getitem__
        # can be used directly with ``.loc``.
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the label table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        The image is loaded from ``img_dir/name``, converted to RGB and passed
        through ``transform`` when one was given; the label is returned as
        stored in the DataFrame.
        """
        img_name = self.df.loc[idx, "name"]
        label = self.df.loc[idx, "label"]

        img_path = os.path.join(self.img_dir, img_name)
        # Use a context manager so the underlying file handle is released
        # deterministically; convert() hands back a separate converted copy,
        # which stays valid after the source image is closed.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, label


In [None]:
# Shared preprocessing constants: the network input side length and the
# per-channel mean/std passed to Normalize in both pipelines.
INPUT_SIZE = 224
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: random crop/flip/rotation/colour jitter for augmentation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(INPUT_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Validation pipeline: deterministic resize only, same normalisation.
val_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])


In [None]:
# Loader settings shared by the train and validation splits.
BATCH_SIZE = 32
NUM_WORKERS = 2

train_dataset = NakbaDataset(df=train_df, img_dir=TRAIN_IMG_DIR, transform=train_transform)
val_dataset = NakbaDataset(df=val_df, img_dir=VAL_IMG_DIR, transform=val_transform)

# Only the training split is shuffled; validation keeps the spreadsheet order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)


In [None]:
def get_model(name):
    """Build a pretrained torchvision backbone with a single-logit head.

    Args:
        name: One of ``"resnet50"``, ``"mobilenet"`` or ``"efficientnet"``.

    Returns:
        The model with its final classification layer replaced by
        ``nn.Linear(in_features, 1)``, moved to ``device``.

    Raises:
        ValueError: If ``name`` is not one of the supported architectures.
    """
    if name == "resnet50":
        model = models.resnet50(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, 1)

    elif name == "mobilenet":
        model = models.mobilenet_v2(pretrained=True)
        model.classifier[1] = nn.Linear(model.last_channel, 1)

    elif name == "efficientnet":
        model = models.efficientnet_b0(pretrained=True)
        model.classifier[1] = nn.Linear(
            model.classifier[1].in_features, 1
        )

    else:
        # Without this branch an unknown name fell through to `model.to(...)`
        # and surfaced as a confusing UnboundLocalError.
        raise ValueError(
            f"Unknown model name {name!r}; expected one of "
            "'resnet50', 'mobilenet', 'efficientnet'"
        )

    return model.to(device)


In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss computed on raw logits.

    Each element's loss is ``alpha * (1 - pt) ** gamma * bce`` where ``bce`` is
    the unreduced ``BCEWithLogitsLoss`` and ``pt = exp(-bce)``; the result is
    averaged over all elements.

    Args:
        alpha: Scalar multiplier applied to every element's loss.
        gamma: Exponent of the ``(1 - pt)`` modulating factor.
    """

    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        # reduction="none" keeps per-element losses so they can be re-weighted.
        self.bce = nn.BCEWithLogitsLoss(reduction="none")

    def forward(self, logits, targets):
        """Return the mean focal loss for ``logits`` against 0/1 ``targets``.

        ``targets`` may be an integer or floating tensor; it is cast to the
        dtype of ``logits`` because ``BCEWithLogitsLoss`` rejects integer
        targets.
        """
        targets = targets.to(dtype=logits.dtype)
        bce_loss = self.bce(logits, targets)
        # exp(-bce) recovers the probability assigned to the true class.
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()


In [None]:
def evaluate(model, loader, threshold=0.5):
    """Score ``model`` on every batch of ``loader`` as a binary classifier.

    Args:
        model: Network returning one logit per sample, shape ``(batch, 1)``.
        loader: Iterable yielding ``(images, targets)`` batches.
        threshold: A sample is predicted positive when its sigmoid
            probability is strictly greater than this value.

    Returns:
        dict with keys ``"f1"`` (macro-averaged), ``"acc"``, ``"precision"``,
        ``"recall"`` and ``"cm"`` (confusion matrix).
    """
    model.eval()
    probs, labels = [], []

    with torch.no_grad():
        for images, targets in loader:
            images = images.to(device)

            # Drop only the logit dimension. A bare squeeze() would also drop
            # the batch dimension of a size-1 batch, yielding a 0-d array that
            # list.extend() cannot iterate over.
            outputs = model(images).squeeze(1)
            p = torch.sigmoid(outputs)

            probs.extend(p.cpu().numpy())
            # Targets are only collected for scoring, so they never need to
            # visit the accelerator.
            labels.extend(targets.cpu().numpy())

    probs = np.array(probs)
    labels = np.array(labels)
    preds = (probs > threshold).astype(int)

    return {
        "f1": f1_score(labels, preds, average="macro"),
        "acc": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "cm": confusion_matrix(labels, preds)
    }


In [None]:
def find_best_threshold(model, loader):
    """Sweep decision thresholds and return the one with the highest macro F1.

    Candidates come from ``np.arange(0.3, 0.71, 0.05)``. Each candidate is
    scored with ``evaluate``; the earliest candidate wins ties. When no
    candidate scores above 0 the defaults ``(0.5, 0)`` are returned.

    Returns:
        ``(threshold, f1)`` for the winning candidate.
    """
    winner_t, winner_f1 = 0.5, 0
    candidates = np.arange(0.3, 0.71, 0.05)

    for candidate in candidates:
        score = evaluate(model, loader, candidate)["f1"]
        # Only a strictly better score replaces the current winner.
        if not (score > winner_f1):
            continue
        winner_f1 = score
        winner_t = candidate

    return winner_t, winner_f1


In [None]:
# Quick schema check: column names first, then the first few rows.
for preview in (train_df.columns, train_df.head()):
    print(preview)


Index(['name', 'label'], dtype='object')
                                                name  label
0  3207587610565662721_1379830319_320758761056566...      0
1  3208072596685860139_6036075308_320807259668586...      0
2  3208234742834392153_53084429579_32082347428343...      0
3  3208236458824789277_37252106111_32082364588247...      1
4  3208376724789487084_1379830319_320837672478948...      0


In [None]:
def train_model(model, loss_fn, epochs=20):
    """Train ``model`` on ``train_loader`` and checkpoint the best epoch.

    After every epoch the model is scored on ``val_loader`` with ``evaluate``.
    The validation macro F1 drives both the ``ReduceLROnPlateau`` scheduler
    and the checkpoint: whenever it beats the best value seen so far, the
    weights are written to ``best_model.pth``.

    Args:
        model: Network returning one logit per sample, shape ``(batch, 1)``.
        loss_fn: Callable ``loss_fn(logits, float_targets)`` returning a
            scalar loss tensor.
        epochs: Number of passes over ``train_loader``.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    # mode="max" because the monitored quantity (F1) should increase.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", patience=3
    )

    best_f1 = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for images, targets in tqdm(train_loader):
            images = images.to(device)
            targets = targets.float().to(device)

            optimizer.zero_grad()
            # Drop only the logit dimension. A bare squeeze() would collapse a
            # size-1 batch to a 0-d tensor that no longer matches `targets`.
            outputs = model(images).squeeze(1)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        metrics = evaluate(model, val_loader)
        scheduler.step(metrics["f1"])

        print(f"\nEpoch {epoch+1}")
        print("Loss:", running_loss / len(train_loader))
        print(metrics)

        if metrics["f1"] > best_f1:
            best_f1 = metrics["f1"]
            torch.save(model.state_dict(), "best_model.pth")
            print("✅ Best model saved")


In [None]:
# Training run configuration.
ARCHITECTURE = "resnet50"
NUM_EPOCHS = 20

model = get_model(ARCHITECTURE)
loss_fn = FocalLoss()

train_model(model, loss_fn, epochs=NUM_EPOCHS)


100%|██████████| 44/44 [00:32<00:00,  1.34it/s]



Epoch 1
Loss: 0.13698747982694345
{'f1': 0.7910909348843808, 'acc': 0.8140703517587939, 'precision': 0.7619047619047619, 'recall': 0.6857142857142857, 'cm': array([[114,  15],
       [ 22,  48]])}
✅ Best model saved


100%|██████████| 44/44 [00:33<00:00,  1.30it/s]



Epoch 2
Loss: 0.09929138574410569
{'f1': 0.6378779395296752, 'acc': 0.7386934673366834, 'precision': 0.875, 'recall': 0.3, 'cm': array([[126,   3],
       [ 49,  21]])}


100%|██████████| 44/44 [00:32<00:00,  1.36it/s]



Epoch 3
Loss: 0.10042552887038751
{'f1': 0.7846320346320346, 'acc': 0.8090452261306532, 'precision': 0.7580645161290323, 'recall': 0.6714285714285714, 'cm': array([[114,  15],
       [ 23,  47]])}


100%|██████████| 44/44 [00:32<00:00,  1.36it/s]



Epoch 4
Loss: 0.08847512745044449
{'f1': 0.7709771918811468, 'acc': 0.7788944723618091, 'precision': 0.6413043478260869, 'recall': 0.8428571428571429, 'cm': array([[96, 33],
       [11, 59]])}


100%|██████████| 44/44 [00:35<00:00,  1.23it/s]



Epoch 5
Loss: 0.08134110094132749
{'f1': 0.7760440429479576, 'acc': 0.8140703517587939, 'precision': 0.851063829787234, 'recall': 0.5714285714285714, 'cm': array([[122,   7],
       [ 30,  40]])}


100%|██████████| 44/44 [00:32<00:00,  1.34it/s]



Epoch 6
Loss: 0.06956643526527015
{'f1': 0.7615301889691197, 'acc': 0.8040201005025126, 'precision': 0.8444444444444444, 'recall': 0.5428571428571428, 'cm': array([[122,   7],
       [ 32,  38]])}


100%|██████████| 44/44 [00:30<00:00,  1.44it/s]



Epoch 7
Loss: 0.06511189712380822
{'f1': 0.7831719128329298, 'acc': 0.8190954773869347, 'precision': 0.8541666666666666, 'recall': 0.5857142857142857, 'cm': array([[122,   7],
       [ 29,  41]])}


100%|██████████| 44/44 [00:31<00:00,  1.41it/s]



Epoch 8
Loss: 0.0607830969278108
{'f1': 0.8040695766327535, 'acc': 0.8341708542713567, 'precision': 0.8627450980392157, 'recall': 0.6285714285714286, 'cm': array([[122,   7],
       [ 26,  44]])}
✅ Best model saved


100%|██████████| 44/44 [00:30<00:00,  1.44it/s]



Epoch 9
Loss: 0.06206415627490391
{'f1': 0.785988631126133, 'acc': 0.8241206030150754, 'precision': 0.8888888888888888, 'recall': 0.5714285714285714, 'cm': array([[124,   5],
       [ 30,  40]])}


100%|██████████| 44/44 [00:30<00:00,  1.43it/s]



Epoch 10
Loss: 0.05681423080915755
{'f1': 0.7760440429479576, 'acc': 0.8140703517587939, 'precision': 0.851063829787234, 'recall': 0.5714285714285714, 'cm': array([[122,   7],
       [ 30,  40]])}


100%|██████████| 44/44 [00:30<00:00,  1.45it/s]



Epoch 11
Loss: 0.05548336699774319
{'f1': 0.7787249814677539, 'acc': 0.8190954773869347, 'precision': 0.8863636363636364, 'recall': 0.5571428571428572, 'cm': array([[124,   5],
       [ 31,  39]])}


100%|██████████| 44/44 [00:30<00:00,  1.46it/s]



Epoch 12
Loss: 0.05190151840956374
{'f1': 0.8040695766327535, 'acc': 0.8341708542713567, 'precision': 0.8627450980392157, 'recall': 0.6285714285714286, 'cm': array([[122,   7],
       [ 26,  44]])}


100%|██████████| 44/44 [00:30<00:00,  1.45it/s]



Epoch 13
Loss: 0.05166570990431038
{'f1': 0.8040695766327535, 'acc': 0.8341708542713567, 'precision': 0.8627450980392157, 'recall': 0.6285714285714286, 'cm': array([[122,   7],
       [ 26,  44]])}


100%|██████████| 44/44 [00:30<00:00,  1.46it/s]



Epoch 14
Loss: 0.049945054279470984
{'f1': 0.830886446886447, 'acc': 0.8542713567839196, 'precision': 0.8727272727272727, 'recall': 0.6857142857142857, 'cm': array([[122,   7],
       [ 22,  48]])}
✅ Best model saved


100%|██████████| 44/44 [00:30<00:00,  1.46it/s]



Epoch 15
Loss: 0.05380384141409939
{'f1': 0.817620103473762, 'acc': 0.8442211055276382, 'precision': 0.8679245283018868, 'recall': 0.6571428571428571, 'cm': array([[122,   7],
       [ 24,  46]])}


100%|██████████| 44/44 [00:30<00:00,  1.45it/s]



Epoch 16
Loss: 0.053610369182107126
{'f1': 0.817620103473762, 'acc': 0.8442211055276382, 'precision': 0.8679245283018868, 'recall': 0.6571428571428571, 'cm': array([[122,   7],
       [ 24,  46]])}


100%|██████████| 44/44 [00:30<00:00,  1.45it/s]



Epoch 17
Loss: 0.05411439782685854
{'f1': 0.8242877325170709, 'acc': 0.8492462311557789, 'precision': 0.8703703703703703, 'recall': 0.6714285714285714, 'cm': array([[122,   7],
       [ 23,  47]])}


100%|██████████| 44/44 [00:30<00:00,  1.45it/s]



Epoch 18
Loss: 0.05087910393591632
{'f1': 0.817620103473762, 'acc': 0.8442211055276382, 'precision': 0.8679245283018868, 'recall': 0.6571428571428571, 'cm': array([[122,   7],
       [ 24,  46]])}


100%|██████████| 44/44 [00:30<00:00,  1.45it/s]



Epoch 19
Loss: 0.048673064574937926
{'f1': 0.830886446886447, 'acc': 0.8542713567839196, 'precision': 0.8727272727272727, 'recall': 0.6857142857142857, 'cm': array([[122,   7],
       [ 22,  48]])}


100%|██████████| 44/44 [00:29<00:00,  1.47it/s]



Epoch 20
Loss: 0.0515242515724491
{'f1': 0.8374183006535948, 'acc': 0.8592964824120602, 'precision': 0.875, 'recall': 0.7, 'cm': array([[122,   7],
       [ 21,  49]])}
✅ Best model saved


In [None]:
# Robustness probe: re-score the validation images after a Gaussian blur is
# applied on top of the plain resize + normalise preprocessing.
BLUR_KERNEL_SIZE = 5

robust_steps = [
    transforms.Resize((224, 224)),
    transforms.GaussianBlur(BLUR_KERNEL_SIZE),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
robust_transform = transforms.Compose(robust_steps)

robust_dataset = NakbaDataset(df=val_df, img_dir=VAL_IMG_DIR, transform=robust_transform)
robust_loader = DataLoader(robust_dataset, batch_size=32)

robust_metrics = evaluate(model, robust_loader)
print("Robustness Macro F1:", robust_metrics["f1"])


Robustness Macro F1: 0.8242877325170709


In [None]:
# Restore the checkpoint written during training into a fresh ResNet-50,
# then switch to inference mode.
best_state = torch.load("best_model.pth", map_location=device)
model = get_model("resnet50")
model.load_state_dict(best_state)
model.eval()




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
# Run inference on VALIDATION SET (no shuffle)
# val_loader keeps the spreadsheet order, so predictions[i] belongs to row i
# of val_df.
predictions = []

with torch.no_grad():
    for images, _ in tqdm(val_loader):
        images = images.to(device)

        # Drop only the logit dimension. A bare squeeze() would collapse a
        # size-1 batch to a 0-d array, which list.extend() cannot iterate.
        outputs = model(images).squeeze(1)
        probs = torch.sigmoid(outputs)
        # Hard decision at the 0.5 probability threshold.
        preds = (probs > 0.5).long().cpu().numpy()

        predictions.extend(preds)


100%|██████████| 7/7 [00:04<00:00,  1.72it/s]


In [None]:
# Translate integer class ids back into the text labels used in the
# spreadsheets.
INV_LABEL_MAP = dict([
    (0, "not_destruction"),
    (1, "destruction"),
])

predicted_labels = []
for class_id in predictions:
    predicted_labels.append(INV_LABEL_MAP[class_id])


In [None]:
# Assemble the submission table; rows follow the order of val_df so each
# prediction lines up with its image name.
submission_columns = dict(
    name=val_df["name"].values,
    label=predicted_labels,
)
submission_df = pd.DataFrame(submission_columns)


In [None]:
# Show the first five submission rows as a sanity check.
submission_df.iloc[:5]


Unnamed: 0,name,label
0,3208719757836632197_6193457977_320871975783663...,not_destruction
1,3208891248927756376_47097946384_32088912489277...,not_destruction
2,3209223995252929700_1476949184_386898870_33482...,destruction
3,3210450343581475802_51621841193_32104503435814...,destruction
4,3212537336353015700_44724541933_32125373363530...,destruction


In [None]:
# Submission folder on Drive (created below when it does not exist yet).
SUBMISSION_DIR = "/content/drive/MyDrive/NakbaTask/submission"
CSV_PATH = os.path.join(SUBMISSION_DIR, "predictions.csv")
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Build the submission table in val_df order (name, predicted text label).
submission_df = pd.DataFrame(
    dict(name=val_df["name"].values, label=predicted_labels)
)

# Write it without the index column, UTF-8 encoded.
submission_df.to_csv(CSV_PATH, encoding="utf-8", index=False)

print("✅ predictions.csv saved to Drive:", CSV_PATH)

✅ predictions.csv saved to Drive: /content/drive/MyDrive/NakbaTask/submission/predictions.csv


In [None]:
import zipfile
import os

# Package the CSV as predictions.zip next to it; inside the archive the file
# is stored under the bare name "predictions.csv".
ZIP_PATH = os.path.join(SUBMISSION_DIR, "predictions.zip")

with zipfile.ZipFile(ZIP_PATH, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(filename=CSV_PATH, arcname="predictions.csv")

print("✅ predictions.zip saved to Drive:", ZIP_PATH)

✅ predictions.zip saved to Drive: /content/drive/MyDrive/NakbaTask/submission/predictions.zip


In [None]:
# Mounts Google Drive at /content/drive. The data-loading cell earlier in this
# notebook already performs the same mount, so this call is redundant when the
# notebook is run top to bottom.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
