In [1]:
import torch, os
import torch.nn as nn
import timm
from PIL import Image
from torchvision import transforms
from transformers import (
    RobertaTokenizerFast,
    AutoModel,
    CLIPModel,
    AutoTokenizer,
    AutoModel as HF_AutoModel,
)

# --- Runtime configuration --------------------------------------------------
# Use the GPU when CUDA is available, otherwise run everything on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

# Square side length (pixels) used by the Task 3/4 image transform and by the
# dummy tensor that probes the image backbone's output width.
IMG_SIZE = 256
# Sequence length the Task 2-4 tokenizers pad/truncate to.
MAX_LEN = 96

# Fine-tuned checkpoint files, one per task (Task 1 .. Task 4).
T1_PATH, T2_PATH, T3_PATH, T4_PATH = (
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T1_FUSION_FINAL.pt",
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T2T_TEXT.pt",
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T3_MULTIMODAL.pt",
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T4_MULTIMODAL.pt",
)

class T1_FUSION(nn.Module):
    """Task 1 text+image fusion classifier producing two logits.

    Text is encoded by ``distilroberta-base`` and images by the vision tower
    of ``openai/clip-vit-base-patch32``.  Each 768-d feature is projected to
    256-d, the two projections are concatenated (512-d) and a small MLP maps
    the result to the two class logits.

    The attribute names (``txt``, ``img``, ``txtp``, ``imgp``, ``fc``) define
    the ``state_dict`` keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self):
        super().__init__()
        self.txt = HF_AutoModel.from_pretrained("distilroberta-base")
        self.img = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        # The CLIP vision tower is frozen: none of its weights get gradients.
        for weight in self.img.vision_model.parameters():
            weight.requires_grad = False
        # Per-modality projections into a shared 256-d space.
        self.txtp = nn.Linear(768, 256)
        self.imgp = nn.Linear(768, 256)
        # Classifier over the concatenated (256 + 256) projections.
        self.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Linear(256, 2),
        )

    def forward(self, ids, mask, img):
        """Return class logits for a batch.

        Args:
            ids: Token ids passed to the text encoder as ``input_ids``.
            mask: Attention mask passed to the text encoder.
            img: Image batch passed to the CLIP vision model.
        """
        text_out = self.txt(input_ids=ids, attention_mask=mask)
        # First-token hidden state is used as the sentence representation.
        text_feat = text_out.last_hidden_state[:, 0]
        # No autograd graph is built through the frozen vision tower.
        with torch.no_grad():
            image_feat = self.img.vision_model(img).pooler_output
        joint = torch.cat((self.txtp(text_feat), self.imgp(image_feat)), dim=1)
        return self.fc(joint)

# Task 1 preprocessing: tokenizer for the DistilRoBERTa text encoder, plus the
# image transform (224x224 resize -> tensor -> per-channel normalisation).
t1_tok = RobertaTokenizerFast.from_pretrained("distilroberta-base")
t1_img_tfms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.481, 0.457, 0.408],
            std=[0.269, 0.261, 0.276],
        ),
    ]
)
# Class index -> label name for the Task 1 logits.
t1_id2label = dict(enumerate(["non_informative", "informative"]))

class T2TextModel(nn.Module):
    """Text-only classifier: DistilBERT first token -> LayerNorm -> MLP head.

    Args:
        num_classes: Number of output logits (default 3).

    The attribute names (``txt``, ``ln``, ``head``) define the ``state_dict``
    keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        self.txt = HF_AutoModel.from_pretrained("distilbert-base-uncased")
        hidden = self.txt.config.hidden_size
        self.ln = nn.LayerNorm(hidden)
        self.head = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits for the tokenized batch."""
        encoded = self.txt(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state is used as the sentence representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.head(self.ln(first_token))

# Task 2 preprocessing: tokenizer matching the DistilBERT encoder, and the
# class index -> label name map for the Task 2 logits.
t2_tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
t2_id2label = dict(enumerate(["humanitarian", "non_informative", "structure"]))

class T3Multimodal(nn.Module):
    """Task 3 image+text classifier (timm backbone + DistilBERT + MLP head).

    Image features from the timm backbone are concatenated with the
    layer-normalised DistilBERT first-token embedding and classified by a
    three-layer MLP.

    Args:
        num_classes: Number of output logits (default 3).
        backbone_name: timm model name for the image branch.

    The attribute names (``img``, ``txt``, ``txt_ln``, ``head``) define the
    ``state_dict`` keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self, num_classes=3, backbone_name="convnext_tiny"):
        super().__init__()
        # num_classes=0 requests the backbone without a classification layer.
        self.img = timm.create_model(
            backbone_name,
            pretrained=True,
            num_classes=0,
            drop_path_rate=0.0,
        )
        # Discover the backbone's feature width by pushing one blank image
        # through it, so the head adapts to whichever backbone was chosen.
        with torch.no_grad():
            probe = torch.zeros(1, 3, IMG_SIZE, IMG_SIZE)
            image_dim = self.img(probe).shape[-1]

        self.txt = HF_AutoModel.from_pretrained("distilbert-base-uncased")
        text_dim = self.txt.config.hidden_size
        self.txt_ln = nn.LayerNorm(text_dim)

        self.head = nn.Sequential(
            nn.Linear(image_dim + text_dim, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, input_ids, attention_mask, image):
        """Return class logits for a batch of (text, image) pairs."""
        image_feat = self.img(image)
        encoded = self.txt(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state, layer-normalised, represents the text.
        text_feat = self.txt_ln(encoded.last_hidden_state[:, 0, :])
        joint = torch.cat((image_feat, text_feat), dim=-1)
        return self.head(joint)

# Task 3 preprocessing: DistilBERT tokenizer and the image transform
# (IMG_SIZE x IMG_SIZE resize -> tensor -> per-channel normalisation).
t3_tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
t3_img_tfms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[.485, .456, .406], std=[.229, .224, .225]),
    ]
)
# Class index -> label name for the Task 3 logits.
t3_id2label = dict(
    enumerate(["little_or_no_damage", "mild_damage", "severe_damage"])
)

class T4Multimodal(nn.Module):
    """Task 4 image+text classifier (timm backbone + DistilBERT + MLP head).

    Same layout as the Task 3 model, except that the image backbone is created
    with ``drop_path_rate=0.1``.

    Args:
        num_classes: Number of output logits (default 3).
        backbone_name: timm model name for the image branch.

    The attribute names (``img``, ``txt``, ``txt_ln``, ``head``) define the
    ``state_dict`` keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self, num_classes=3, backbone_name="convnext_tiny"):
        super().__init__()
        # num_classes=0 requests the backbone without a classification layer.
        self.img = timm.create_model(
            backbone_name,
            pretrained=True,
            num_classes=0,
            drop_path_rate=0.1,
        )
        # Discover the backbone's feature width by pushing one blank image
        # through it, so the head adapts to whichever backbone was chosen.
        with torch.no_grad():
            probe = torch.zeros(1, 3, IMG_SIZE, IMG_SIZE)
            image_dim = self.img(probe).shape[-1]

        self.txt = HF_AutoModel.from_pretrained("distilbert-base-uncased")
        text_dim = self.txt.config.hidden_size
        self.txt_ln = nn.LayerNorm(text_dim)

        self.head = nn.Sequential(
            nn.Linear(image_dim + text_dim, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, input_ids, attention_mask, image):
        """Return class logits for a batch of (text, image) pairs."""
        image_feat = self.img(image)
        encoded = self.txt(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state, layer-normalised, represents the text.
        text_feat = self.txt_ln(encoded.last_hidden_state[:, 0, :])
        joint = torch.cat((image_feat, text_feat), dim=-1)
        return self.head(joint)

# Task 4 preprocessing: DistilBERT tokenizer; images reuse the Task 3
# transform object as-is.
t4_tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
t4_img_tfms = t3_img_tfms
# Class index -> label name for the Task 4 logits.
t4_id2label = dict(enumerate(["people_affected", "rescue_needed", "no_human"]))

def _load_for_inference(model, checkpoint_path):
    """Move *model* to DEVICE, load its weights, and switch it to eval mode.

    Args:
        model: A freshly constructed ``nn.Module``.
        checkpoint_path: Path to a file holding the model's ``state_dict``.

    Returns:
        The same model instance, on DEVICE and in eval mode.

    ``weights_only=True`` restricts unpickling to tensors and plain
    containers, so a tampered checkpoint file cannot run arbitrary code while
    it is being loaded.  The loaded object is handed straight to
    ``load_state_dict``, i.e. it is expected to be a plain state dict.
    """
    model = model.to(DEVICE)
    state_dict = torch.load(
        checkpoint_path, map_location=DEVICE, weights_only=True
    )
    model.load_state_dict(state_dict)
    # eval() disables dropout / stochastic depth for deterministic inference.
    return model.eval()


t1_model = _load_for_inference(T1_FUSION(), T1_PATH)
t2_model = _load_for_inference(T2TextModel(num_classes=3), T2_PATH)
t3_model = _load_for_inference(T3Multimodal(num_classes=3), T3_PATH)
t4_model = _load_for_inference(T4Multimodal(num_classes=3), T4_PATH)

print("✅ All four models loaded")

  from .autonotebook import tqdm as notebook_tqdm


✅ All four models loaded


In [2]:
import torch, os
import torch.nn as nn
import timm
from PIL import Image
from torchvision import transforms
from transformers import (
    RobertaTokenizerFast,
    AutoModel,
    CLIPModel,
    AutoTokenizer,
    AutoModel as HF_AutoModel,
)

# --- Runtime configuration --------------------------------------------------
# Use the GPU when CUDA is available, otherwise run everything on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

# Square side length (pixels) used by the Task 3/4 image transform and by the
# dummy tensor that probes the image backbone's output width.
IMG_SIZE = 256
# Sequence length the Task 2-4 tokenizers pad/truncate to.
MAX_LEN = 96

# Fine-tuned checkpoint files, one per task (Task 1 .. Task 4).
T1_PATH, T2_PATH, T3_PATH, T4_PATH = (
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T1_FUSION_FINAL.pt",
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T2T_TEXT.pt",
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T3_MULTIMODAL.pt",
    r"/Users/yatharthnehva/Desktop/ALL_MODELS/T4_MULTIMODAL.pt",
)

class T1_FUSION(nn.Module):
    """Task 1 text+image fusion classifier producing two logits.

    Text is encoded by ``distilroberta-base`` and images by the vision tower
    of ``openai/clip-vit-base-patch32``.  Each 768-d feature is projected to
    256-d, the two projections are concatenated (512-d) and a small MLP maps
    the result to the two class logits.

    The attribute names (``txt``, ``img``, ``txtp``, ``imgp``, ``fc``) define
    the ``state_dict`` keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self):
        super().__init__()
        self.txt = HF_AutoModel.from_pretrained("distilroberta-base")
        self.img = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        # The CLIP vision tower is frozen: none of its weights get gradients.
        for weight in self.img.vision_model.parameters():
            weight.requires_grad = False
        # Per-modality projections into a shared 256-d space.
        self.txtp = nn.Linear(768, 256)
        self.imgp = nn.Linear(768, 256)
        # Classifier over the concatenated (256 + 256) projections.
        self.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Linear(256, 2),
        )

    def forward(self, ids, mask, img):
        """Return class logits for a batch.

        Args:
            ids: Token ids passed to the text encoder as ``input_ids``.
            mask: Attention mask passed to the text encoder.
            img: Image batch passed to the CLIP vision model.
        """
        text_out = self.txt(input_ids=ids, attention_mask=mask)
        # First-token hidden state is used as the sentence representation.
        text_feat = text_out.last_hidden_state[:, 0]
        # No autograd graph is built through the frozen vision tower.
        with torch.no_grad():
            image_feat = self.img.vision_model(img).pooler_output
        joint = torch.cat((self.txtp(text_feat), self.imgp(image_feat)), dim=1)
        return self.fc(joint)

# Task 1 preprocessing: tokenizer for the DistilRoBERTa text encoder, plus the
# image transform (224x224 resize -> tensor -> per-channel normalisation).
t1_tok = RobertaTokenizerFast.from_pretrained("distilroberta-base")
t1_img_tfms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.481, 0.457, 0.408],
            std=[0.269, 0.261, 0.276],
        ),
    ]
)
# Class index -> label name for the Task 1 logits.
t1_id2label = dict(enumerate(["non_informative", "informative"]))

class T2TextModel(nn.Module):
    """Text-only classifier: DistilBERT first token -> LayerNorm -> MLP head.

    Args:
        num_classes: Number of output logits (default 3).

    The attribute names (``txt``, ``ln``, ``head``) define the ``state_dict``
    keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        self.txt = HF_AutoModel.from_pretrained("distilbert-base-uncased")
        hidden = self.txt.config.hidden_size
        self.ln = nn.LayerNorm(hidden)
        self.head = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits for the tokenized batch."""
        encoded = self.txt(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state is used as the sentence representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.head(self.ln(first_token))

# Task 2 preprocessing: tokenizer matching the DistilBERT encoder, and the
# class index -> label name map for the Task 2 logits.
t2_tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
t2_id2label = dict(enumerate(["humanitarian", "non_informative", "structure"]))

class T3Multimodal(nn.Module):
    """Task 3 image+text classifier (timm backbone + DistilBERT + MLP head).

    Image features from the timm backbone are concatenated with the
    layer-normalised DistilBERT first-token embedding and classified by a
    three-layer MLP.

    Args:
        num_classes: Number of output logits (default 3).
        backbone_name: timm model name for the image branch.

    The attribute names (``img``, ``txt``, ``txt_ln``, ``head``) define the
    ``state_dict`` keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self, num_classes=3, backbone_name="convnext_tiny"):
        super().__init__()
        # num_classes=0 requests the backbone without a classification layer.
        self.img = timm.create_model(
            backbone_name,
            pretrained=True,
            num_classes=0,
            drop_path_rate=0.0,
        )
        # Discover the backbone's feature width by pushing one blank image
        # through it, so the head adapts to whichever backbone was chosen.
        with torch.no_grad():
            probe = torch.zeros(1, 3, IMG_SIZE, IMG_SIZE)
            image_dim = self.img(probe).shape[-1]

        self.txt = HF_AutoModel.from_pretrained("distilbert-base-uncased")
        text_dim = self.txt.config.hidden_size
        self.txt_ln = nn.LayerNorm(text_dim)

        self.head = nn.Sequential(
            nn.Linear(image_dim + text_dim, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, input_ids, attention_mask, image):
        """Return class logits for a batch of (text, image) pairs."""
        image_feat = self.img(image)
        encoded = self.txt(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state, layer-normalised, represents the text.
        text_feat = self.txt_ln(encoded.last_hidden_state[:, 0, :])
        joint = torch.cat((image_feat, text_feat), dim=-1)
        return self.head(joint)

# Task 3 preprocessing: DistilBERT tokenizer and the image transform
# (IMG_SIZE x IMG_SIZE resize -> tensor -> per-channel normalisation).
t3_tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
t3_img_tfms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[.485, .456, .406], std=[.229, .224, .225]),
    ]
)
# Class index -> label name for the Task 3 logits.
t3_id2label = dict(
    enumerate(["little_or_no_damage", "mild_damage", "severe_damage"])
)

class T4Multimodal(nn.Module):
    """Task 4 image+text classifier (timm backbone + DistilBERT + MLP head).

    Same layout as the Task 3 model, except that the image backbone is created
    with ``drop_path_rate=0.1``.

    Args:
        num_classes: Number of output logits (default 3).
        backbone_name: timm model name for the image branch.

    The attribute names (``img``, ``txt``, ``txt_ln``, ``head``) define the
    ``state_dict`` keys, so they must stay in sync with saved checkpoints.
    """

    def __init__(self, num_classes=3, backbone_name="convnext_tiny"):
        super().__init__()
        # num_classes=0 requests the backbone without a classification layer.
        self.img = timm.create_model(
            backbone_name,
            pretrained=True,
            num_classes=0,
            drop_path_rate=0.1,
        )
        # Discover the backbone's feature width by pushing one blank image
        # through it, so the head adapts to whichever backbone was chosen.
        with torch.no_grad():
            probe = torch.zeros(1, 3, IMG_SIZE, IMG_SIZE)
            image_dim = self.img(probe).shape[-1]

        self.txt = HF_AutoModel.from_pretrained("distilbert-base-uncased")
        text_dim = self.txt.config.hidden_size
        self.txt_ln = nn.LayerNorm(text_dim)

        self.head = nn.Sequential(
            nn.Linear(image_dim + text_dim, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, input_ids, attention_mask, image):
        """Return class logits for a batch of (text, image) pairs."""
        image_feat = self.img(image)
        encoded = self.txt(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state, layer-normalised, represents the text.
        text_feat = self.txt_ln(encoded.last_hidden_state[:, 0, :])
        joint = torch.cat((image_feat, text_feat), dim=-1)
        return self.head(joint)

# Task 4 preprocessing: DistilBERT tokenizer; images reuse the Task 3
# transform object as-is.
t4_tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
t4_img_tfms = t3_img_tfms
# Class index -> label name for the Task 4 logits.
t4_id2label = dict(enumerate(["people_affected", "rescue_needed", "no_human"]))

def _load_for_inference(model, checkpoint_path):
    """Move *model* to DEVICE, load its weights, and switch it to eval mode.

    Args:
        model: A freshly constructed ``nn.Module``.
        checkpoint_path: Path to a file holding the model's ``state_dict``.

    Returns:
        The same model instance, on DEVICE and in eval mode.

    ``weights_only=True`` restricts unpickling to tensors and plain
    containers, so a tampered checkpoint file cannot run arbitrary code while
    it is being loaded.  The loaded object is handed straight to
    ``load_state_dict``, i.e. it is expected to be a plain state dict.
    """
    model = model.to(DEVICE)
    state_dict = torch.load(
        checkpoint_path, map_location=DEVICE, weights_only=True
    )
    model.load_state_dict(state_dict)
    # eval() disables dropout / stochastic depth for deterministic inference.
    return model.eval()


t1_model = _load_for_inference(T1_FUSION(), T1_PATH)
t2_model = _load_for_inference(T2TextModel(num_classes=3), T2_PATH)
t3_model = _load_for_inference(T3Multimodal(num_classes=3), T3_PATH)
t4_model = _load_for_inference(T4Multimodal(num_classes=3), T4_PATH)

print("✅ All four models loaded")

# Sample tweet used to exercise the pipeline; it is passed to
# run_full_pipeline at the end of this cell.
tweet_text = (
    "A large apartment building has collapsed after the earthquake, the entire street is underwater, "
    "and dozens of injured people are still trapped inside waiting for rescue teams to arrive."
)

def load_image_for_t1(path=None):
    """Return a Task 1 image batch on DEVICE.

    Args:
        path: Optional path to an image file.  When given, the file is opened
            and converted to RGB.  When ``None`` (the default), an all-black
            224x224 placeholder is used so text-only input can still be run.

    Returns:
        The image after ``t1_img_tfms`` with a leading batch dimension of 1,
        moved to DEVICE.

    Raises:
        OSError: If ``path`` is given but cannot be opened or decoded as an
            image (includes ``FileNotFoundError``).
    """
    if path is None:
        img = Image.new("RGB", (224, 224), (0, 0, 0))
    else:
        # convert() decodes the pixel data into a new image, so the file
        # handle can be released as soon as the with-block exits.
        with Image.open(path) as opened:
            img = opened.convert("RGB")
    return t1_img_tfms(img).unsqueeze(0).to(DEVICE)

def load_image_for_t34(path=None):
    """Return a Task 3/4 image batch on DEVICE.

    Args:
        path: Optional path to an image file.  When given, the file is opened
            and converted to RGB.  When ``None`` (the default), an all-black
            IMG_SIZE x IMG_SIZE placeholder is used so text-only input can
            still be run.

    Returns:
        The image after ``t3_img_tfms`` with a leading batch dimension of 1,
        moved to DEVICE.

    Raises:
        OSError: If ``path`` is given but cannot be opened or decoded as an
            image (includes ``FileNotFoundError``).
    """
    if path is None:
        img = Image.new("RGB", (IMG_SIZE, IMG_SIZE), (0, 0, 0))
    else:
        # convert() decodes the pixel data into a new image, so the file
        # handle can be released as soon as the with-block exits.
        with Image.open(path) as opened:
            img = opened.convert("RGB")
    return t3_img_tfms(img).unsqueeze(0).to(DEVICE)

def encode_t1_text(text):
    """Tokenize *text* for Task 1; return ``(input_ids, attention_mask)`` on DEVICE.

    The text is padded/truncated to exactly 72 tokens.
    """
    batch = t1_tok(
        text,
        max_length=72,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)
    return input_ids, attention_mask

def encode_t2_text(text):
    """Tokenize *text* for Task 2; return ``(input_ids, attention_mask)`` on DEVICE.

    The text is padded/truncated to exactly MAX_LEN tokens.
    """
    batch = t2_tok(
        text,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)
    return input_ids, attention_mask

def encode_t3_text(text):
    """Tokenize *text* for Task 3; return ``(input_ids, attention_mask)`` on DEVICE.

    The text is padded/truncated to exactly MAX_LEN tokens.
    """
    batch = t3_tok(
        text,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)
    return input_ids, attention_mask

def encode_t4_text(text):
    """Tokenize *text* for Task 4; return ``(input_ids, attention_mask)`` on DEVICE.

    The text is padded/truncated to exactly MAX_LEN tokens.
    """
    batch = t4_tok(
        text,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)
    return input_ids, attention_mask

@torch.no_grad()
def run_full_pipeline(text):
    """Classify one tweet with all four task models.

    Every model is always run; no task is gated on another task's result.
    The image inputs come from ``load_image_for_t1()`` and
    ``load_image_for_t34()`` called without arguments.

    Args:
        text: Tweet text fed to every model.

    Returns:
        Dict with keys ``"task1"`` .. ``"task4"``, each mapping to the label
        name of that model's highest-scoring class.
    """

    def top_label(logits, id2label):
        # Index of the largest logit (single-item batch) -> label name.
        return id2label[logits.argmax(1).item()]

    # Task 1: text + image (arguments are evaluated left to right, so the
    # text is encoded before the image is built).
    t1_logits = t1_model(*encode_t1_text(text), load_image_for_t1())
    t1_label = top_label(t1_logits, t1_id2label)

    # Task 2: text only.
    t2_logits = t2_model(*encode_t2_text(text))
    t2_label = top_label(t2_logits, t2_id2label)

    # Tasks 3 and 4 share one image tensor.
    shared_image = load_image_for_t34()

    # Task 3 (always)
    t3_logits = t3_model(*encode_t3_text(text), shared_image)
    t3_label = top_label(t3_logits, t3_id2label)

    # Task 4 (always)
    t4_logits = t4_model(*encode_t4_text(text), shared_image)
    t4_label = top_label(t4_logits, t4_id2label)

    return dict(
        task1=t1_label,
        task2=t2_label,
        task3=t3_label,
        task4=t4_label,
    )

# Demo run: classify the sample tweet with every task model and print the
# resulting {task: label} dict.
result = run_full_pipeline(text=tweet_text)
print(result)

✅ All four models loaded
{'task1': 'non_informative', 'task2': 'structure', 'task3': 'severe_damage', 'task4': 'people_affected'}
