In [20]:
import os, random
# Environment switches are set before torch / transformers are imported.
# NOTE(review): CUBLAS_WORKSPACE_CONFIG is presumably here for the
# torch.use_deterministic_algorithms(True) call made in the seeding cell — confirm.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"   # or ":16:8"
os.environ["TOKENIZERS_PARALLELISM"] = "false"      # (pins Hugging Face tokenizer threading)
import numpy as np
import pandas as pd
import tqdm
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import torch.backends.cudnn as cudnn
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split

In [21]:
SEED = 42  # the single value to change when distinguishing experiment runs

# (1) Seed Python hashing, `random`, NumPy and PyTorch (CPU, current GPU, all GPUs).
os.environ["PYTHONHASHSEED"] = str(SEED)
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,   # covers the multi-GPU case
):
    _seed_fn(SEED)

# (2) Fully deterministic algorithms (may cost a little performance).
torch.use_deterministic_algorithms(True)

# (3) Turn off cuDNN autotuning and TF32 math.
cudnn.benchmark, cudnn.deterministic = False, True
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = False

def seed_worker(worker_id):
    """Seed NumPy and `random` inside a DataLoader worker process.

    Intended to be passed as ``worker_init_fn``. The seed is derived from
    ``torch.initial_seed()``, which PyTorch sets per worker from the
    DataLoader's base seed, so every worker (and every epoch, when workers are
    re-created) gets a distinct but reproducible NumPy / `random` stream.
    A fixed ``SEED + worker_id`` would replay the same stream each epoch.

    Args:
        worker_id: Index of the worker; unused because the per-worker torch
            seed already encodes it. Kept for the ``worker_init_fn`` signature.
    """
    # NumPy's legacy seeding accepts only 32-bit values, hence the modulo.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Dedicated torch generator seeded with SEED; it is passed as `generator=` to the DataLoaders.
g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x7fa22a877610>

In [22]:
# The listing itself is discarded (only a cell's last expression is echoed);
# the call still raises if the 'yelp-dataset' directory is missing.
os.listdir('yelp-dataset')
# Show the working directory that the relative paths in this notebook resolve against.
print("CWD =", os.getcwd())

CWD = /disk1/jupyter/placeness/SWTEST-2024/TEST1-YELP


In [23]:
# Load the Yelp sample table and number each distinct `label` value.
# NOTE(review): the model input uses the `labelidx` column, not this mapping;
# only len(label_mapping) is consumed later — confirm the two agree.
fdf = pd.read_json('yelp_dataset_10000.json')
label_mapping = dict((label, idx) for idx, label in enumerate(fdf['label'].unique()))

In [36]:
# Write the loaded frame to CSV without the index column, then echo its shape as the cell output.
fdf.to_csv('yelp_dataset_10000.csv', index=False)
fdf.shape

(10000, 6)

In [25]:
# Three-column working frame; MultiModalDataset reads these columns by position
# (0 = image path, 1 = caption, 2 = integer label).
df = pd.DataFrame({
    'photo_path': fdf['photo_path'].tolist(),
    'caption': fdf['caption'],
    'label': fdf['labelidx'],
})

# Hold out 20% as the test set, stratified on the label.
temp_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df['label'],
)

# A quarter of the remaining 80% becomes validation: 60% train / 20% val / 20% test overall.
train_df, val_df = train_test_split(
    temp_df,
    test_size=0.25,
    random_state=SEED,
    stratify=temp_df['label'],
)

# Custom Dataset class
class MultiModalDataset(Dataset):
    """Image + caption + label samples backed by a DataFrame.

    Columns are read by position: column 0 is the image path, column 1 the
    caption text and column 2 the integer class label.

    Args:
        dataframe: Table holding one sample per row in the column order above.
        transform: Optional callable applied to the loaded RGB PIL image.
        tokenizer: Callable used to encode the caption. It is required by
            ``__getitem__`` even though the parameter defaults to ``None``.
    """

    def __init__(self, dataframe, transform=None, tokenizer=None):
        self.data = dataframe
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows (samples) in the backing frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, encoded_caption, label_tensor)`` for row ``idx``."""
        img_path = self.data.iloc[idx, 0]
        # Open inside a context manager so the file handle is released
        # deterministically; convert() returns an independent in-memory image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        label = self.data.iloc[idx, 2]

        if self.transform:
            image = self.transform(image)

        caption = self.data.iloc[idx, 1]
        text_encoded = self.tokenizer(caption, return_tensors='pt', truncation=True, padding='max_length', max_length=128)

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader can stack samples into a batch itself.
        for key in text_encoded:
            text_encoded[key] = text_encoded[key].squeeze(0)

        return image, text_encoded, torch.tensor(label, dtype=torch.long)

# Image preprocessing: random augmentation for training, deterministic
# resize + center crop for validation and test.
image_size = 224
channel_normalize = transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                         std=(0.229, 0.224, 0.225))

train_tf = transforms.Compose([
    transforms.RandomResizedCrop(image_size, scale=(0.7, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandAugment(num_ops=2, magnitude=9),  # torchvision>=0.13
    transforms.ToTensor(),
    channel_normalize,
])

val_tf = transforms.Compose([
    transforms.Resize(int(image_size * 1.14)),
    transforms.CenterCrop(image_size),
    transforms.ToTensor(),
    channel_normalize,
])

# Caption tokenizer captured by the datasets below.
# NOTE(review): the classifier's text encoder is 'prajjwal1/bert-tiny', and the
# global `tokenizer` is rebound to that checkpoint's tokenizer in a later cell;
# the datasets keep this DistilBERT one. Confirm both share a vocabulary.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# One dataset per split; only the training split is augmented.
train_dataset, val_dataset, test_dataset = (
    MultiModalDataset(dataframe=split_df, transform=split_tf, tokenizer=tokenizer)
    for split_df, split_tf in ((train_df, train_tf), (val_df, val_tf), (test_df, val_tf))
)

# Settings shared by all three loaders; only the training loader shuffles.
shared_loader_args = dict(
    batch_size=256,
    num_workers=0,
    pin_memory=True,
    worker_init_fn=seed_worker,
    generator=g,
)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_args)


In [26]:
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torchvision import models

class LightweightMultiModalClassifier(nn.Module):
    """Two-branch image + text classifier built on frozen pretrained backbones.

    The image branch is SqueezeNet 1.1 with its classifier replaced by a
    trainable 512 -> 256 head; the text branch is 'prajjwal1/bert-tiny' whose
    first-token state is projected 128 -> 256. The two 256-d vectors are
    concatenated and classified by a small MLP.

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes):
        super().__init__()

        # --- Image branch: pretrained SqueezeNet with all loaded weights frozen.
        # NOTE(review): newer torchvision prefers `weights=` over `pretrained=`.
        self.squeezenet = models.squeezenet1_1(pretrained=True)
        self.squeezenet.requires_grad_(False)

        # Replacement head is created after freezing, so it stays trainable.
        # It maps the 512-channel feature map to a flat 256-d vector.
        self.squeezenet.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Conv2d(512, 256, kernel_size=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
        )

        # --- Text branch: pretrained encoder, frozen.
        # NOTE(review): model.train() also switches this frozen encoder to
        # train mode; check whether its dropout should stay disabled.
        self.bert = AutoModel.from_pretrained('prajjwal1/bert-tiny')
        self.bert.requires_grad_(False)

        # Project the encoder's 128-d hidden state to match the image branch width.
        self.fc_text = nn.Linear(128, 256)

        # --- Fusion head over the concatenated 256 + 256 features.
        self.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, image, text):
        """Return class logits for a batch of images and tokenized captions.

        Args:
            image: Image batch fed to SqueezeNet.
            text: Mapping of tokenizer outputs, unpacked as keyword arguments
                into the text encoder.
        """
        img_vec = self.squeezenet(image)

        # Hidden state at sequence position 0 (the CLS token) summarizes the caption.
        cls_state = self.bert(**text).last_hidden_state[:, 0, :]
        txt_vec = self.fc_text(cls_state)

        fused = torch.cat((img_vec, txt_vec), dim=1)
        return self.fc(fused)


In [27]:
# Rebind the global `tokenizer` to the one shipped with 'prajjwal1/bert-tiny'.
# Datasets built earlier keep whatever tokenizer object they were given.
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')

# Smoke-test the tokenizer on one sentence.
text = "Example caption text."
inputs = tokenizer(
    text,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)


In [28]:
def build_param_groups(m, base_lr_text=3e-5, head_lr=1e-3, wd=0.01,
                       text_markers=("distilbert", "bert")):
    """Build per-parameter optimizer groups with split learning rates.

    Frozen parameters (``requires_grad == False``) are skipped. Every trainable
    parameter gets its own group, so the group order follows
    ``m.named_parameters()``.

    Args:
        m: Module whose trainable parameters are grouped.
        base_lr_text: Learning rate for text-encoder parameters.
        head_lr: Learning rate for all other parameters (heads / fusion / classifier).
        wd: Weight decay for parameters that are not bias or LayerNorm terms.
        text_markers: Lower-case substrings that mark a parameter name as
            belonging to the text encoder. The default also matches an encoder
            registered as ``bert``; pass ``("distilbert",)`` for the old
            DistilBERT-only matching.

    Returns:
        List of dicts with ``params``, ``lr`` and ``weight_decay`` keys, ready
        to pass to a torch optimizer.
    """
    no_decay_markers = ("bias", "LayerNorm.weight", "LayerNorm.bias")
    groups = []
    for n, p in m.named_parameters():
        if not p.requires_grad:
            continue
        # Text encoder gets the low lr; head / fusion / classifier get the high lr.
        lowered = n.lower()
        is_text = any(marker in lowered for marker in text_markers)
        lr = base_lr_text if is_text else head_lr
        # Bias and LayerNorm parameters are excluded from weight decay.
        decay = 0.0 if any(marker in n for marker in no_decay_markers) else wd
        groups.append({"params": [p], "lr": lr, "weight_decay": decay})
    return groups

In [29]:
import torch.optim as optim

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output logit per distinct label found in the dataset.
num_classes = len(label_mapping)
model = LightweightMultiModalClassifier(num_classes)
model = model.to(device)

# Cross-entropy with light label smoothing; AdamW over the trainable parameter groups.
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
param_groups = build_param_groups(model)
optimizer = optim.AdamW(param_groups, eps=1e-8)



In [30]:
# Function to save the model
def save_model(model, optimizer, epoch, path="lightweight_multimodal_model_HJ.pth"):
    """Write a checkpoint holding the epoch plus model and optimizer state.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch value recorded as-is in the checkpoint.
        path: Destination file.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(checkpoint, path)

# Function to load the model
def load_model(model, optimizer, path="lightweight_multimodal_model_HJ.pth", map_location=None):
    """Restore model and optimizer state from a checkpoint written by ``save_model``.

    Args:
        model: Module that receives ``checkpoint['model_state_dict']``.
        optimizer: Optimizer that receives ``checkpoint['optimizer_state_dict']``.
        path: Checkpoint file to read.
        map_location: Forwarded to ``torch.load``. When ``None``, tensors are
            mapped to the device of the model's first parameter, so a
            checkpoint saved on a GPU still loads on a CPU-only machine.

    Returns:
        ``(model, optimizer, start_epoch)`` where ``start_epoch`` is the epoch
        value stored in the checkpoint.
    """
    if map_location is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            map_location = first_param.device
    checkpoint = torch.load(path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    return model, optimizer, start_epoch


In [31]:
import numpy as np
import tqdm

# Early stopping parameters
patience = 5  # consecutive epochs without a new best validation loss before stopping
# `np.inf` (lowercase) is used: the `np.Inf` alias was removed in NumPy 2.0.
min_val_loss = np.inf
epochs_no_improve = 0

# Training loop with early stopping: train one epoch, validate, checkpoint on a
# new best validation loss, stop after `patience` epochs without improvement.
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, texts, labels in tqdm.tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)
        # The tokenizer output is a mapping of tensors; move each one to the device.
        texts = {k: v.to(device) for k, v in texts.items()}

        optimizer.zero_grad()
        outputs = model(images, texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses (each batch counts equally).
    avg_train_loss = running_loss / len(train_loader)

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for images, texts, labels in tqdm.tqdm(val_loader):
            images, labels = images.to(device), labels.to(device)
            texts = {k: v.to(device) for k, v in texts.items()}

            outputs = model(images, texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Predicted class = index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    # Calculate average validation loss and accuracy
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct_predictions / total_samples

    print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy * 100:.2f}%")


    # Early stopping
    if avg_val_loss < min_val_loss:
        min_val_loss = avg_val_loss
        epochs_no_improve = 0
        # The checkpoint stores the 0-based loop index, one less than the epoch printed above.
        save_model(model, optimizer, epoch)  # Save the best model
        print(f"Validation loss improved; model saved.")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break


  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:54<00:00,  2.25s/it]
100%|██████████| 8/8 [00:11<00:00,  1.39s/it]


Epoch [1/100], Train Loss: 0.8334, Val Loss: 0.5270, Val Accuracy: 88.80%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:54<00:00,  2.27s/it]
100%|██████████| 8/8 [00:11<00:00,  1.42s/it]


Epoch [2/100], Train Loss: 0.5124, Val Loss: 0.4522, Val Accuracy: 91.65%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:37<00:00,  1.56s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [3/100], Train Loss: 0.4643, Val Loss: 0.4104, Val Accuracy: 93.60%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.11s/it]


Epoch [4/100], Train Loss: 0.4512, Val Loss: 0.3995, Val Accuracy: 93.45%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [5/100], Train Loss: 0.4329, Val Loss: 0.4080, Val Accuracy: 92.30%


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [6/100], Train Loss: 0.4239, Val Loss: 0.3937, Val Accuracy: 93.05%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [7/100], Train Loss: 0.4129, Val Loss: 0.4027, Val Accuracy: 92.95%


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [8/100], Train Loss: 0.4040, Val Loss: 0.3962, Val Accuracy: 92.95%


100%|██████████| 24/24 [00:36<00:00,  1.53s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [9/100], Train Loss: 0.4054, Val Loss: 0.3884, Val Accuracy: 93.30%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:36<00:00,  1.53s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [10/100], Train Loss: 0.3962, Val Loss: 0.3827, Val Accuracy: 93.60%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [11/100], Train Loss: 0.3969, Val Loss: 0.3972, Val Accuracy: 92.90%


100%|██████████| 24/24 [00:36<00:00,  1.51s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [12/100], Train Loss: 0.3947, Val Loss: 0.3833, Val Accuracy: 93.45%


100%|██████████| 24/24 [00:36<00:00,  1.51s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [13/100], Train Loss: 0.3875, Val Loss: 0.3792, Val Accuracy: 93.65%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [14/100], Train Loss: 0.3975, Val Loss: 0.3807, Val Accuracy: 93.45%


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [15/100], Train Loss: 0.3846, Val Loss: 0.3850, Val Accuracy: 94.05%


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [16/100], Train Loss: 0.3876, Val Loss: 0.4127, Val Accuracy: 93.25%


100%|██████████| 24/24 [00:36<00:00,  1.51s/it]
100%|██████████| 8/8 [00:08<00:00,  1.08s/it]


Epoch [17/100], Train Loss: 0.3820, Val Loss: 0.3756, Val Accuracy: 94.30%
Validation loss improved; model saved.


100%|██████████| 24/24 [00:35<00:00,  1.50s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [18/100], Train Loss: 0.3743, Val Loss: 0.3771, Val Accuracy: 94.05%


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]


Epoch [19/100], Train Loss: 0.3727, Val Loss: 0.3871, Val Accuracy: 94.30%


100%|██████████| 24/24 [00:36<00:00,  1.51s/it]
100%|██████████| 8/8 [00:08<00:00,  1.09s/it]


Epoch [20/100], Train Loss: 0.3714, Val Loss: 0.3761, Val Accuracy: 93.80%


100%|██████████| 24/24 [00:36<00:00,  1.52s/it]
100%|██████████| 8/8 [00:08<00:00,  1.08s/it]


Epoch [21/100], Train Loss: 0.3598, Val Loss: 0.3790, Val Accuracy: 94.30%


100%|██████████| 24/24 [00:36<00:00,  1.51s/it]
100%|██████████| 8/8 [00:08<00:00,  1.08s/it]

Epoch [22/100], Train Loss: 0.3660, Val Loss: 0.3832, Val Accuracy: 94.00%
Early stopping at epoch 22





In [32]:
# Reload the checkpoint written by the training loop (the best validation-loss weights)
# before evaluating on the test set.
# The stored epoch is the 0-based loop index, i.e. one less than the epoch number
# printed in the training log.
model, optimizer, start_epoch = load_model(model, optimizer, path="lightweight_multimodal_model_HJ.pth")
print(f"Resuming from epoch {start_epoch}")

Resuming from epoch 16


In [33]:
import torch

# Evaluate on the held-out test split with the model in inference mode.
model.eval()

# Running totals over the whole test set.
total_loss = 0.0
correct_predictions = 0
total_samples = 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for images, texts, labels in tqdm.tqdm(test_loader):
        images = images.to(device)
        labels = labels.to(device)
        texts = {name: tensor.to(device) for name, tensor in texts.items()}

        outputs = model(images, texts)
        total_loss += criterion(outputs, labels).item()

        # Predicted class = index of the largest logit.
        predicted = torch.max(outputs, 1).indices
        correct_predictions += (predicted == labels).sum().item()
        total_samples += labels.size(0)

# Mean per-batch loss (computed but not reported) and overall accuracy.
average_loss = total_loss / len(test_loader)
accuracy = correct_predictions / total_samples

print(f"Test Accuracy: {accuracy * 100:.2f}%")

100%|██████████| 8/8 [00:08<00:00,  1.09s/it]

Test Accuracy: 93.30%





In [34]:
# Test Accuracy: 93.30%