In [1]:
import os
import numpy as np
import tqdm
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer
from torchvision import models

In [2]:
os.listdir('yelp-dataset')

['photos']

In [3]:
fdf = pd.read_json('yelp_dataset_10000.json')
label_mapping = {label: idx for idx, label in enumerate(fdf['label'].unique())}

In [4]:
fdf['label'].unique()

array(['inside', 'outside', 'food', 'drink', 'menu'], dtype=object)

In [5]:
# Sample DataFrame (replace this with your actual DataFrame)
df = pd.DataFrame({'photo_path': fdf['photo_path'].tolist(),
                   'caption': fdf['caption'], 
                   'label': fdf['labelidx']})

# Split data into Test (20%) and Temp (80%) sets
temp_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Further split Temp into Train (60% of original) and Validation (20% of original)
train_df, val_df = train_test_split(temp_df, test_size=0.25, random_state=42)

# Custom Dataset class
class MultiModalDataset(Dataset):
    def __init__(self, dataframe, transform=None, tokenizer=None):
        self.data = dataframe
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = self.data.iloc[idx, 0]
        image = Image.open(img_path).convert("RGB")
        label = self.data.iloc[idx, 2]
        
        if self.transform:
            image = self.transform(image)
        
        caption = self.data.iloc[idx, 1]
        text_encoded = self.tokenizer(caption, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        
        for key in text_encoded:
            text_encoded[key] = text_encoded[key].squeeze(0)
        
        return image, text_encoded, torch.tensor(label, dtype=torch.long)

# Define image transformations
transform = transforms.Compose([
    transforms.CenterCrop(224),     # Center crop to 1:1 aspect ratio (224x224)
    transforms.Resize((224, 224)),  # Resize to 224x224 (if needed to enforce size)
    transforms.ToTensor()
])


# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create Dataset instances for training, validation, and test sets
train_dataset = MultiModalDataset(dataframe=train_df, transform=transform, tokenizer=tokenizer)
val_dataset = MultiModalDataset(dataframe=val_df, transform=transform, tokenizer=tokenizer)
test_dataset = MultiModalDataset(dataframe=test_df, transform=transform, tokenizer=tokenizer)

# Create DataLoaders for each set
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)


In [23]:
#df.head(10)

In [24]:
#df.tail(10)

In [25]:
#df.loc[df['label']==4]

In [6]:
class LightweightMultiModalClassifier(nn.Module):
    def __init__(self, num_classes):
        super(LightweightMultiModalClassifier, self).__init__()
        # Load pretrained SqueezeNet and freeze its parameters
        self.squeezenet = models.squeezenet1_1(pretrained=True)
        for param in self.squeezenet.parameters():
            param.requires_grad = False  # Freeze SqueezeNet
        
        # Modify SqueezeNet's classifier to output 256 features
        # The final output of SqueezeNet is 512 channels for its feature extractor
        self.squeezenet.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Conv2d(512, 256, kernel_size=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten()
        )
        
        # Load TinyBERT and freeze its parameters
        self.bert = AutoModel.from_pretrained('prajjwal1/bert-tiny')
        for param in self.bert.parameters():
            param.requires_grad = False  # Freezes TinyBERT
        
        self.fc_text = nn.Linear(128, 256)  # TinyBERT has a hidden size of 128
        
        # Combined classifier with smaller fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, image, text):
        # Image processing with SqueezeNet
        image_features = self.squeezenet(image)
        
        # Text processing with TinyBERT
        text_features = self.bert(**text).last_hidden_state[:, 0, :]  # Using CLS token
        text_features = self.fc_text(text_features)
        
        # Combine and classify
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        
        return output


In [7]:
# Initialize TinyBERT tokenizer
#tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')

In [8]:
# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare the model, loss function, and optimizer
num_classes = len(label_mapping)
model = LightweightMultiModalClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "
Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Function to save the model
def save_model(model, optimizer, epoch, path="lightweight_multimodal_model_HJ.pth"):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, path)

# Function to load the model
def load_model(model, optimizer, path="lightweight_multimodal_model.pth"):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    return model, optimizer, start_epoch


In [10]:
import numpy as np
import tqdm

# Early stopping parameters
patience = 10  # Number of epochs to wait for improvement
min_val_loss = np.Inf
epochs_no_improve = 0

# Training loop with early stopping
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, texts, labels in tqdm.tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)
        texts = {k: v.to(device) for k, v in texts.items()}
        
        optimizer.zero_grad()
        outputs = model(images, texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_train_loss = running_loss / len(train_loader)
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    
    with torch.no_grad():
        for images, texts, labels in tqdm.tqdm(val_loader):  # Assuming val_loader is defined
            images, labels = images.to(device), labels.to(device)
            texts = {k: v.to(device) for k, v in texts.items()}

            outputs = model(images, texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    # Calculate average validation loss and accuracy
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct_predictions / total_samples

    print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy * 100:.2f}%")

    
    # Early stopping
    if avg_val_loss < min_val_loss:
        min_val_loss = avg_val_loss
        epochs_no_improve = 0
        save_model(model, optimizer, epoch)  # Save the best model
        print(f"Validation loss improved; model saved.")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:17<00:00,  1.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.45it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [1/100], Train Loss: 0.7453, Val Loss: 0.4328, Val Accuracy: 85.50%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [2/100], Train Loss: 0.3720, Val Loss: 0.3474, Val Accuracy: 88.20%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.45it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [3/100], Train Loss: 0.3029, Val Loss: 0.2839, Val Accuracy: 90.20%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [4/100], Train Loss: 0.2597, Val Loss: 0.2773, Val Accuracy: 90.15%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [5/100], Train Loss: 0.2674, Val Loss: 0.2729, Val Accuracy: 90.50%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.45it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [6/100], Train Loss: 0.2389, Val Loss: 0.2635, Val Accuracy: 90.70%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.44it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [7/100], Train Loss: 0.2243, Val Loss: 0.2747, Val Accuracy: 90.35%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [8/100], Train Loss: 0.2088, Val Loss: 0.2589, Val Accuracy: 90.10%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [9/100], Train Loss: 0.1935, Val Loss: 0.2462, Val Accuracy: 90.75%
Validation loss improved; model saved.


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [10/100], Train Loss: 0.1836, Val Loss: 0.2579, Val Accuracy: 91.50%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.47it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [11/100], Train Loss: 0.1845, Val Loss: 0.2890, Val Accuracy: 90.50%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [12/100], Train Loss: 0.1781, Val Loss: 0.2629, Val Accuracy: 90.10%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [13/100], Train Loss: 0.1922, Val Loss: 0.2568, Val Accuracy: 91.25%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.45it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [14/100], Train Loss: 0.1573, Val Loss: 0.2582, Val Accuracy: 91.10%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [15/100], Train Loss: 0.1601, Val Loss: 0.2846, Val Accuracy: 90.95%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.45it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [16/100], Train Loss: 0.1599, Val Loss: 0.2498, Val Accuracy: 91.40%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.46it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [17/100], Train Loss: 0.1480, Val Loss: 0.2587, Val Accuracy: 91.30%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.47it/s]
  0%|                                                                                            | 0/24 [00:00<?, ?it/s]

Epoch [18/100], Train Loss: 0.1285, Val Loss: 0.2563, Val Accuracy: 91.30%


100%|███████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.44it/s]

Epoch [19/100], Train Loss: 0.1431, Val Loss: 0.2997, Val Accuracy: 90.45%
Early stopping at epoch 19





In [11]:
# Load the model and optimizer
model, optimizer, start_epoch = load_model(model, optimizer, path="lightweight_multimodal_model_HJ.pth")
print(f"Resuming from epoch {start_epoch}")

Resuming from epoch 9


In [12]:
# Ensure the model is in evaluation mode
model.eval()

# Variables to track the total loss and accuracy
total_loss = 0.0
correct_predictions = 0
total_samples = 0

# Use no_grad context as we are not training, only evaluating
with torch.no_grad():
    for images, texts, labels in tqdm.tqdm(test_loader):
        images, labels = images.to(device), labels.to(device)
        texts = {k: v.to(device) for k, v in texts.items()}

        # Forward pass
        outputs = model(images, texts)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_samples += labels.size(0)

# Calculate average loss and accuracy
average_loss = total_loss / len(test_loader)
accuracy = correct_predictions / total_samples

# print(f"Test Loss: {average_loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.43it/s]

Test Accuracy: 91.25%





In [26]:
from sklearn import __version__ as sklearn_version
import torchvision

# version check
def print_versions():
    print(f"os: {os.name}")
    print(f"numpy: {np.__version__}")
    print(f"torch: {torch.__version__}")
    print(f"pandas: {pd.__version__}")
    print(f"tqdm: {tqdm.__version__}")
    print(f"torchvision: {torchvision.__version__}")
    print(f"PIL: {Image.__version__}")
    print(f"transformers: {DistilBertModel.__module__.split('.')[0]}")  # transformers의 주요 버전
    print(f"scikit-learn: {sklearn_version}")

# 버전 정보 출력
print_versions()

os: posix
numpy: 1.19.4
torch: 1.13.1+cu117
pandas: 1.1.5
tqdm: 4.60.0
torchvision: 0.14.1+cu117
PIL: 9.5.0
transformers: transformers
scikit-learn: 0.24.0
