In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; data_path and save_path below both point inside it.
drive.mount('/content/drive')
!pip install datasets --quiet

import os
import cv2
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Dataset location
data_path = '/content/drive/MyDrive/Colab Notebooks/COSE474/MVSA/data/'  # folder holding the image and text files

# Text normalization helper
def preprocess_text(text):
    """Return ``text`` lower-cased with special characters stripped.

    Only characters that are alphanumeric or whitespace survive; everything
    else (punctuation, symbols, underscores, ...) is removed.
    """
    kept_chars = [ch for ch in text.lower() if ch.isalnum() or ch.isspace()]
    return ''.join(kept_chars)

# Custom dataset pairing every image with its text file
class ImageTextDataset(Dataset):
    """Dataset of (image, text) samples stored side by side in one folder.

    A sample exists for every ``<id>.jpg`` in ``folder_path`` that has a
    matching ``<id>.txt`` next to it. Samples are kept in sorted id order so
    that indices are reproducible across runs.

    Args:
        folder_path: Directory containing the ``.jpg`` / ``.txt`` pairs.
        transform: Optional callable applied to the RGB image array
            (H, W, C, as returned by OpenCV after BGR->RGB conversion).
            When omitted, the image is resized to 224x224, scaled to [0, 1]
            and returned as a float32 tensor in (C, H, W) layout.

    Attributes:
        data: List of ``(image_file, text_file)`` path tuples.
    """

    def __init__(self, folder_path, transform=None):
        self.folder_path = folder_path
        self.transform = transform
        self.data = []

        # splitext keeps dots that are part of the stem ("a.b.jpg" -> "a.b");
        # splitting on the first '.' would look for "a.jpg" and drop the pair.
        # Sorting gives a deterministic order (set iteration order is not).
        file_ids = sorted(
            os.path.splitext(name)[0]
            for name in os.listdir(folder_path)
            if name.endswith('.jpg')  # .jpg files define the candidate ids
        )

        for file_id in file_ids:
            image_file = os.path.join(folder_path, f"{file_id}.jpg")
            text_file = os.path.join(folder_path, f"{file_id}.txt")

            # Keep the sample only when both the image and the text file exist
            if os.path.isfile(image_file) and os.path.isfile(text_file):
                self.data.append((image_file, text_file))

    def __len__(self):
        """Return the number of (image, text) pairs found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load and preprocess the sample at ``idx``.

        Returns:
            ``(image, text)`` where ``image`` is the transformed image and
            ``text`` is the content of the text file after ``preprocess_text``.

        Raises:
            ValueError: If OpenCV cannot decode the image file.
        """
        image_file, text_file = self.data[idx]

        # Load the image; cv2.imread returns None instead of raising on failure
        image = cv2.imread(image_file)
        if image is None:
            raise ValueError(f"이미지를 로드할 수 없습니다: {image_file}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; convert to RGB

        if self.transform is not None:
            image = self.transform(image)
        else:
            # Default preprocessing
            image = cv2.resize(image, (224, 224))
            image = image / 255.0  # scale pixel values to [0, 1]
            image = torch.tensor(image, dtype=torch.float32)
            image = image.permute(2, 0, 1)  # (H, W, C) -> (C, H, W)

        # Read the text with an explicit encoding so the result does not depend
        # on the platform default; undecodable bytes are replaced rather than
        # aborting the whole data-loading run on a single bad character.
        with open(text_file, 'r', encoding='utf-8', errors='replace') as file:
            raw_text = file.read().strip()
        text = preprocess_text(raw_text)

        return image, text

# Build the image transform and the dataset.
# The transform receives the RGB array produced by ImageTextDataset, converts it
# to a PIL image, resizes it to 224x224 and turns it into a tensor.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

dataset = ImageTextDataset(data_path, transform=transform)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# DataLoader over the (image, text) samples: batches of 32, shuffling enabled, 2 worker processes.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2)

In [None]:
# Target directory on Drive for the Hugging Face copy of the dataset
save_path = '/content/drive/MyDrive/Colab Notebooks/COSE474/MVSA/dataset'

# `dataset` is a torch `Dataset`, which has no `save_to_disk`; that method
# belongs to Hugging Face `datasets.Dataset`. Build a Hugging Face dataset with
# "image" and "text" columns from the torch samples and save that instead.
from datasets import Dataset as HFDataset


def _iter_image_text_rows():
    """Yield one {"image", "text"} row per sample of the torch dataset."""
    for idx in range(len(dataset)):
        image, text = dataset[idx]
        yield {"image": image.numpy(), "text": text}


# Each row stores the full transformed image tensor, so the saved copy can be large.
hf_dataset = HFDataset.from_generator(_iter_image_text_rows)
hf_dataset.save_to_disk(save_path)

In [None]:
from datasets import load_from_disk

# Load the saved dataset back from save_path and print its summary
loaded_dataset = load_from_disk(save_path)
print(loaded_dataset)

In [None]:

# 2. Dataset preprocessing (tokenization)
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch of examples.

    Intended for ``Dataset.map(..., batched=True)``.

    NOTE(review): relies on the module-level ``tokenizer``, which the visible
    notebook only defines in a later cell — make sure it exists before this
    function is called.

    Args:
        examples: Mapping with a "text" entry holding the texts to tokenize.

    Returns:
        The tokenizer output for those texts, as PyTorch tensors truncated or
        padded to exactly 128 tokens.
    """
    return tokenizer(
        examples["text"],
        # Pad every row to the same fixed length. With padding=True each map()
        # batch is padded only to its own longest text, so rows coming from
        # different map batches end up with different lengths and cannot be
        # stacked into one tensor when batches are collated.
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Tokenize the texts.
# `.map` is a Hugging Face `datasets.Dataset` method; the torch `ImageTextDataset`
# bound to `dataset` does not provide it, so the dataset reloaded with
# `load_from_disk` is used instead.
# NOTE(review): assumes `loaded_dataset` has a "text" column — confirm against how it was saved.
tokenized_dataset = loaded_dataset.map(tokenize_function, batched=True)

# Collate function for the PyTorch DataLoader
if "device" not in globals():
    # No earlier cell defined `device`; fall back to GPU when available.
    # NOTE(review): the model must live on this same device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_fn(batch):
    """Combine per-example rows into batched text inputs and images.

    Args:
        batch: List of rows, each a mapping with "input_ids", "attention_mask"
            and "image" entries. Token fields may be plain lists or tensors,
            either 1-D (seq,) or with a leading singleton axis (1, seq).
            All rows must have the same sequence length and image shape.
            # assumes "image" is array-like (nested list / ndarray / tensor) — TODO confirm

    Returns:
        ``({"input_ids": ..., "attention_mask": ...}, images)`` with every
        tensor moved to ``device``; token tensors have shape (batch, seq).
    """
    def _stack_tokens(key):
        # Use the whole token sequence of every row; reshape(-1) also accepts
        # rows stored as (1, seq). Indexing [0] on a 1-D row would keep only
        # the first token id.
        return torch.stack([torch.as_tensor(row[key]).reshape(-1) for row in batch])

    input_ids = _stack_tokens("input_ids").to(device)
    attention_mask = _stack_tokens("attention_mask").to(device)
    # as_tensor + stack works for nested lists, arrays and tensors alike
    images = torch.stack([torch.as_tensor(row["image"]) for row in batch]).to(device)
    return {"input_ids": input_ids, "attention_mask": attention_mask}, images

# DataLoader configuration: no shuffle argument is passed, and batches are assembled by collate_fn.
batch_size = 32
data_loader = DataLoader(tokenized_dataset, batch_size=batch_size, collate_fn=collate_fn)


In [None]:

# 3. Model prediction (batched)
# NOTE(review): `model` is not defined in any visible cell. The loop needs a
# model that accepts `input_ids` / `attention_mask` keyword tensors and returns
# an object exposing `.logits` — confirm where it is loaded.
all_labels = []
all_probabilities = []

for batch in data_loader:
    inputs, _ = batch  # only the text inputs are used; images are ignored here
    with torch.no_grad():
        outputs = model(**inputs)
        # Softmax over the class dimension. torch.softmax is the same operation
        # as torch.nn.functional.softmax and needs no `F` alias, which is never
        # imported in this notebook.
        probabilities = torch.softmax(outputs.logits, dim=1)

    labels = torch.argmax(probabilities, dim=1).cpu().numpy()

    # Accumulate per-example results
    all_labels.extend(labels)
    all_probabilities.extend(probabilities.cpu().numpy())

In [None]:
# 4. Map label ids to names and print results
label_map = {0: "negative", 1: "positive"}
# Entries that are already names are kept as-is so re-running this cell does
# not raise KeyError; unknown ids still fail loudly.
all_labels = [
    label if isinstance(label, str) else label_map[label]
    for label in all_labels
]

# Texts of the first batch, taken from the dataset that data_loader iterates.
# data_loader is built without shuffling, so row order matches prediction order.
first_batch_texts = tokenized_dataset[:batch_size]["text"]

# Print the results of the first batch
for text, label, prob in zip(first_batch_texts, all_labels[:batch_size], all_probabilities[:batch_size]):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {label}")
    print(f"Probabilities: {prob}")

> 여기까지 1차 task: 전처리+추후 학습에 쓸 이미지 pos/neg 비율(이건 러프하게 잡아도 될 것 같음. 어차피 학습을 서로 돌리면서 맞출 거니까)

In [None]:

# Load the ALBERT model and tokenizer
# These names are used below but not imported in any visible cell, so the cell
# would fail with NameError on a fresh kernel.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

albert_model_name = "albert-base-v2"  # Hugging Face model name
tokenizer = AutoTokenizer.from_pretrained(albert_model_name)
albert_model = TFAutoModel.from_pretrained(albert_model_name)

# Tokenization helper for raw input texts
def preprocess_texts(texts, max_length=128):
    """Tokenize ``texts`` with the module-level ``tokenizer`` as TensorFlow tensors.

    Args:
        texts: Text input passed straight to the tokenizer.
        max_length: Truncation limit in tokens.

    Returns:
        Dict with only the "input_ids" and "attention_mask" entries of the
        tokenizer output.
    """
    encoded = tokenizer(
        texts,
        return_tensors="tf",
        max_length=max_length,
        truncation=True,
        padding=True,
    )
    # Selecting just these two fields drops 'token_type_ids'
    wanted_keys = ("input_ids", "attention_mask")
    return {key: encoded[key] for key in wanted_keys}

# BiLSTM model definition
def build_bilstm_model(albert_model, lstm_units=128):
    """Build a Keras model that encodes text with ALBERT followed by a BiLSTM.

    Args:
        albert_model: Encoder called as ``albert_model(input_ids,
            attention_mask=...)`` whose output exposes ``last_hidden_state``.
        lstm_units: Units per LSTM direction, and size of the final Dense layer.

    Returns:
        ``tf.keras.Model`` taking ``[input_ids, attention_mask]`` (int32,
        variable sequence length) and producing a ``(batch_size, lstm_units)``
        feature tensor.
    """
    # ALBERT inputs
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="attention_mask")

    # Token embeddings from ALBERT
    albert_outputs = albert_model(input_ids, attention_mask=attention_mask)
    albert_embeddings = albert_outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)

    # Pass the padding mask to the BiLSTM. With return_sequences=False the
    # layer returns its final states; without a mask the forward direction
    # would end on padding tokens for every text shorter than the batch maximum.
    padding_mask = tf.cast(attention_mask, tf.bool)
    bilstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units, return_sequences=False)
    )(albert_embeddings, mask=padding_mask)

    # Final projection
    output = tf.keras.layers.Dense(lstm_units, activation="relu")(bilstm)

    # Assemble the model
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    return model

# Example texts
texts = [
    "This is a positive example.",
    "I feel sad and lonely."
]

# Tokenize the example texts
tokenized_inputs = preprocess_texts(texts)

# Build the BiLSTM model on top of the loaded ALBERT encoder
bilstm_model = build_bilstm_model(albert_model)

# Run the model once and check the output shape
features = bilstm_model(tokenized_inputs)
print(features.shape)  # (batch_size, lstm_units)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.LayerNorm.weight', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


(2, 128)
