# 수집 이미지

## 구글 드라이브 마운트 및 압축 해제

In [None]:
# Mount Google Drive

from google.colab import drive

# `force_remount=True` re-mounts the drive even when it is already mounted.
# (The keyword is `force_remount`; `reforce_remount` is not accepted by
# drive.mount() and makes the call fail.)
drive.mount('/content/drive', force_remount=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extract the poster archive

import zipfile

# Extract into /content explicitly instead of the current working directory:
# the following cells hard-code '/content/Poster', and a later cell changes the
# working directory, so a re-run with a bare extractall() would unpack the
# archive into the wrong place.
# NOTE(review): assumes the archive has a top-level 'Poster/' folder — confirm.
with zipfile.ZipFile('/content/drive/MyDrive/Poster.zip') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
# Inspect the extracted folder, then make it the working directory

import os

extracted_folder_path = '/content/Poster'

# Print every entry of the extracted folder, one per line.
print("Contents of the extracted folder:")
folder_entries = os.listdir(extracted_folder_path)
for entry in folder_entries:
    print(entry)

# Switch into the folder and report where we ended up.
os.chdir(extracted_folder_path)
print("\nCurrent working directory:", os.getcwd(), sep="\n")

Contents of the extracted folder:
Contest
Book
Concert
Movie

Current working directory:
/content/Poster


## 손상된 파일 제거

In [None]:
# Detect corrupted image files and delete them

import PIL
from PIL import Image, ImageFile
import os
from tqdm.auto import tqdm

ImageFile.LOAD_TRUNCATED_IMAGES = True
# None disables Pillow's maximum-pixel-count check for very large images.
Image.MAX_IMAGE_PIXELS = None

# Directories that hold the images.
image_directories = ['/content/Poster/Concert', '/content/Poster/Contest', '/content/Poster/Movie', '/content/Poster/Book']

# Paths of the files that failed verification.
corrupted_files = []

for image_directory in tqdm(image_directories):

    # Check every entry of the directory.
    for filename in tqdm(os.listdir(image_directory)):
        file_path = os.path.join(image_directory, filename)

        # Only regular files can be images. Without this guard a sub-directory
        # would be reported as "corrupted" and os.remove() below would fail on it.
        if not os.path.isfile(file_path):
            continue

        try:
            # Image.open() parses the header; verify() checks the file for
            # damage without decoding the full image.
            with Image.open(file_path) as img:
                img.verify()
        except Exception:
            # Pillow raises several unrelated exception types for broken files,
            # so any failure here is treated as "corrupted".
            print(f'Corrupted file: {file_path}')
            corrupted_files.append(file_path)

# Delete everything that failed verification. A file that has already
# disappeared is reported and skipped instead of aborting the whole clean-up.
for corrupted_file in corrupted_files:
    try:
        os.remove(corrupted_file)
    except FileNotFoundError:
        print(f'{corrupted_file} not found')

print('Removed corrupted files')

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/67945 [00:00<?, ?it/s]

Corrupted file: /content/Poster/Concert/PF130087.jpg
Corrupted file: /content/Poster/Concert/PF124355.jpg
Corrupted file: /content/Poster/Concert/PF123475.jpg
Corrupted file: /content/Poster/Concert/PF121523.jpg
Corrupted file: /content/Poster/Concert/PF123298.jpg
Corrupted file: /content/Poster/Concert/PF122225.jpg
Corrupted file: /content/Poster/Concert/PF130386.jpg
Corrupted file: /content/Poster/Concert/PF127820.jpg
Corrupted file: /content/Poster/Concert/PF126621.jpg
Corrupted file: /content/Poster/Concert/PF129213.jpg
Corrupted file: /content/Poster/Concert/PF128606.jpg
Corrupted file: /content/Poster/Concert/PF134650.jpg
Corrupted file: /content/Poster/Concert/PF122788.jpg
Corrupted file: /content/Poster/Concert/PF145120.jpg
Corrupted file: /content/Poster/Concert/PF130884.jpg
Corrupted file: /content/Poster/Concert/PF121527.jpg
Corrupted file: /content/Poster/Concert/PF123121.jpg
Corrupted file: /content/Poster/Concert/PF138510.jpg
Corrupted file: /content/Poster/Concert/PF1375



Corrupted file: /content/Poster/Concert/PF143511.jpg
Corrupted file: /content/Poster/Concert/PF137132.jpg
Corrupted file: /content/Poster/Concert/PF142559.jpg
Corrupted file: /content/Poster/Concert/PF128769.jpg
Corrupted file: /content/Poster/Concert/PF131847.jpg
Corrupted file: /content/Poster/Concert/PF136676.jpg
Corrupted file: /content/Poster/Concert/PF144234.jpg
Corrupted file: /content/Poster/Concert/PF132153.jpg
Corrupted file: /content/Poster/Concert/PF143638.jpg
Corrupted file: /content/Poster/Concert/PF130088.jpg
Corrupted file: /content/Poster/Concert/PF137081.jpg
Corrupted file: /content/Poster/Concert/PF145340.jpg
Corrupted file: /content/Poster/Concert/PF129380.jpg
Corrupted file: /content/Poster/Concert/PF133212.jpg
Corrupted file: /content/Poster/Concert/PF133503.jpg
Corrupted file: /content/Poster/Concert/PF122224.jpg
Corrupted file: /content/Poster/Concert/PF121529.jpg
Corrupted file: /content/Poster/Concert/PF129980.jpg
Corrupted file: /content/Poster/Concert/PF1359

  0%|          | 0/31315 [00:00<?, ?it/s]

Corrupted file: /content/Poster/Contest/CP_3645_.jpg
Corrupted file: /content/Poster/Contest/CP_15414.jpg


  0%|          | 0/56823 [00:00<?, ?it/s]

Removed corrupted files


In [None]:
# Delete the corrupted files recorded in a previously saved CSV

import pandas as pd
import os
from tqdm.auto import tqdm

# The CSV has one column, 'corrupted_files', holding the paths to delete.
df = pd.read_csv('/content/drive/MyDrive/final/corrupted_files.csv')
corrupted_files = df['corrupted_files'].tolist()

for corrupted_file in tqdm(corrupted_files):
    # Missing files are reported and skipped; the loop keeps going.
    try:
        os.remove(corrupted_file)
    except FileNotFoundError:
        print(f'{corrupted_file} not found')

print('Removed corrupted files')

  0%|          | 0/440 [00:00<?, ?it/s]

Removed corrupted files


In [None]:
# Open one sample poster to confirm the images can be read

sample_image_path = '/content/Poster/Concert/PF113674.jpg'
im = Image.open(sample_image_path)
# NOTE(review): Image.show() hands the image to an external viewer; it may not
# render inline inside a notebook — confirm this displays as intended.
im.show()

# Feature Vector 추출

In [None]:
# Install the EfficientNet implementation that the model-loading cell imports
# (`from efficientnet_pytorch import EfficientNet`).
!pip install efficientnet-pytorch

Collecting efficientnet-pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16428 sha256=6cb74d2db2d0b198b300a0768fb878dbd4e94540024b6d738b852a6f0c368d02
  Stored in directory: /root/.cache/pip/wheels/03/3f/e9/911b1bc46869644912bda90a56bcf7b960f20b5187feea3baf
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


## fine tuning된 모델 로드

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torchvision import models, datasets
import os
from tqdm.auto import tqdm
from PIL import Image, ImageFile
import numpy as np
import matplotlib.pyplot as plt
from efficientnet_pytorch import EfficientNet
import csv
import pandas as pd
import copy
from google.colab import drive
import pickle

# Runtime setup: compute device, Pillow loading flags, and the Drive mount.

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Let Pillow load truncated files and lift its maximum-pixel-count limit.
ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = None

drive.mount('/content/drive')

# Define the MoCov2 model
class MoCov2Model(nn.Module):
    """Encoder + projection head with momentum copies and a key queue.

    The module holds:
      * ``encoder``: ``base_encoder`` followed by ``Linear(1000, 128)``; its
        output is the feature vector returned when the projection head is
        skipped.
      * ``projection_head``: ``128 -> 512 -> projection_dim`` MLP with dropout.
      * ``momentum_encoder`` / ``momentum_projection_head``: deep copies of the
        two modules above with gradients disabled; updated only through
        :meth:`_momentum_update`.
      * ``queue``: a ``(queue_size, projection_dim)`` tensor used as a ring
        buffer of keys, written by :meth:`enqueue_dequeue`.

    Args:
        base_encoder: Backbone module. It must emit 1000 features per sample,
            because it feeds ``nn.Linear(1000, 128)``.
        projection_dim: Output width of the projection head and of each queue
            entry.
        temperature: Stored on the instance as ``self.temperature``; this class
            does not use it itself.
        dropout_rate: Dropout probability at the end of the projection head.
        queue_size: Number of keys the queue holds.
        momentum: Weight kept from the momentum network on each momentum
            update.

    Note:
        ``queue`` is a plain tensor attribute placed on the module-level
        ``device``. It is deliberately not registered as a buffer, so the
        ``state_dict`` keys stay exactly as they were and existing checkpoints
        keep loading.
    """

    def __init__(self, base_encoder, projection_dim=128, temperature=0.07, dropout_rate=0.5, queue_size=384, momentum=0.99):
        super().__init__()

        # Feature extractor, used for both training and inference.
        # Backbone + a linear layer that reduces the output to 128 dimensions,
        # which keeps the feature vectors stored in the DB small.
        self.encoder = nn.Sequential(
            base_encoder,
            nn.Linear(1000, 128)
        )

        # Projection head applied on top of the encoder output. Per the
        # original notes it serves the contrastive loss during training and is
        # skipped at inference time (forward(..., with_projection_head=False)).
        self.projection_head = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, projection_dim),
            nn.Dropout(dropout_rate)
        )

        # Momentum copies (deep copies, so no parameters are shared).
        self.momentum_encoder = copy.deepcopy(self.encoder)
        self.momentum_projection_head = copy.deepcopy(self.projection_head)

        # Freeze the momentum copies; they change only via _momentum_update().
        for param in self.momentum_encoder.parameters():
            param.requires_grad = False
        for param in self.momentum_projection_head.parameters():
            param.requires_grad = False

        # Remaining state.
        self.queue = torch.zeros(queue_size, projection_dim).to(device)
        self.queue_ptr = 0
        self.momentum = momentum
        # Kept so the constructor argument is not silently dropped.
        self.temperature = temperature

    def _momentum_update(self):
        """Blend the online weights into the momentum copies.

        For each parameter pair:
        ``k = k * momentum + q * (1 - momentum)``.
        """
        with torch.no_grad():
            for param_q, param_k in zip(self.encoder.parameters(), self.momentum_encoder.parameters()):
                param_k.data = param_k.data * self.momentum + param_q.data * (1. - self.momentum)
            for param_q, param_k in zip(self.projection_head.parameters(), self.momentum_projection_head.parameters()):
                param_k.data = param_k.data * self.momentum + param_q.data * (1. - self.momentum)

    def forward(self, x, with_projection_head=True):
        """Encode ``x``; optionally pass the result through the projection head.

        Args:
            x: Input batch for ``self.encoder``.
            with_projection_head: When False, return the raw encoder output
                (the 128-dimensional feature vector).

        Returns:
            The projected tensor, or the encoder output when
            ``with_projection_head`` is False.
        """
        x = self.encoder(x)
        if with_projection_head:
            x = self.projection_head(x)
        return x

    def enqueue_dequeue(self, keys):
        """Write ``keys`` into the queue at the current pointer, wrapping around.

        The oldest entries are overwritten. ``self.queue_ptr`` is advanced to
        the slot after the last written key.

        Args:
            keys: Tensor of shape ``(batch, projection_dim)``. A batch that
                would wrap around the queue more than once is not supported.
        """
        # detach(): the queue must not carry autograd history; writing a tensor
        # that requires grad into it would make the queue require grad and keep
        # the graph alive.
        # to(queue.device): keep the keys on the device the queue lives on
        # rather than forcing a round trip through the CPU.
        keys = keys.detach().to(self.queue.device)
        batch_size = keys.size(0)
        queue_size = self.queue.size(0)
        ptr = int(self.queue_ptr)

        # Replace the keys at ptr (dequeue and enqueue).
        space_left = queue_size - ptr
        if space_left < batch_size:
            # Not enough room before the end of the queue: fill the tail, then
            # continue from the start with the remaining keys.
            overflow = batch_size - space_left
            self.queue[ptr:] = keys[:space_left]
            self.queue[:overflow] = keys[space_left:]
            ptr = overflow
        else:
            self.queue[ptr:ptr + batch_size] = keys
            ptr = (ptr + batch_size) % queue_size  # move pointer

        self.queue_ptr = ptr

# Load the dataset mean & std saved during training
file_path = '/content/drive/MyDrive/final/dataset_mean_std.pkl'

# NOTE: pickle.load() can execute arbitrary code contained in the file; only
# load pickle files that you produced yourself.
with open(file_path, 'rb') as f:
    mean_, std_ = pickle.load(f)

efficientnet_b2 = EfficientNet.from_pretrained('efficientnet-b2').to(device)
mocov2_model = MoCov2Model(efficientnet_b2).to(device)

# `map_location` is an argument of torch.load(), not of load_state_dict().
# Passing `device` maps the checkpoint onto whichever device is in use, so the
# same code works with and without a GPU.
checkpoint_path = '/content/drive/MyDrive/final/mocov2_best_model_231216.pth'
state_dict = torch.load(checkpoint_path, map_location=device)
mocov2_model.load_state_dict(state_dict)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b2-8bb594d6.pth
100%|██████████| 35.1M/35.1M [00:00<00:00, 63.5MB/s]


Loaded pretrained weights for efficientnet-b2


In [None]:
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

## feature vector 추출 후 DB에 저장

In [None]:
import torch
from torchvision import transforms
from PIL import Image
from tqdm.auto import tqdm
import os
import pickle
import sqlite3

# Use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Connect to the SQLite database
db_path = '/content/drive/MyDrive/final/Image_db_final.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Preprocessing applied to every image before it is fed to the model
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Same mean & std values that were used during training
    transforms.Normalize(mean_, std_),
])

# Return the names of the image files found in a folder
def get_image_filenames(folder_path):
    """List the entries of ``folder_path`` whose name ends in an image suffix.

    The suffix check is case-insensitive and accepts ``.png``, ``.jpg`` and
    ``.jpeg``. Names are returned as given by ``os.listdir`` (no directory
    prefix, same order).
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    matching_names = []
    for entry_name in os.listdir(folder_path):
        if entry_name.lower().endswith(image_suffixes):
            matching_names.append(entry_name)
    return matching_names

# Extract the feature vector of every dataset image and store it in the DB
def save_features(model, dataset_path, test_transform):
    """Run ``model`` over every image and insert one row per image into IMAGE.

    Uses the module-level ``device``, ``cursor`` and ``conn``. Each image is
    committed individually, so rows written before an interruption are kept.

    Args:
        model: Model called as ``model(batch, with_projection_head=False)``.
        dataset_path: Iterable of directories to scan for images.
        test_transform: Callable that turns a PIL image into a tensor.

    Returns:
        List of file names that could not be processed.
    """
    model = model.to(device)  # move the model to the active device
    model.eval()
    error = []

    # Stores file name, feature vector, bounding box and path.
    insert_query = "INSERT INTO IMAGE (filename, feature_vector, boundingbox, path) VALUES (?, ?, ?, ?);"

    with torch.no_grad():
        for image_directory in tqdm(dataset_path):
            # Last path component (e.g. 'Movie'). The stored path is built from
            # it so every category gets its own folder instead of all rows
            # pointing at 'Movie'.
            category = os.path.basename(os.path.normpath(image_directory))

            for img_filename in tqdm(get_image_filenames(image_directory)):
                img_path = os.path.join(image_directory, img_filename)

                try:
                    # Open inside `with` so the file handle is released as soon
                    # as the tensor has been built.
                    with Image.open(img_path) as img:
                        dataset_image = test_transform(img.convert('RGB')).unsqueeze(0).to(device)

                    # At inference time the projection head is skipped and the
                    # encoder output is used as the feature vector.
                    dataset_feature_vector = model(dataset_image, with_projection_head=False).cpu().numpy()

                    # The vector is stored as the raw bytes of the NumPy array.
                    stored_path = 'D:\\Poster\\' + category + '\\' + img_filename
                    cursor.execute(insert_query, (img_filename, dataset_feature_vector.tobytes(), None, stored_path))

                    # Print the id and path of the row that was just added.
                    cursor.execute("SELECT id, path FROM IMAGE ORDER BY id DESC LIMIT 1;")
                    new_row_info = cursor.fetchone()
                    print(f"New Row ID: {new_row_info[0]}, Path: {new_row_info[1]}")

                    conn.commit()

                except Exception as e:
                    # Best effort: report the failure, remember the file name,
                    # and continue with the next image.
                    print(f"Error processing '{img_filename}': {e}")
                    error.append(img_filename)

    return error

# Folders that contain the images
image_directories = [
    '/content/Poster/Book',
    '/content/Poster/Concert',
    '/content/Poster/Contest',
    '/content/Poster/Movie'
]

try:
    # Store the features; the names of the files that failed are returned.
    error_list = save_features(mocov2_model, image_directories, test_transform)

    # Refresh the SQL text backup of the database.
    # Connection.iterdump() yields the dump as SQL statements from inside
    # Python, so the backup no longer depends on the `sqlite3` command-line
    # tool, and a failure raises instead of passing unnoticed.
    sql_backup_path = '/content/drive/MyDrive/final/Image_db_final.sql'
    with open(sql_backup_path, 'w', encoding='utf-8') as sql_file:
        for statement in conn.iterdump():
            sql_file.write(statement + '\n')
finally:
    # Always release the database handles, even if something above failed.
    cursor.close()
    conn.close()


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/56787 [00:00<?, ?it/s]

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
New Row ID: 296258, Path: D:\Poster\Movie\MP33382.jpg
New Row ID: 296259, Path: D:\Poster\Movie\MP11301.jpg
New Row ID: 296260, Path: D:\Poster\Movie\MP12747.jpg
New Row ID: 296261, Path: D:\Poster\Movie\MP35926.jpg
New Row ID: 296262, Path: D:\Poster\Movie\MP55939.jpg
New Row ID: 296263, Path: D:\Poster\Movie\MP20884.jpg
New Row ID: 296264, Path: D:\Poster\Movie\MP25207.jpg
New Row ID: 296265, Path: D:\Poster\Movie\MP47246.jpg
New Row ID: 296266, Path: D:\Poster\Movie\MP6290.jpg
New Row ID: 296267, Path: D:\Poster\Movie\MP48805.jpg
New Row ID: 296268, Path: D:\Poster\Movie\MP8251.jpg
New Row ID: 296269, Path: D:\Poster\Movie\MP3566.jpg
New Row ID: 296270, Path: D:\Poster\Movie\MP54382.jpg
New Row ID: 296271, Path: D:\Poster\Movie\MP40660.jpg
New Row ID: 296272, Path: D:\Poster\Movie\MP23918.jpg
New Row ID: 296273, Path: D:\Poster\Movie\MP7170.jpg
New Row ID: 296274, Path: D:\Poster\Movie\MP34398.jpg
New Row ID: 296275, Path: D:\Poster\

In [None]:
# Check the number of rows in the IMAGE table

db_path = '/content/drive/MyDrive/final/Image_db_final.db'
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    row_count_query = "SELECT COUNT(*) FROM IMAGE;"
    cursor.execute(row_count_query)
    row_count = cursor.fetchone()[0]
finally:
    # Close the connection so this cell does not leave the database file open.
    conn.close()

print(f"The number of rows in the 'IMAGE' table: {row_count}")

The number of rows in the 'IMAGE' table: 275969


# ANN 인덱스 구축

In [None]:
!pip install annoy
from annoy import AnnoyIndex

# 특징 벡터의 차원
f = 128

# Annoy 인덱스 생성
db_path = '/content/drive/MyDrive/final/Image_db_final.db'
annoy_index = AnnoyIndex(f, 'angular')
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# 특징 벡터 가져오기
cursor.execute('SELECT feature_vector FROM IMAGE')

# 특징 벡터를 ANN 인덱스에 추가
for row in cursor:
    id, feature_vector_blob = row
    feature_vector = np.frombuffer(feature_vector_blob, dtype=np.float32)
    t.add_item(id, feature_vector)

annoy_index.build(10)
annoy_index.save('/content/drive/MyDrive/final/annoy_index.ann')

conn.close()

