In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Goal of the project
#
# Input: an image from the test set.
# The model outputs the predicted labels (e.g., [1, 8, 19]).
# These labels are then mapped to a caption using either:
#   - a pre-defined lookup table of captions associated with the labels, or
#   - a text generation model.

# Single source of truth for the dataset location: every path below is derived
# from it, so moving the dataset only requires changing this one line.
dataset_root = '/kaggle/input/multi-label-classification-competition-2024/COMP5329S1A2Dataset'

train = f'{dataset_root}/train.csv'      # training CSV
test = f'{dataset_root}/test.csv'        # test CSV
image_dir = f'{dataset_root}/data'       # folder holding the image files

In [None]:
import re
from io import StringIO

In [None]:
# Some rows contain double quotes inside a field (presumably the caption text),
# which would break CSV parsing. Every quote that is preceded by a non-comma
# character and followed by at least one more character before the line break
# gets a "/" inserted in front of it, and pandas is told that "/" is the
# escape character.
with open(train) as file:
    lines = []
    for raw_row in file:
        lines.append(re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', raw_row))
train_df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")

In [None]:
# Same quote repair as for the training file: a "/" is inserted before every
# double quote that is preceded by a non-comma character and followed by at
# least one more character before the line break; "/" is then used as the
# escape character when parsing.
with open(test) as file:
    lines = []
    for raw_row in file:
        lines.append(re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', raw_row))
test_df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")

In [None]:
# Preview the first 20 rows of the parsed training table.
train_df.head(20)

In [None]:
# Preview the first rows of the parsed test table.
test_df.head()

In [None]:
# Number of distinct raw values in the Labels column, i.e. distinct label
# *combinations* rather than distinct individual labels.
# dropna=False keeps a missing value counted as its own entry.
train_df['Labels'].nunique(dropna=False)

In [None]:
# Split each space-separated label string into its tokens, flatten to one
# token per row, and keep the distinct tokens (still strings at this point).
label_tokens = train_df['Labels'].apply(lambda labels: labels.split())
all_labels = label_tokens.explode().unique()

# Number of distinct individual labels (the ids are expected to lie in 1..19,
# with 12 absent)
len(all_labels)

In [None]:

# Candidate label ids 1..19 as strings, to match the string tokens in all_labels.
all_possible_labels = {str(label_id) for label_id in range(1, 20)}
missing_labels = all_possible_labels.difference(all_labels)

# Show which candidate ids never occur in the training labels
missing_labels

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
# Convert every image's labels into a multi-hot (binary) matrix.

# Parse each space-separated label string (e.g. "1 8 19") into a list of ints.
# The column is overwritten in place, so on a re-run of this cell the values
# are already lists; those are passed through unchanged instead of crashing
# on `.split()`, which keeps the cell idempotent.
train_df['Labels'] = train_df['Labels'].apply(
    lambda x: list(map(int, x.split())) if isinstance(x, str) else list(x)
)

# all classes: label ids 1..19, excluding class 12
all_classes = [i for i in range(1, 20) if i != 12]

# binary matrix: one row per training row, one column per entry of all_classes
mlb = MultiLabelBinarizer(classes=all_classes)
binary_labels = mlb.fit_transform(train_df['Labels'])

# Persist the matrix so it can be reloaded later without re-parsing the CSV.
np.save('train_labels.npy', binary_labels)
print(f"Binary labels saved: {binary_labels.shape}")

In [None]:
# Read the saved matrix back from disk and show its contents followed by its
# shape, to confirm the save/load round trip.
loaded_labels = np.load('train_labels.npy')
print(loaded_labels, loaded_labels.shape, sep='\n')

In [None]:
# Spot-check one row: compare its label list with the matching matrix row.
sample_index = 0  # index 0, i.e. the first training image
original_labels = train_df['Labels'][sample_index]
binary_row = binary_labels[sample_index]
print("Original Labels:", original_labels)
print("Binary Representation:", binary_row)


In [None]:
import os
import torch
from torchvision import transforms
from PIL import Image
# Processing all images at once repeatedly crashed (restarted) the kernel,
# so the images are processed in chunks of 5,000 at a time.

# Preprocessing pipeline applied to every image before it is saved.
_TARGET_SIZE = (224, 224)
# NOTE(review): these per-channel values match the ImageNet statistics commonly
# used with torchvision's pretrained models - presumably chosen for a
# pretrained backbone; confirm against the model used downstream.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

_pipeline_steps = [
    transforms.Resize(_TARGET_SIZE),                              # fixed spatial size
    transforms.ToTensor(),                                        # PIL image -> tensor
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),   # per-channel normalisation
]
image_transform = transforms.Compose(_pipeline_steps)

def preprocess_images_in_chunk(image_dir, filenames, chunk_size, output_dir):
    """
    Preprocess images in fixed-size chunks and save each chunk as its own file.

    Every image is opened from ``image_dir``, converted to RGB and passed
    through the module-level ``image_transform`` pipeline. The tensors of one
    chunk are stacked and written with ``torch.save`` to
    ``<output_dir>/preprocessed_images_<start>_<end>.pt`` (``end`` exclusive).
    Only the tensors of the chunk currently being built are held in the list,
    which is what keeps memory usage bounded.

    Args:
        image_dir: Path to the folder containing images.
        filenames: Sequence of image filenames relative to ``image_dir``
            (must support ``len()`` and slicing).
        chunk_size: Number of images to process in one chunk; must be positive.
        output_dir: Directory to save the preprocessed chunks (created if missing).

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        # Zero would fail inside range() with an unrelated-looking message and
        # a negative value would silently process nothing at all.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    os.makedirs(output_dir, exist_ok=True)
    total_files = len(filenames)

    for start_idx in range(0, total_files, chunk_size):
        end_idx = min(start_idx + chunk_size, total_files)
        chunk_filenames = filenames[start_idx:end_idx]

        image_data = []
        for img_name in chunk_filenames:
            img_path = os.path.join(image_dir, img_name)
            # The context manager closes the image file deterministically
            # instead of leaving it to the garbage collector.
            with Image.open(img_path) as img:
                image_data.append(image_transform(img.convert('RGB')))

        # Save the current chunk
        chunk_file = os.path.join(output_dir, f'preprocessed_images_{start_idx}_{end_idx}.pt')
        torch.save(torch.stack(image_data), chunk_file)
        print(f"Saved chunk {start_idx}-{end_idx} to {chunk_file}")


# Run the chunked preprocessing over every image listed in the training table.
chunk_size = 5000  # Process 5,000 images at a time
output_dir = './preprocessed_train_chunks'
train_filenames = train_df['ImageID'].tolist()

preprocess_images_in_chunk(
    image_dir=image_dir,
    filenames=train_filenames,
    chunk_size=chunk_size,
    output_dir=output_dir,
)


In [None]:
def preprocess_test_images_in_parts(image_dir, filenames, output_dir, part_size=1000):
    """
    Preprocess test images and save them in fixed-size parts.

    Every image is opened from ``image_dir``, converted to RGB and passed
    through the module-level ``image_transform`` pipeline. The tensors of one
    part are stacked and written with ``torch.save`` to
    ``<output_dir>/test_images_part_<start>_<end>.pt`` (``end`` exclusive).

    Args:
        image_dir: Path to the folder containing images.
        filenames: Sequence of test image filenames relative to ``image_dir``
            (must support ``len()`` and slicing).
        output_dir: Directory to save parts (created if missing).
        part_size: Number of images per part; must be positive.

    Raises:
        ValueError: If ``part_size`` is not a positive number.
    """
    if part_size <= 0:
        # Zero would fail inside range() with an unrelated-looking message and
        # a negative value would silently process nothing at all.
        raise ValueError(f"part_size must be a positive integer, got {part_size!r}")

    os.makedirs(output_dir, exist_ok=True)
    total_files = len(filenames)

    for start_idx in range(0, total_files, part_size):
        end_idx = min(start_idx + part_size, total_files)
        part_filenames = filenames[start_idx:end_idx]

        # Process images in the current part
        image_data = []
        for img_name in part_filenames:
            img_path = os.path.join(image_dir, img_name)
            # The context manager closes the image file deterministically
            # instead of leaving it to the garbage collector.
            with Image.open(img_path) as img:
                image_data.append(image_transform(img.convert('RGB')))

        part_tensor = torch.stack(image_data)
        part_file = os.path.join(output_dir, f'test_images_part_{start_idx}_{end_idx}.pt')
        torch.save(part_tensor, part_file)
        print(f"Saved test images part {start_idx}-{end_idx} to {part_file}")