# Perform data preprocessing for master-oez61

In [None]:
!rm -r data_cls/
!rm -r classification_raw_data/

In [None]:
!mkdir data_cls
!unzip data.zip -d ./data_cls

In [None]:
import os
from os import path


def malformat_filter(paths):
    """Delete label/image pairs whose label file is not plain 5-value format.

    A ``.txt`` file in ``<base>/labels`` is treated as malformed when any of
    its lines holds more than five whitespace-separated values. The file is
    removed together with ``<base>/images/<same stem>.jpg`` when that image
    exists.

    Args:
        paths: Iterable of dataset split directories, each expected to
            contain a ``labels`` and an ``images`` sub-directory.

    Raises:
        OSError: If a ``labels`` directory cannot be listed, or a file cannot
            be read or removed.
    """
    for base in paths:
        labels_dir = path.join(base, "labels")
        # Derive the image folder from the split root rather than by text
        # substitution, so a parent folder containing "labels" is not altered.
        images_dir = path.join(base, "images")
        targets = []

        for fname in os.listdir(labels_dir):
            stem, ext = path.splitext(fname)
            if ext != ".txt":
                continue

            with open(path.join(labels_dir, fname), "r") as f:
                # split() without an argument ignores repeated and trailing
                # whitespace, so "a b c d e \n" still counts as five values.
                if any(len(line.split()) > 5 for line in f):
                    targets.append(stem)

        for stem in targets:
            os.remove(path.join(labels_dir, stem + ".txt"))

            # The image may be absent; that must not abort the clean-up.
            image_path = path.join(images_dir, stem + ".jpg")
            if path.isfile(image_path):
                os.remove(image_path)


In [None]:
import os
from os import path

# Numbers available under each one-letter key; "z" stops at 7, the rest at 9.
tiles = {
    "p": list(range(1, 10)),
    "s": list(range(1, 10)),
    "m": list(range(1, 10)),
    "z": list(range(1, 8)),
}


def create_dir():
    """Create one training folder per entry of ``tiles``.

    Folders are named ``<number><key>`` (for example ``1p``) and live under
    ``classification_raw_data/train``. Folders that already exist are left
    untouched.
    """
    train_root = path.join("classification_raw_data", "train")
    class_names = [
        str(number) + key
        for key, numbers in tiles.items()
        for number in numbers
    ]
    for class_name in class_names:
        os.makedirs(path.join(train_root, class_name), exist_ok=True)


In [None]:
import os
from os import path
import yaml
import cv2
from datetime import datetime


def crop_and_save(base_path):
    """Crop every labelled box of the training split into per-class folders.

    Class names are read from the ``names`` entry of ``<base_path>/data.yaml``.
    For each file in ``<base_path>/train/images`` the same-named ``.txt`` file
    in ``<base_path>/train/labels`` is parsed; each line is expected to start
    with ``class_id x_center y_center width height`` with the box values
    normalised to the image size. One JPEG crop per line is written to
    ``./classification_raw_data/train/<class name>/``.

    Unreadable images, missing label files, malformed lines, unknown class
    ids and boxes that collapse to zero area are reported and skipped.

    Args:
        base_path: Dataset root directory.

    Raises:
        OSError: If ``data.yaml`` or the image directory cannot be read.
        KeyError: If ``data.yaml`` has no ``names`` entry.
    """
    base_path = path.abspath(base_path)

    with open(path.join(base_path, "data.yaml"), "r") as f:
        labels = yaml.safe_load(f)["names"]
    print(labels)

    # Crops labelled 0m/0p/0s are stored in the 5m/5p/5s folders.
    merged_names = {"0m": "5m", "0p": "5p", "0s": "5s"}

    destination_root = path.join(".", "classification_raw_data", "train")
    images_dir = path.join(base_path, "train", "images")
    labels_dir = path.join(base_path, "train", "labels")

    for img_file in os.listdir(images_dir):
        img_name = path.splitext(img_file)[0]

        # Open the file under its real name so images with an extension other
        # than .jpg are processed as well.
        img_path = path.join(images_dir, img_file)
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: could not read image {img_path}")
            continue

        h, w = img.shape[:2]

        label_file = path.join(labels_dir, f"{img_name}.txt")
        if not path.exists(label_file):
            print(f"Warning: label file not found for {img_name}")
            continue

        with open(label_file, "r") as f:
            rows = f.readlines()

        for row in rows:
            if not row.strip():
                continue

            try:
                cls_id, x_c, y_c, bw, bh = map(float, row.split()[:5])
                cls_id = int(cls_id)
            except ValueError:
                # Fewer than five fields, or a field that is not a number.
                print(f"Warning: malformed label line in {label_file}: {row.strip()!r}")
                continue

            try:
                cls_name = labels[cls_id]
            except (IndexError, KeyError):
                print(f"Warning: unknown class id {cls_id} in {label_file}")
                continue
            cls_name = merged_names.get(cls_name, cls_name)

            # Denormalize
            x1 = int((x_c - bw / 2) * w)
            y1 = int((y_c - bh / 2) * h)
            x2 = int((x_c + bw / 2) * w)
            y2 = int((y_c + bh / 2) * h)

            # Clamp
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)

            # An empty array cannot be encoded; cv2.imwrite raises on it.
            if x2 <= x1 or y2 <= y1:
                print(f"Warning: empty box in {label_file}: {row.strip()!r}")
                continue

            crop = img[y1:y2, x1:x2]

            dest_dir = path.join(destination_root, cls_name)
            os.makedirs(dest_dir, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            crop_path = path.join(dest_dir, f"{img_name}_{timestamp}.jpg")

            # imwrite signals failure through its return value.
            if cv2.imwrite(crop_path, crop):
                print(f"Saved: {crop_path}")
            else:
                print(f"Warning: could not write {crop_path}")


In [None]:
# Clean the label files of every split, create the class folders, then crop
# the training images.
malformat_filter([f"data_cls/{split}/" for split in ("train", "valid", "test")])
create_dir()
crop_and_save("data_cls/")

# Perform data preprocessing for mahjong-tiles-non-riichi-xe0ya

In [None]:
!rm -r data_cls1/
!mkdir data_cls1/
!unzip data_cls.zip -d data_cls1/

In [None]:
import os
from os import path
import yaml
import cv2
from datetime import datetime


def crop_and_save_mapping(base_path):
    """Crop every labelled box of the training split, renaming the classes.

    Works on the same layout as a plain crop run: class names come from the
    ``names`` entry of ``<base_path>/data.yaml``, images from
    ``<base_path>/train/images`` and label files from
    ``<base_path>/train/labels``. Each label line is expected to start with
    ``class_id x_center y_center width height`` with the box values
    normalised to the image size. Before a crop is stored, the first two
    characters of its class name are swapped, and the crop is written to
    ``./classification_raw_data/train/<swapped name>/``.

    Unreadable images, missing label files, malformed lines, unknown class
    ids and boxes that collapse to zero area are reported and skipped.

    Args:
        base_path: Dataset root directory.

    Raises:
        OSError: If ``data.yaml`` or the image directory cannot be read.
        KeyError: If ``data.yaml`` has no ``names`` entry.
        IndexError: If a class name has fewer than two characters.
    """
    def mapping(x: str) -> str:
        # Swap the first two characters of the dataset's class name.
        return x[1] + x[0]

    base_path = path.abspath(base_path)

    with open(path.join(base_path, "data.yaml"), "r") as f:
        labels = yaml.safe_load(f)["names"]
    print(labels)

    destination_root = path.join(".", "classification_raw_data", "train")
    images_dir = path.join(base_path, "train", "images")
    labels_dir = path.join(base_path, "train", "labels")

    for img_file in os.listdir(images_dir):
        img_name = path.splitext(img_file)[0]

        # Open the file under its real name so images with an extension other
        # than .jpg are processed as well.
        img_path = path.join(images_dir, img_file)
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: could not read image {img_path}")
            continue

        h, w = img.shape[:2]

        label_file = path.join(labels_dir, f"{img_name}.txt")
        if not path.exists(label_file):
            print(f"Warning: label file not found for {img_name}")
            continue

        with open(label_file, "r") as f:
            rows = f.readlines()

        for row in rows:
            if not row.strip():
                continue

            try:
                cls_id, x_c, y_c, bw, bh = map(float, row.split()[:5])
                cls_id = int(cls_id)
            except ValueError:
                # Fewer than five fields, or a field that is not a number.
                print(f"Warning: malformed label line in {label_file}: {row.strip()!r}")
                continue

            try:
                raw_name = labels[cls_id]
            except (IndexError, KeyError):
                print(f"Warning: unknown class id {cls_id} in {label_file}")
                continue
            cls_name = mapping(raw_name)

            # Denormalize
            x1 = int((x_c - bw / 2) * w)
            y1 = int((y_c - bh / 2) * h)
            x2 = int((x_c + bw / 2) * w)
            y2 = int((y_c + bh / 2) * h)

            # Clamp
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)

            # An empty array cannot be encoded; cv2.imwrite raises on it.
            if x2 <= x1 or y2 <= y1:
                print(f"Warning: empty box in {label_file}: {row.strip()!r}")
                continue

            crop = img[y1:y2, x1:x2]

            dest_dir = path.join(destination_root, cls_name)
            os.makedirs(dest_dir, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            crop_path = path.join(dest_dir, f"{img_name}_{timestamp}.jpg")

            # imwrite signals failure through its return value.
            if cv2.imwrite(crop_path, crop):
                print(f"Saved: {crop_path}")
            else:
                print(f"Warning: could not write {crop_path}")


In [None]:
# Same three steps for the second dataset, using the label-renaming cropper.
malformat_filter(["data_cls1/" + split + "/" for split in ("train", "valid", "test")])
create_dir()
crop_and_save_mapping("data_cls1/")

In [None]:
!rm -r data_cls/
!rm -r classification_raw_data/

In [None]:
!mkdir data_cls
!unzip data.zip -d ./data_cls

In [None]:
from os.path import split
import os

# Filter out malformed label files
# For YOLO without OBB, each line in a label file contains only 5 values
def malformat_filter(paths):
    """Remove label files with over-long lines, plus their ``.jpg`` images.

    Every ``.txt`` file under ``<path>/labels`` is scanned. If any line has
    more than five whitespace-separated values, the file is deleted, and so
    is ``<path>/images/<same stem>.jpg`` when it exists.

    Args:
        paths: Iterable of split directories (with or without a trailing
            slash), each holding a ``labels`` and an ``images`` folder.

    Raises:
        OSError: If a ``labels`` folder cannot be listed, or a file cannot be
            read or removed.
    """
    for path in paths:
        labels_dir = os.path.join(path, 'labels')
        # Built from the split root: replacing the text "labels" in the full
        # path could hit a parent folder name instead.
        images_dir = os.path.join(path, 'images')

        targets = []
        for fname in os.listdir(labels_dir):
            stem, ext = os.path.splitext(fname)
            if ext != '.txt':
                continue
            with open(os.path.join(labels_dir, fname), 'r') as f:
                # split() with no argument does not produce empty or
                # newline-only fields, so trailing spaces are harmless.
                if any(len(line.split()) > 5 for line in f):
                    targets.append(stem)

        for stem in targets:
            os.remove(os.path.join(labels_dir, stem + '.txt'))
            image_path = os.path.join(images_dir, stem + '.jpg')
            # A label without an image is still removed without failing.
            if os.path.isfile(image_path):
                os.remove(image_path)

In [None]:
# Create directories for classification task
import os

# One list of numbers per key: 1-9 for 'p', 's' and 'm', 1-7 for 'z'.
tiles = {
    key: list(range(1, stop))
    for key, stop in (('p', 10), ('s', 10), ('m', 10), ('z', 8))
}

def create_dir():
    """Make a ``classification_raw_data/train/<number><key>`` folder for
    every number listed in ``tiles``; existing folders are kept as they are.
    """
    for key in tiles:
        for v in tiles[key]:
            dir_name = f"{v}{key}"
            os.makedirs(f"classification_raw_data/train/{dir_name}", exist_ok=True)

In [None]:
# Traverse all training image + label file pairs
# Crop each marked region
# Save it to the corresponding location
import yaml
import cv2
from datetime import datetime


def crop_and_save(path):
    """Cut each labelled box out of the training images and file it by class.

    Class names are taken from the ``names`` entry of ``<path>/data.yaml``.
    For every file in ``<path>/train/images`` the label file with the same
    stem in ``<path>/train/labels`` is read; each line is expected to start
    with ``class_id x_center y_center width height`` with the box values
    normalised to the image size. Every box is saved as a JPEG in
    ``./classification_raw_data/train/<class name>/``.

    Images that cannot be decoded, images without a label file, malformed
    lines, unknown class ids and zero-area boxes are reported and skipped.

    Args:
        path: Dataset root directory, with or without a trailing slash.

    Raises:
        OSError: If ``data.yaml`` or the image folder cannot be read.
        KeyError: If ``data.yaml`` has no ``names`` entry.
    """
    with open(os.path.join(path, 'data.yaml'), 'r') as f:
        labels = yaml.safe_load(f)['names']
    print(labels)

    # Special case: 0m, 0p, 0s are filed under 5m, 5p, 5s.
    special_cases = {'0m': '5m', '0p': '5p', '0s': '5s'}

    destination_root = "./classification_raw_data/train"
    images_dir = os.path.join(path, 'train', 'images')
    labels_dir = os.path.join(path, 'train', 'labels')

    for img_file in os.listdir(images_dir):
        # splitext copes with extensions of any length (.jpeg, .png, ...).
        img_name = os.path.splitext(img_file)[0]

        img_path = os.path.join(images_dir, img_file)
        img = cv2.imread(img_path)
        if img is None:
            # imread returns None instead of raising when decoding fails.
            print(f"Warning: could not read image {img_path}")
            continue
        h, w = img.shape[:2]

        label_file = os.path.join(labels_dir, f'{img_name}.txt')
        if not os.path.isfile(label_file):
            print(f"Warning: label file not found for {img_name}")
            continue

        with open(label_file, 'r') as f:
            tiles = f.readlines()

        for tile in tiles:
            # Lines keep their newline, so test the stripped text for blanks.
            if not tile.strip():
                continue

            try:
                cls_id, x_c, y_c, bw, bh = map(float, tile.split()[:5])
                cls_id = int(cls_id)
            except ValueError:
                print(f"Warning: malformed label line in {label_file}: {tile.strip()!r}")
                continue

            try:
                cls_name = labels[cls_id]
            except (IndexError, KeyError):
                print(f"Warning: unknown class id {cls_id} in {label_file}")
                continue
            cls_name = special_cases.get(cls_name, cls_name)

            # Denormalize
            x1 = int((x_c - bw/2) * w)
            y1 = int((y_c - bh/2) * h)
            x2 = int((x_c + bw/2) * w)
            y2 = int((y_c + bh/2) * h)

            # Clamp
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)

            # cv2.imwrite rejects empty arrays, so drop collapsed boxes.
            if x2 <= x1 or y2 <= y1:
                print(f"Warning: empty box in {label_file}: {tile.strip()!r}")
                continue

            crop = img[y1:y2, x1:x2]

            # The class folder may not exist yet.
            dest_dir = os.path.join(destination_root, cls_name)
            os.makedirs(dest_dir, exist_ok=True)

            # Use a timestamp without spaces or colons so the name is valid
            # on every file system; prefix the source image name.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            crop_path = os.path.join(dest_dir, f"{img_name}_{timestamp}.jpg")

            if cv2.imwrite(crop_path, crop):
                print(f"Saved: {crop_path}")
            else:
                print(f"Warning: could not write {crop_path}")

In [None]:
# Filter all three splits, prepare the class folders, crop the training set.
malformat_filter(['data_cls/%s/' % split for split in ('train', 'valid', 'test')])
create_dir()
crop_and_save("data_cls/")

# Perform data preprocessing for mahjong-tiles-non-riichi-xe0ya

In [None]:
!rm -r data_cls1/
!mkdir data_cls1/
!unzip data_cls.zip -d data_cls1/

In [None]:
# Map the labels
import yaml
def crop_and_save_mapping(path):
    """Cut each labelled box out of the training images, renaming classes.

    Class names come from the ``names`` entry of ``<path>/data.yaml``, images
    from ``<path>/train/images`` and label files (same stem, ``.txt``) from
    ``<path>/train/labels``. Each label line is expected to start with
    ``class_id x_center y_center width height`` with the box values
    normalised to the image size. The first two characters of the class name
    are swapped, and the crop is saved as a JPEG in
    ``./classification_raw_data/train/<swapped name>/``.

    Images that cannot be decoded, images without a label file, malformed
    lines, unknown class ids and zero-area boxes are reported and skipped.

    Args:
        path: Dataset root directory, with or without a trailing slash.

    Raises:
        OSError: If ``data.yaml`` or the image folder cannot be read.
        KeyError: If ``data.yaml`` has no ``names`` entry.
        IndexError: If a class name has fewer than two characters.
    """
    def mapping(x: str):
        # Swap the first two characters of the dataset's class name.
        return x[1] + x[0]

    with open(os.path.join(path, 'data.yaml'), 'r') as f:
        labels = yaml.safe_load(f)['names']
    print(labels)

    destination_root = "./classification_raw_data/train"
    images_dir = os.path.join(path, 'train', 'images')
    labels_dir = os.path.join(path, 'train', 'labels')

    for img_file in os.listdir(images_dir):
        # splitext copes with extensions of any length (.jpeg, .png, ...).
        img_name = os.path.splitext(img_file)[0]

        img_path = os.path.join(images_dir, img_file)
        img = cv2.imread(img_path)
        if img is None:
            # imread returns None instead of raising when decoding fails.
            print(f"Warning: could not read image {img_path}")
            continue
        h, w = img.shape[:2]

        label_file = os.path.join(labels_dir, f'{img_name}.txt')
        if not os.path.isfile(label_file):
            print(f"Warning: label file not found for {img_name}")
            continue

        with open(label_file, 'r') as f:
            tiles = f.readlines()

        for tile in tiles:
            # Lines keep their newline, so test the stripped text for blanks.
            if not tile.strip():
                continue

            try:
                cls_id, x_c, y_c, bw, bh = map(float, tile.split()[:5])
                cls_id = int(cls_id)
            except ValueError:
                print(f"Warning: malformed label line in {label_file}: {tile.strip()!r}")
                continue

            try:
                raw_name = labels[cls_id]
            except (IndexError, KeyError):
                print(f"Warning: unknown class id {cls_id} in {label_file}")
                continue
            cls_name = mapping(raw_name)

            # Denormalize
            x1 = int((x_c - bw/2) * w)
            y1 = int((y_c - bh/2) * h)
            x2 = int((x_c + bw/2) * w)
            y2 = int((y_c + bh/2) * h)

            # Clamp
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)

            # cv2.imwrite rejects empty arrays, so drop collapsed boxes.
            if x2 <= x1 or y2 <= y1:
                print(f"Warning: empty box in {label_file}: {tile.strip()!r}")
                continue

            crop = img[y1:y2, x1:x2]

            # The class folder may not exist yet.
            dest_dir = os.path.join(destination_root, cls_name)
            os.makedirs(dest_dir, exist_ok=True)

            # Use a timestamp without spaces or colons so the name is valid
            # on every file system; prefix the source image name.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            crop_path = os.path.join(dest_dir, f"{img_name}_{timestamp}.jpg")

            if cv2.imwrite(crop_path, crop):
                print(f"Saved: {crop_path}")
            else:
                print(f"Warning: could not write {crop_path}")
    

In [None]:
# Filter all three splits of the second dataset, prepare the class folders,
# then crop with class renaming.
malformat_filter(['data_cls1/{}/'.format(split) for split in ('train', 'valid', 'test')])
create_dir()
crop_and_save_mapping("data_cls1/")