In [37]:
import numpy as np
import os
import shutil
import json
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from collections import Counter
import logging
import sys


In [38]:
def setup_logger(name=__name__):
    """
    Return the logger called `name`, configured to write INFO+ records to stdout.

    The stdout handler is attached only when the logger has no handlers yet,
    so re-running this cell does not produce duplicated log lines.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured (e.g. the cell was executed before): reuse as-is.
        return log

    log.setLevel(logging.INFO)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    log.addHandler(console)
    return log

logger = setup_logger()

In [39]:
# Copy every image from the per-source sub-folders of `data_folder` into a single
# `_preped` folder. Each copy is renamed to "<source folder>_<original name>" so
# that identically named images from different folders cannot overwrite each other.
data_folder = '../data/pre-downloaded'
image_formats = ['.jpg', '.jpeg', '.png', '.bmp']
preped_folder = os.path.join(data_folder, '_preped')
os.makedirs(preped_folder, exist_ok=True)

for foldername in os.listdir(data_folder):
    folder_path = os.path.join(data_folder, foldername)
    # Skip loose files and the output folder itself.
    if not os.path.isdir(folder_path) or foldername == '_preped':
        continue
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        name, extension = os.path.splitext(filename)
        # Compare case-insensitively: `splitext` keeps the on-disk case, so an
        # exact match against the lower-case list would drop e.g. `IMG_01.JPG`.
        if extension.lower() in image_formats:
            # The original extension (including its case) is preserved in the copy.
            new_filename = f"{foldername}_{name}{extension}"
            new_file_path = os.path.join(preped_folder, new_filename)
            shutil.copy2(file_path, new_file_path)

In [40]:
# Create labels folder and save modified JSON.
# For every source folder: pick its JSON label export, rewrite each entry's
# `file_upload` to "<folder>_<name without hash prefix>", and store the result
# in `labels_folder` under the export's original file name.
labels_folder = '../data/_labels'
os.makedirs(labels_folder, exist_ok=True)
# Sub-folders of `data_folder` that are not per-source label exports.
non_annotator_folders = {'_labels', '_preped', 'consensus'}

for foldername in os.listdir(data_folder):
    folder_path = os.path.join(data_folder, foldername)
    if not os.path.isdir(folder_path) or foldername in non_annotator_folders:
        continue

    # Reset for every folder. Without this, a folder with no regular files would
    # silently reuse the previous folder's label file and rewrite it with the
    # wrong folder prefix.
    label_file = None
    # Sorted so the chosen file is deterministic when a folder holds several JSONs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path) and os.path.splitext(filename)[1] == '.json':
            label_file = file_path
            break

    if label_file is None:
        logger.info(f"No label file found in folder: {foldername}")
        continue

    logger.info(f"Label file found: {label_file}")
    with open(label_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    cnt = 0
    # Process each entry
    for entry in data:
        if 'file_upload' in entry:
            # Remove labelstudio hash prefix
            parts = entry['file_upload'].split('-', 1)  # Split only on first '-'
            if len(parts) > 1:
                entry['file_upload'] = f"{foldername}_{parts[1]}"
                cnt += 1

    # Get original filename and create new path
    original_filename = os.path.basename(label_file)
    new_label_path = os.path.join(labels_folder, original_filename)

    # Save the modified JSON
    with open(new_label_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    logger.info(f"Total entries updated: {cnt}")

2025-12-10 17:59:40,168 - INFO - Label file found: ../data/pre-downloaded\B8V41Y\b8v41y.json
2025-12-10 17:59:40,176 - INFO - Total entries updated: 20
2025-12-10 17:59:40,178 - INFO - Label file found: ../data/pre-downloaded\C6037J\C6037J.json
2025-12-10 17:59:40,188 - INFO - Total entries updated: 34
2025-12-10 17:59:40,190 - INFO - Label file found: ../data/pre-downloaded\D6AE9F\D6AE9F.json
2025-12-10 17:59:40,197 - INFO - Total entries updated: 22
2025-12-10 17:59:40,201 - INFO - No label file found in folder: ECSGGY
2025-12-10 17:59:40,203 - INFO - Label file found: ../data/pre-downloaded\FGWUFP\FGWUFP.json
2025-12-10 17:59:40,209 - INFO - Total entries updated: 20
2025-12-10 17:59:40,211 - INFO - Label file found: ../data/pre-downloaded\FO6K58\FO6K58_labels.json
2025-12-10 17:59:40,223 - INFO - Total entries updated: 32
2025-12-10 17:59:40,228 - INFO - No label file found in folder: GI9Y8B
2025-12-10 17:59:40,232 - INFO - Label file found: ../data/pre-downloaded\GK1XQ4\project-1-

In [41]:
# Match the file names with the labels.
# Builds `data_ready`: a list of (image file name, label) pairs for every label
# entry whose `file_upload` names a file that exists in `preped_folder`.
image_names = list(os.listdir(preped_folder))
# Set copy for constant-time membership tests inside the loop.
image_name_set = set(image_names)
data_ready = []
# Entries that name a known image but carry no usable label.
unlabelled_entries = 0
for label_filename in os.listdir(labels_folder):
    label_path = os.path.join(labels_folder, label_filename)
    with open(label_path, 'r', encoding='utf-8') as f:
        labels = json.load(f)
    for entry in labels:
        if 'file_upload' not in entry or entry['file_upload'] not in image_name_set:
            continue
        # An entry may have an empty/missing annotation list; indexing [0]
        # unconditionally would raise and abort the whole cell.
        annotations = entry.get('annotations') or []
        if not annotations:
            unlabelled_entries += 1
            continue
        result = annotations[0].get('result')
        if not result:
            # Empty result list (skipped silently, as before) or a missing result.
            continue
        # Only the first choice of the first result item is used as the label.
        choices = (result[0].get('value') or {}).get('choices')
        if not choices:
            unlabelled_entries += 1
            continue
        data_ready.append((entry['file_upload'], choices[0]))
logger.info(f"Total matched entries: {len(data_ready)}")
if unlabelled_entries:
    logger.warning(f"Entries skipped because they carry no label: {unlabelled_entries}")

2025-12-10 17:59:40,419 - INFO - Total matched entries: 264


In [42]:
# Rename the three non-standard label spellings to the class names used
# everywhere else in this notebook. `data_ready` is updated in place.
label_aliases = {
    'neutral': '2_Neutralis',
    'pronation': '1_Pronacio',
    'supination': '3_Szupinacio',
}
for idx, (image_name, raw_label) in enumerate(data_ready):
    if raw_label in label_aliases:
        data_ready[idx] = (image_name, label_aliases[raw_label])

In [43]:
# Load the list of consensus image paths (one per line) from the consensus folder.
consensus_file_path = os.path.join(data_folder, 'consensus')
consensus_file = os.path.join(consensus_file_path, 'anklealign-consensus.txt')
with open(consensus_file, 'r', encoding='utf-8') as f:
    consensus_images = f.read().splitlines()

# The image name is the last backslash-separated component of each line;
# lines without any backslash are ignored.
img_names = [line.split('\\')[-1] for line in consensus_images if '\\' in line]

# Count occurrences of each image name
img_counts = Counter(img_names)

# Names listed more than once are dropped entirely, not de-duplicated.
unique_consensus_image_names = [
    img_name for img_name in img_counts if img_counts[img_name] == 1
]

total_listed = len(img_names)
total_unique = len(unique_consensus_image_names)
logger.info(f"Total images in consensus: {total_listed}")
logger.info(f"Unique images (appearing exactly once): {total_unique}")
logger.info(f"Duplicate images removed: {total_listed - total_unique}")

2025-12-10 17:59:40,446 - INFO - Total images in consensus: 57
2025-12-10 17:59:40,447 - INFO - Unique images (appearing exactly once): 49
2025-12-10 17:59:40,448 - INFO - Duplicate images removed: 8


In [44]:
# Vote table: one row per unique consensus image, one counter column per class.
consensus_label_matrix = pd.DataFrame({
    'image': unique_consensus_image_names,
    '1_Pronacio': 0,
    '2_Neutralis': 0,
    '3_Szupinacio': 0
})
vote_columns = ('1_Pronacio', '2_Neutralis', '3_Szupinacio')

# Every non-empty JSON file in the consensus folder contributes votes.
for consensus_label_file in os.listdir(consensus_file_path):
    consensus_label_path = os.path.join(consensus_file_path, consensus_label_file)
    extension = os.path.splitext(consensus_label_file)[1]
    is_usable_export = extension == '.json' and os.path.getsize(consensus_label_path) != 0
    if not is_usable_export:
        continue

    with open(consensus_label_path, 'r', encoding='utf-8') as f:
        labels = json.load(f)

    for entry in labels:
        if 'file_upload' not in entry:
            continue
        img_name = entry['file_upload'].split('-', 1)[1]  # Remove hash prefix
        if img_name not in unique_consensus_image_names:
            continue
        result = entry['annotations'][0].get('result')
        if len(result) == 0:
            continue
        # Only the first choice of the first result item counts as the vote.
        label = result[0].get('value').get('choices')[0]
        if label in vote_columns:
            # The label string is also the name of the counter column to bump.
            consensus_label_matrix.loc[consensus_label_matrix['image'] == img_name, label] += 1

In [45]:
# Build the consensus set as (prepared image name, majority label) pairs.
unique_consensus_images = []
vote_columns = ['1_Pronacio', '2_Neutralis', '3_Szupinacio']
unique_name_set = set(unique_consensus_image_names)

# Rename the images as the prepared data
for img in consensus_images:
    parts = img.split('\\')
    if len(parts) < 2:
        continue
    # Index from the end, consistent with how the unique names were extracted
    # (last path component). For three-component paths this equals
    # parts[1]/parts[2]; indexing parts[2] directly raised IndexError on
    # two-component paths that the length guard let through.
    folder_name, image_name = parts[-2], parts[-1]
    if image_name not in unique_name_set:
        continue
    votes = consensus_label_matrix.loc[consensus_label_matrix['image'] == image_name, vote_columns]
    # NOTE(review): idxmax returns the first column on ties, so an image with
    # tied or zero votes is labelled with the earliest class column - confirm
    # this is the intended tie-break.
    label = votes.idxmax(axis=1).values[0]
    unique_consensus_images.append((f"{folder_name}_{image_name}", label))
logger.info(f"Total unique consensus images: {len(unique_consensus_images)}")

2025-12-10 17:59:41,260 - INFO - Total unique consensus images: 49


In [46]:
# Match the consensus images with the prepared data
matched_consensus = []
for img, _ in unique_consensus_images:
    for data_img, _ in data_ready:
        if img == data_img:
            matched_consensus.append((data_img))

train_data = [(img, label) for img, label in data_ready if img not in matched_consensus]
test_data = unique_consensus_images

logger.info(f"Total training data: {len(train_data)}")
logger.info(f"Total testing data: {len(test_data)}")

2025-12-10 17:59:41,273 - INFO - Total training data: 241
2025-12-10 17:59:41,274 - INFO - Total testing data: 49


In [47]:
# Write both splits next to the raw data as two-column CSV files (image, label).
for split_rows, csv_name in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    split_frame = pd.DataFrame(split_rows, columns=['image', 'label'])
    split_frame.to_csv(os.path.join(data_folder, csv_name), index=False)

In [48]:
# Determine the most frequent label over the combined train and test pairs.
all_data = train_data + test_data
labels = [pair_label for _pair_image, pair_label in all_data]

unique_labels, counts = np.unique(labels, return_counts=True)
# argmax returns the first maximum, so ties go to the first label in sorted order.
majority_class = unique_labels[counts.argmax()]
logger.info(f"Majority class: {majority_class}")

# Baseline: Always predict the majority class
def baseline_predict(data):
    """Return one `majority_class` prediction per item in `data`."""
    prediction_count = len(data)
    return [majority_class for _ in range(prediction_count)]

# Evaluate the majority-class baseline on all labelled data (train + test).
true_labels = labels
predicted_labels = baseline_predict(all_data)
accuracy = np.mean([true == pred for true, pred in zip(true_labels, predicted_labels)])
# The baseline never predicts the minority classes, so their precision is
# undefined. zero_division=0 reports those as 0 (the same value the default
# produces) without emitting an UndefinedMetricWarning per call.
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)

logger.info(f"Baseline accuracy: {accuracy * 100:.2f}%")
logger.info(f"Baseline precision: {precision * 100:.2f}%")
logger.info(f"Baseline recall: {recall * 100:.2f}%")
logger.info(f"Baseline F1-score: {f1 * 100:.2f}%")

# For detailed per-class metrics
logger.info(f"Detailed Classification Report: \n{classification_report(true_labels, predicted_labels, zero_division=0)}")


2025-12-10 17:59:41,303 - INFO - Majority class: 2_Neutralis
2025-12-10 17:59:41,319 - INFO - Baseline accuracy: 46.55%
2025-12-10 17:59:41,320 - INFO - Baseline precision: 21.67%
2025-12-10 17:59:41,320 - INFO - Baseline recall: 46.55%
2025-12-10 17:59:41,321 - INFO - Baseline F1-score: 29.57%
2025-12-10 17:59:41,330 - INFO - Detailed Classification Report: 
              precision    recall  f1-score   support

  1_Pronacio       0.00      0.00      0.00       119
 2_Neutralis       0.47      1.00      0.64       135
3_Szupinacio       0.00      0.00      0.00        36

    accuracy                           0.47       290
   macro avg       0.16      0.33      0.21       290
weighted avg       0.22      0.47      0.30       290



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
