<a href="https://colab.research.google.com/github/Yadanar025/Vietnamese-Sign-Language-Translation-additionalfiles/blob/main/Sign%20Language%20Translator/File_Checking_NPY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


import numpy as np
import matplotlib.pyplot as plt

def locate_zero_padding(values):
    """Locate the first zero in ``values`` and tell whether zeros only trail.

    The input is flattened first, so the returned index refers to the flat
    element order.

    Args:
        values: Array-like of numbers.

    Returns:
        A ``(first_zero, is_trailing_padding)`` tuple. ``first_zero`` is the
        flat index of the first element equal to zero, or ``None`` when no
        element is zero. ``is_trailing_padding`` is ``True`` only when a zero
        exists and every element from ``first_zero`` to the end is zero.
    """
    flat = np.asarray(values).ravel()
    zero_positions = np.flatnonzero(flat == 0)
    if zero_positions.size == 0:
        # No zeros at all: there is no index to report and nothing to trim.
        return None, False
    first_zero = int(zero_positions[0])
    return first_zero, bool(np.all(flat[first_zero:] == 0))


# The guard is true when this runs as a notebook cell or a script, so every
# name assigned below is still a module-level variable; it only keeps the
# helper above importable without touching the disk or opening a figure.
if __name__ == "__main__":
    # Load the data
    file_path = 'Vietnamese-Sign-Language-Translation/Sign Language Translator/Data/toi thay nho ban/18/2.npy'
    data = np.load(file_path)

    # Work on a flat view so counts and indices are per value even if the
    # array is not 1-D (len() would only report the first dimension).
    flat_data = data.ravel()
    total_values = flat_data.size
    zero_mask = flat_data == 0
    nonzero_mask = ~zero_mask
    zero_count = int(np.count_nonzero(zero_mask))
    # Avoid dividing by zero for an empty array.
    zero_percent = zero_count / total_values * 100 if total_values else 0.0

    print("=== DETAILED ANALYSIS ===")
    print(f"Total values: {total_values}")
    print(f"Non-zero values: {np.count_nonzero(flat_data)}")
    print(f"Zero values: {zero_count}")
    print(f"Percentage zeros: {zero_percent:.1f}%")

    # Indices of every zero value (name kept from the original cell).
    first_zero_index = np.flatnonzero(zero_mask)
    first_zero, zeros_are_padding = locate_zero_padding(flat_data)

    if first_zero is None:
        # Previously this case crashed with IndexError on first_zero_index[0].
        print("No zero values found")
        useful_data = flat_data
    else:
        print(f"First zero at index: {first_zero}")
        # Check if zeros are only at the end
        if zeros_are_padding:
            print("All zeros are at the end (padding)")
            useful_data = flat_data[:first_zero]
            print(f"Useful data length: {len(useful_data)}")
        else:
            print("Zeros are scattered throughout")
            useful_data = flat_data

    # Plot the data
    plt.figure(figsize=(12, 4))

    # Plot 1: All values, with a dashed marker at the first zero
    # (or at the end of the data when there is no zero).
    plt.subplot(1, 3, 1)
    plt.plot(flat_data, 'b-', linewidth=1)
    plt.axvline(x=first_zero if first_zero is not None else total_values,
                color='r', linestyle='--', alpha=0.5)
    plt.title(f'All {total_values} values')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.grid(True, alpha=0.3)

    # Plot 2: Non-zero values only (left empty when every value is zero)
    plt.subplot(1, 3, 2)
    non_zero_indices = np.flatnonzero(nonzero_mask)
    if non_zero_indices.size > 0:
        plt.plot(non_zero_indices, flat_data[nonzero_mask], 'g.', markersize=3)
        plt.title(f'{len(non_zero_indices)} non-zero values')
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.grid(True, alpha=0.3)

    # Plot 3: Histogram of values
    plt.subplot(1, 3, 3)
    plt.hist(flat_data[nonzero_mask], bins=50, alpha=0.7, edgecolor='black')
    plt.title('Distribution (excluding zeros)')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


import os
import numpy as np
from pathlib import Path

import random


def count_by_phrase(npy_files, root):
    """Count files per phrase folder, i.e. the first folder below ``root``.

    The sample path used elsewhere in this notebook has the layout
    ``Data/<phrase>/<sequence>/<n>.npy``, so the immediate parent of a file
    is the sequence folder, not the phrase; the phrase is the first path
    component relative to ``root``.

    Args:
        npy_files: Iterable of file paths (``str`` or ``Path``) under ``root``.
        root: Dataset root folder (``str`` or ``Path``).

    Returns:
        Dict mapping phrase folder name to the number of files below it.
        Files lying directly in ``root`` are counted under ``root``'s own name.

    Raises:
        ValueError: If a file is not located under ``root``.
    """
    root = Path(root)
    counts = {}
    for npy_file in npy_files:
        parts = Path(npy_file).relative_to(root).parts
        phrase = parts[0] if len(parts) > 1 else root.name
        counts[phrase] = counts.get(phrase, 0) + 1
    return counts


def inspect_files(files):
    """Load each file and tally all-zero, non-zero and unreadable ones.

    Args:
        files: Iterable of paths to ``.npy`` files.

    Returns:
        A ``(valid, zero, shapes, unreadable)`` tuple: the number of files
        with at least one non-zero value, the number of all-zero files, a
        dict mapping array shape to its file count, and a list of
        ``(path, exception)`` pairs for files that could not be loaded.
    """
    valid = 0
    zero = 0
    shapes = {}
    unreadable = []
    for path in files:
        try:
            arr = np.load(path)
        except Exception as exc:  # best-effort scan: record the failure and move on
            unreadable.append((path, exc))
            continue
        shapes[arr.shape] = shapes.get(arr.shape, 0) + 1
        if np.all(arr == 0):
            zero += 1
        else:
            valid += 1
    return valid, zero, shapes, unreadable


def find_good_file(files, expected_shape=(126,), limit=100):
    """Return the first loadable file with the expected shape and real data.

    Args:
        files: Sequence of paths to ``.npy`` files.
        expected_shape: Shape the array must have exactly.
        limit: Only the first ``limit`` entries of ``files`` are checked.

    Returns:
        ``(path, array)`` for the first match, or ``None`` when none of the
        checked files qualifies. Files that fail to load are skipped.
    """
    for path in files[:limit]:
        try:
            arr = np.load(path)
        except Exception:  # unreadable files simply cannot be the "good" one
            continue
        if arr.shape == expected_shape and np.any(arr != 0):
            return path, arr
    return None


# The guard is true when this runs as a notebook cell or a script, so the
# names assigned below remain module-level variables.
if __name__ == "__main__":
    # Quick analysis - just tell me what you have
    # NOTE(review): this path is absolute, while the single-file check in this
    # notebook loads from a relative path - confirm which location is right.
    base_path = Path('/Vietnamese-Sign-Language-Translation/Sign Language Translator/Data')

    print("🔍 QUICK DATASET ANALYSIS")
    print("=" * 40)

    if not base_path.is_dir():
        # rglob() on a missing folder finds nothing; say so explicitly.
        print(f"WARNING: dataset folder not found: {base_path}")

    # Count all .npy files
    all_npy = list(base_path.rglob('*.npy'))
    print(f"Total .npy files: {len(all_npy)}")

    # Count by phrase
    print("\n📁 By phrase folder:")
    phrase_counts = count_by_phrase(all_npy, base_path)
    for phrase, count in sorted(phrase_counts.items()):
        print(f"  {phrase}: {count} files")

    # Check data quality
    print("\n📊 Data quality check (sampling 20 random files):")
    sample_files = random.sample(all_npy, min(20, len(all_npy)))
    valid_files, zero_files, shapes, unreadable_files = inspect_files(sample_files)

    print(f"  Valid files (non-zero): {valid_files}")
    print(f"  Zero files: {zero_files}")
    print(f"  Common shapes: {shapes}")
    if unreadable_files:
        print(f"  Unreadable files: {len(unreadable_files)}")
        for bad_path, error in unreadable_files:
            print(f"    {bad_path}: {error}")

    # Find ONE good file to examine
    print("\n🔎 Looking for one good file to examine...")
    good = find_good_file(all_npy)  # checks the first 100 files only
    if good is None:
        print("   No suitable file found among the first 100 files")
    else:
        file, data = good
        print(f"✅ Found: {file.relative_to(base_path)}")
        print(f"   Non-zero values: {np.count_nonzero(data)}")
        print(f"   First 3 values: {data[:3]}")
