In [None]:
import numpy as np
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import pickle

class DataLoader:
    """Load a digit-image dataset from disk and prepare it for training.

    The dataset is read from ``data_folder``, which is expected to contain
    one sub-folder per digit (``0`` .. ``9``) holding image files. Each
    image is converted to grayscale, resized to ``IMAGE_SIZE`` and flattened
    into one row of ``self.X``; the digit of the folder it came from becomes
    the matching entry of ``self.y``.

    Typical call order: ``load_images_from_folder()`` -> ``normalize()`` ->
    ``split()`` -> ``save_preprocessed_data()``.
    """

    # Target (width, height) every image is resized to before flattening.
    IMAGE_SIZE = (28, 28)
    # File extensions treated as images; matched case-insensitively.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp')

    def __init__(self, data_folder='10000'):
        """Remember the dataset root; no files are touched until loading."""
        self.data_folder = data_folder
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    @classmethod
    def _list_image_files(cls, folder):
        """Return the image file names in ``folder`` in sorted order.

        ``os.listdir`` returns entries in arbitrary order, so the names are
        sorted to keep the sample order (and therefore the seeded train/test
        split) reproducible across machines. Extensions are compared in
        lower case so files such as ``IMG.PNG`` are not silently skipped.
        """
        return sorted(
            name for name in os.listdir(folder)
            if name.lower().endswith(cls.IMAGE_EXTENSIONS)
        )

    def _load_image(self, img_path):
        """Return one image as a flat grayscale array of ``IMAGE_SIZE`` pixels."""
        # The context manager releases the underlying file handle even if
        # decoding fails part-way through.
        with Image.open(img_path) as img:
            gray = img.convert('L').resize(self.IMAGE_SIZE)
            return np.array(gray).flatten()

    def load_images_from_folder(self):
        """Read every image under the digit folders into ``self.X``/``self.y``.

        Missing digit folders are reported and skipped. Files that cannot be
        read as images are reported and skipped as well, so one corrupt file
        does not abort the whole load.

        Returns:
            Tuple ``(X, y)``: the flattened images and their digit labels.
        """
        print("Loading images from folder structure...")

        images = []
        labels = []

        # Loop through each digit folder (0-9)
        for digit in range(10):
            digit_folder = os.path.join(self.data_folder, str(digit))

            # isdir (not exists): a plain file named like a digit would
            # otherwise make os.listdir raise.
            if not os.path.isdir(digit_folder):
                print(f"Warning: Folder {digit_folder} not found!")
                continue

            image_files = self._list_image_files(digit_folder)

            print(f"Loading digit {digit}: {len(image_files)} images")

            for img_file in image_files:
                img_path = os.path.join(digit_folder, img_file)

                try:
                    img_array = self._load_image(img_path)
                except Exception as e:
                    # Deliberately broad: loading is best-effort per file and
                    # the failure is reported rather than swallowed.
                    print(f"Error loading {img_path}: {e}")
                    continue

                images.append(img_array)
                labels.append(digit)

        self.X = np.array(images)
        self.y = np.array(labels)

        width, height = self.IMAGE_SIZE
        print("\nDataset loaded successfully!")
        print(f"Total samples: {len(self.X)}")
        print(f"Image shape: {width}x{height} ({width * height} features)")
        print(f"Classes: {np.unique(self.y)}")

        # Print class distribution
        print("\nClass distribution:")
        for digit in range(10):
            count = np.sum(self.y == digit)
            print(f"  Digit {digit}: {count} samples")

        return self.X, self.y

    def normalize(self, invert=True):
        """Scale ``self.X`` to float32 in [0, 1] and optionally invert it.

        The result replaces ``self.X`` in place. The method is not
        idempotent: calling it twice rescales (and re-inverts) the data.

        Args:
            invert: When true (the default, matching the previous behaviour)
                pixel values are flipped (``1 - x``), turning dark-on-light
                images into light-on-dark ones.

        Raises:
            RuntimeError: If no images have been loaded yet.
        """
        if self.X is None:
            raise RuntimeError(
                "No images loaded; call load_images_from_folder() first."
            )

        # 1. Normalize to [0, 1]
        X_scaled = self.X.astype('float32') / 255.0

        # 2. Invert (black on white -> white on black) unless disabled
        if invert:
            X_scaled = 1.0 - X_scaled

        self.X = X_scaled
        print(f"Normalized: range [{np.min(self.X):.3f}, {np.max(self.X):.3f}]")

    def split(self, test_size=0.2, random_state=42):
        """Split ``self.X``/``self.y`` into stratified train and test sets.

        This method only splits; call ``normalize()`` beforehand if scaled
        pixel values are wanted.

        Args:
            test_size: Share of the samples placed in the test set.
            random_state: Seed passed to ``train_test_split``.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.

        Raises:
            RuntimeError: If no images have been loaded yet.
        """
        if self.X is None or self.y is None:
            raise RuntimeError(
                "No images loaded; call load_images_from_folder() first."
            )

        print("\nPreprocessing data...")

        # stratify keeps the per-digit proportions equal in both sets.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y,
            test_size=test_size,
            random_state=random_state,
            stratify=self.y
        )

        print(f"Training set: {self.X_train.shape[0]} samples")
        print(f"Testing set: {self.X_test.shape[0]} samples")
        print(f"Feature shape: {self.X_train.shape[1]} features")

        return self.X_train, self.X_test, self.y_train, self.y_test

    def save_preprocessed_data(self, output_file='preprocessed_data.pkl'):
        """Pickle the train/test arrays to ``output_file`` for reuse.

        Raises:
            RuntimeError: If the train/test split has not been created yet;
                writing a file full of ``None`` would only fail later when
                it is loaded back.
        """
        data = {
            'X_train': self.X_train,
            'X_test': self.X_test,
            'y_train': self.y_train,
            'y_test': self.y_test
        }

        if any(value is None for value in data.values()):
            raise RuntimeError(
                "No train/test split available; call split() first."
            )

        with open(output_file, 'wb') as f:
            pickle.dump(data, f)

        print(f"\nPreprocessed data saved to {output_file}")

    def load_preprocessed_data(self, input_file='preprocessed_data.pkl'):
        """Restore the train/test arrays written by ``save_preprocessed_data``.

        Only the four split arrays are restored; ``self.X`` and ``self.y``
        are left untouched.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        print(f"Loading preprocessed data from {input_file}...")

        # SECURITY: unpickling can execute arbitrary code. Only load files
        # produced by save_preprocessed_data() from a trusted location.
        with open(input_file, 'rb') as f:
            data = pickle.load(f)

        self.X_train = data['X_train']
        self.X_test = data['X_test']
        self.y_train = data['y_train']
        self.y_test = data['y_test']

        print("Preprocessed data loaded successfully!")
        print(f"Training set: {self.X_train.shape[0]} samples")
        print(f"Testing set: {self.X_test.shape[0]} samples")

        return self.X_train, self.X_test, self.y_train, self.y_test


def print_data_statistics(X_train, y_train, X_test, y_test):
    """
    Print a summary of a train/test split to stdout.

    Reports the sample counts, the number of features per sample, the
    number of distinct training labels, the per-label counts of both sets,
    and the mean / standard deviation of the training values.
    """
    divider = "-" * 40

    def report_lines():
        # Produced lazily so every value is computed right before its line
        # is printed, one line at a time.
        yield "\nDATASET STATISTICS"
        yield divider
        n_train = len(X_train)
        n_test = len(X_test)
        yield f"Total Samples: {n_train + n_test}"
        yield f"Training Samples: {n_train}"
        yield f"Test Samples: {n_test}"
        yield f"Features: {X_train.shape[1]}"
        yield f"Number of Classes: {np.unique(y_train).size}"
        yield f"\nTraining Set Distribution: {np.bincount(y_train)}"
        yield f"Test Set Distribution: {np.bincount(y_test)}"
        yield f"\nPixel Value Mean: {np.mean(X_train):.4f}"
        yield f"Pixel Value Std: {np.std(X_train):.4f}"
        yield divider

    for line in report_lines():
        print(line)

def main():
    """
    Run the full pipeline: load images, normalize, split, save, report.
    """
    loader = DataLoader(data_folder='10000')

    # Read the images, then scale the pixel values before splitting.
    loader.X, loader.y = loader.load_images_from_folder()
    loader.normalize()

    # 80/20 train/test split
    splits = loader.split(test_size=0.2)
    X_train, X_test, y_train, y_test = splits

    # Persist the split so later runs can skip image loading.
    loader.save_preprocessed_data('preprocessed_data.pkl')

    banner = "=" * 60
    closing_message = [
        "",
        banner,
        "Data preprocessing completed!",
        banner,
        "",
        "You can now use this data to train your models.",
        "The preprocessed data has been saved to 'preprocessed_data.pkl'",
    ]
    print("\n".join(closing_message))
    print_data_statistics(X_train, y_train, X_test, y_test)


# Run the preprocessing pipeline only when executed as a script.
if __name__ == "__main__":
    main()

Loading images from folder structure...
Loading digit 0: 1000 images
Loading digit 1: 1000 images
Loading digit 2: 1000 images
Loading digit 3: 1000 images
Loading digit 4: 1000 images
Loading digit 5: 1000 images
Loading digit 6: 1000 images
Loading digit 7: 1000 images
Loading digit 8: 1000 images
Loading digit 9: 1000 images

Dataset loaded successfully!
Total samples: 10000
Image shape: 28x28 (784 features)
Classes: [0 1 2 3 4 5 6 7 8 9]

Class distribution:
  Digit 0: 1000 samples
  Digit 1: 1000 samples
  Digit 2: 1000 samples
  Digit 3: 1000 samples
  Digit 4: 1000 samples
  Digit 5: 1000 samples
  Digit 6: 1000 samples
  Digit 7: 1000 samples
  Digit 8: 1000 samples
  Digit 9: 1000 samples
Normalized: range [0.000, 1.000]

Preprocessing data...
Training set: 8000 samples
Testing set: 2000 samples
Feature shape: 784 features

Preprocessed data saved to preprocessed_data.pkl

Data preprocessing completed!

You can now use this data to train your models.
The preprocessed data has 