In [1]:
# Loading MNIST
import numpy as np
from tensorflow.keras.datasets import mnist
from tqdm import tqdm

# Download (if needed) and load MNIST as two (images, labels) pairs.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Report the raw shapes, one per line: train images, train labels,
# test images, test labels.
print(*(arr.shape for arr in (x_train, y_train, x_test, y_test)), sep="\n")

# Flatten every 28x28 image into one row of 784 pixel values.
x_train, x_test = (
    images.reshape(-1, 28 * 28) for images in (x_train, x_test)
)
print(x_train.shape)

ModuleNotFoundError: No module named 'tensorflow'

In [6]:
class Dataset:
    """Pair a collection of samples with a parallel collection of labels.

    Both collections are stored exactly as given: nothing is copied and
    nothing is validated. Indexing forwards the same key to each of them.
    """

    def __init__(self, data, labels):
        # Keep references to the caller's objects.
        self.labels = labels
        self.data = data

    def __len__(self):
        """Return the number of samples, as reported by ``data`` alone."""
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` tuple found at ``idx``."""
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label

def create_dataset(x_data, y_data):
    """Wrap samples ``x_data`` and labels ``y_data`` in a ``Dataset``.

    Returns:
        A new ``Dataset`` holding the two arguments as its data and labels.
    """
    wrapped = Dataset(x_data, y_data)
    return wrapped


def pre_process_data(dataset, normalize=True, standardize=True, batch_size=32,
                     mean=None, std=None):
    """Collect a dataset into dense arrays, optionally rescaled and standardized.

    Args:
        dataset: Object supporting ``len()`` and integer indexing, where
            ``dataset[i]`` yields a sequence whose first item is the sample
            and whose second item is its label.
        normalize: If True, divide every sample value by 255.0.
        standardize: If True, subtract a per-feature mean and divide by a
            per-feature standard deviation (plus 1e-8) after all batches
            have been collected.
        batch_size: Number of samples gathered per loop iteration. It only
            affects how the work is chunked, not the result.
        mean: Optional per-feature mean to use instead of the mean of this
            dataset. Pass the training-split mean when processing a
            validation/test split so every split gets the same transform.
        std: Optional per-feature standard deviation, same idea as ``mean``.

    Returns:
        ``(data, labels)``: a float32 array of shape
        ``(len(dataset), n_features)`` and an int64 array of shape
        ``(len(dataset),)``. Samples with more than one dimension are
        flattened. An empty dataset yields arrays of shape ``(0, 0)`` and
        ``(0,)``, because the feature count cannot be inferred.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would give an empty range and silently return all-zero
    # arrays, so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    dataset_size = len(dataset)
    if dataset_size == 0:
        # No first sample to inspect, so the feature count is unknown.
        return np.zeros((0, 0), dtype=np.float32), np.zeros(0, dtype=np.int64)

    # Use the total element count so that N-d samples (e.g. unflattened
    # images) are supported; for 1-D samples this equals shape[0].
    n_features = np.asarray(dataset[0][0]).size

    data = np.zeros((dataset_size, n_features), dtype=np.float32)
    labels = np.zeros(dataset_size, dtype=np.int64)

    # Process in batches
    for start_idx in tqdm(range(0, dataset_size, batch_size), desc="Processing batches"):
        end_idx = min(start_idx + batch_size, dataset_size)

        # Fetch each item once and split it into sample / label afterwards.
        items = [dataset[i] for i in range(start_idx, end_idx)]
        batch_data = np.array([item[0] for item in items]).reshape(len(items), -1)
        batch_labels = np.array([item[1] for item in items])

        if normalize:
            batch_data = batch_data / 255.0

        data[start_idx:end_idx] = batch_data
        labels[start_idx:end_idx] = batch_labels

    # Standardize after processing all batches
    if standardize:
        data_mean = np.mean(data, axis=0) if mean is None else np.asarray(mean, dtype=np.float32)
        data_std = np.std(data, axis=0) if std is None else np.asarray(std, dtype=np.float32)
        # Zero-variance features get a unit scale. With statistics computed
        # from this same data the numerator is 0 there, so the result is
        # unchanged; with externally supplied statistics it avoids dividing
        # a non-zero value by the 1e-8 floor.
        scale = np.where(data_std == 0, np.float32(1.0), data_std + 1e-8)
        # astype is a no-op on the default path and keeps float32 output when
        # scalar statistics would otherwise promote the result.
        data = ((data - data_mean) / scale).astype(np.float32, copy=False)

    return data, labels

# Create datasets
train_dataset = create_dataset(x_train, y_train)
test_dataset = create_dataset(x_test, y_test)

# Preprocess: only rescale to [0, 1] here. Standardization is applied below so
# that both splits go through exactly the same transform.
train_data, train_labels = pre_process_data(train_dataset, standardize=False)
test_data, test_labels = pre_process_data(test_dataset, standardize=False)

# Standardize with statistics estimated on the training split only. Using the
# test split's own mean/std would leak test information into preprocessing and
# map the two splits onto different scales.
feature_mean = np.mean(train_data, axis=0)
feature_std = np.std(train_data, axis=0)

# Pixels that never vary in the training split get a unit scale; dividing by
# the 1e-8 floor would explode any test sample that is non-zero there.
feature_scale = feature_std + 1e-8
feature_scale[feature_std == 0] = 1.0

train_data = (train_data - feature_mean) / feature_scale
test_data = (test_data - feature_mean) / feature_scale

print(f"Processed train data shape: {train_data.shape}")
print(f"Processed train labels shape: {train_labels.shape}")
print(f"Processed test data shape: {test_data.shape}")
print(f"Processed test labels shape: {test_labels.shape}")

Processing batches: 100%|██████████| 1875/1875 [00:00<00:00, 17560.91it/s]
Processing batches: 100%|██████████| 313/313 [00:00<00:00, 23283.51it/s]

Processed train data shape: (60000, 784)
Processed train labels shape: (60000,)
Processed test data shape: (10000, 784)
Processed test labels shape: (10000,)



