Day 4: NumPy for ML Workflow

     1. Data Cleaning with NumPy

In [None]:
import numpy as np

# Toy 1-D sample; np.nan marks the missing entries.
data = np.array([1, 2, np.nan, 4, 5, np.nan, 7])

# Boolean mask that is True exactly where a value is missing.
missing = np.isnan(data)
print(missing)

# Mean imputation: np.nanmean averages only the non-NaN entries, and
# np.putmask writes that value in place into every masked slot.
fill_value = np.nanmean(data)
np.putmask(data, missing, fill_value)
print("Cleaned:", data)


    2. Feature Scaling (Normalization & Standardization)

In [None]:
# Four samples, two columns whose magnitudes differ by several orders.
X = np.array([[20, 20000],
              [30, 50000],
              [40, 80000],
              [50, 100000]])

# Min-max normalization: rescale every column linearly onto [0, 1].
col_min = X.min(axis=0)
col_max = X.max(axis=0)
X_normalized = (X - col_min) / (col_max - col_min)
print("Min-max normalized:\n", X_normalized)

# Z-score standardization: subtract the column mean and divide by the
# column standard deviation (ndarray.std defaults to ddof=0), so each
# column ends up with mean 0 and std 1.
mean = X.mean(axis=0)
std = X.std(axis=0)

X_standardized = (X - mean) / std
print(X_standardized)

    3. One-Hot Encoding with NumPy

In [None]:
categories = np.array(["red", "blue", "green", "red", "blue"])

# np.unique returns the sorted distinct labels and, for every sample,
# the position of its label inside that sorted list.
unique, encoded = np.unique(categories, return_inverse=True)

# Compare each sample's label index against every class index; the
# broadcast (n_samples, n_classes) boolean grid, cast to float, is the
# one-hot matrix.
class_ids = np.arange(len(unique))
one_hot = (encoded[:, np.newaxis] == class_ids).astype(float)
print(one_hot)


    4. Train-Test Split

In [None]:
# 50 samples, 2 features; row k starts out as the pair (2k, 2k + 1).
data = np.arange(100).reshape(50, 2)

# Use a locally seeded Generator instead of the unseeded global
# np.random state, so re-running this cell (or Restart & Run All)
# always produces the same split.
rng = np.random.default_rng(42)
rng.shuffle(data)  # in place; permutes whole rows along axis 0

# First 80% of the shuffled rows form the training set, the rest the test set.
train_size = int(0.8 * len(data))
train, test = data[:train_size], data[train_size:]

print("Train shape:", train.shape)
print("Test shape:", test.shape)

    5. Mini-batch Sampling

In [None]:
# 50 samples, 2 features.
X = np.arange(100).reshape(50, 2)

# Locally seeded Generator: the sampled batch is identical on every run,
# unlike the unseeded global np.random.choice.
rng = np.random.default_rng(42)

# Draw 8 distinct row indices (replace=False means no sample appears twice).
batch = rng.choice(len(X), size=8, replace=False)
print("Mini-batch:\n", X[batch])

    Phase 4 Exercises:

1. Create an array with NaN values and replace them with the column mean.

2. Standardize a dataset of shape (6,3) (random numbers).

3. Convert the categories ["apple","banana","apple","orange"] into one-hot encoded form.

4. Shuffle and split a dataset of shape (100,5) into 70% train and 30% test.

5. From a dataset of shape (200,4), randomly select a mini-batch of 16 samples.

     Solution:

In [None]:
import numpy as np

# 1. Create an array with NaN values and replace them with column mean
data = np.array([[1, 2, np.nan],
                 [4, np.nan, 6],
                 [7, 8, 9]], dtype=float)

# Per-column mean computed over the non-NaN entries only.
col_mean = np.nanmean(data, axis=0)
# (row_indices, col_indices) of every missing cell.
inds = np.nonzero(np.isnan(data))
# Each missing cell receives the mean of the column it sits in.
data[inds] = col_mean[inds[1]]
print("Array after replacing NaN:\n", data)


# 2. Standardize a dataset of shape (6,3) (random numbers)
# Locally seeded Generator so the "random" dataset is the same on every run.
rng = np.random.default_rng(42)
# Integers drawn from [1, 20) -- the same range as randint(1, 20) -- cast to float.
dataset = rng.integers(1, 20, size=(6, 3)).astype(float)
mean = dataset.mean(axis=0)
std = dataset.std(axis=0)
# A constant column has std 0; divide by 1 there so the column becomes
# all zeros instead of NaN.
safe_std = np.where(std == 0, 1.0, std)
standardized = (dataset - mean) / safe_std
print("\nStandardized dataset:\n", standardized)


# 3. One-hot encode categories
categories = np.array(["apple","banana","apple","orange"])

# return_inverse gives, for every sample, the index of its label in the
# sorted `unique` array -- no per-sample Python loop or list lookup needed.
unique, encoded = np.unique(categories, return_inverse=True)

# Row i of the identity matrix is the one-hot vector for class i, so
# indexing it with the class indices builds the whole encoding at once.
one_hot = np.eye(len(unique))[encoded]

print("\nOne-hot encoded:\n", one_hot)


# 4. Shuffle and split dataset (100,5) → 70% train, 30% test
# Locally seeded Generator so both the data and the shuffle order are
# reproducible across runs.
rng = np.random.default_rng(42)
dataset2 = rng.random((100, 5))   # uniform floats in [0, 1)
rng.shuffle(dataset2)             # in place; permutes rows along axis 0

# First 70% of the shuffled rows go to train, the remainder to test.
split = int(0.7 * len(dataset2))
train, test = dataset2[:split], dataset2[split:]
print("\nTrain shape:", train.shape)
print("Test shape:", test.shape)


# 5. Random mini-batch of 16 samples from dataset (200,4)
# Locally seeded Generator so the dataset and the sampled batch are
# reproducible across runs.
rng = np.random.default_rng(42)
dataset3 = rng.random((200, 4))   # uniform floats in [0, 1)

# Sample 16 distinct row indices; the population size comes from the
# array itself rather than a repeated literal.
indices = rng.choice(len(dataset3), size=16, replace=False)
mini_batch = dataset3[indices]
print("\nMini-batch (16 samples):\n", mini_batch)
