In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session (this also hides library
# deprecation and convergence warnings raised by later cells).
warnings.filterwarnings("ignore")

In [2]:
from sklearn.datasets import fetch_openml

# Fetch the MNIST dataset from OpenML.
# as_frame=True pins the return type to pandas objects: later cells rely on
# DataFrame/Series features such as .index, .loc and .to_numpy().
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
x = mnist.data
y = mnist.target

x.shape, y.shape

((70000, 784), (70000,))

In [3]:
# Positional split: the first 60000 rows are the training set, the rest the test set.
x_train = x.iloc[:60000]
x_test = x.iloc[60000:]
y_train = y.iloc[:60000]
y_test = y.iloc[60000:]

In [4]:
x_train.shape

(60000, 784)

In [5]:
x_test.shape

(10000, 784)

In [6]:
%%time
from sklearn.neighbors import KNeighborsClassifier
# 1-nearest-neighbour classifier fitted on the full training split.
# The same `nn1` object is refit on every candidate subset in the cells below.
nn1 = KNeighborsClassifier(n_neighbors = 1)
nn1.fit(x_train, y_train)

CPU times: total: 141 ms
Wall time: 274 ms


In [7]:
from sklearn.metrics import accuracy_score

In [10]:
%%time
# Test-set accuracy of `nn1` as currently fitted (the baseline when it was last
# fitted on the full training split).
accuracy_score(nn1.predict(x_test), y_test)

CPU times: total: 46.4 s
Wall time: 7.46 s


0.9691

In [11]:
#Baseline, taking all 60000 datapoints as training

# Randomly selecting

## 10000

In [45]:
%%time
# Ten independent repetitions: draw 10000 training rows at random, refit 1-NN,
# and record the test accuracy of each run.
accuracies = []
for i in range(10):
    sampled_indices = x_train.index.to_series().sample(n=10000)
    x_train_10000, y_train_10000 = x_train.loc[sampled_indices], y_train.loc[sampled_indices]
    nn1.fit(x_train_10000, y_train_10000)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
# Spread first, then mean (same print order as before).
print(np.std(accuracies))
print(np.mean(accuracies))

0.0017239489551607679
0.9494
CPU times: total: 1min 19s
Wall time: 14.1 s


In [46]:
accuracies

[0.9511, 0.9498, 0.952, 0.9493, 0.9481, 0.9506, 0.9463, 0.9488, 0.9508, 0.9472]

## 5000

In [47]:
%%time
# Ten independent repetitions with 5000 randomly drawn training rows.
accuracies = []
for i in range(10):
    sampled_indices = x_train.index.to_series().sample(n=5000)
    x_train_5000 = x_train.loc[sampled_indices]
    y_train_5000 = y_train.loc[sampled_indices]
    nn1.fit(x_train_5000, y_train_5000)
    # Evaluate exactly once per run: a second predict over the test set would
    # only repeat the same work.
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.array(accuracies).std())
print(np.array(accuracies).mean())

0.0017333493589002684
0.93545
CPU times: total: 1min 21s
Wall time: 16.4 s


## 1000

In [14]:
%%time
# NOTE(review): this cell sits under the "## 1000" heading but refits on the
# 10000-row sample left in `x_train_10000` / `y_train_10000` by the "## 10000"
# loop. Presumably a leftover from an earlier session; confirm before relying
# on the number it displays.
nn1.fit(x_train_10000, y_train_10000)
accuracy_score(nn1.predict(x_test), y_test)

CPU times: total: 7.8 s
Wall time: 1.34 s


0.9479

In [48]:
%%time
# Ten independent repetitions with 1000 randomly drawn training rows.
accuracies = []
for i in range(10):
    sampled_indices = x_train.index.to_series().sample(n=1000)
    x_train_1000 = x_train.loc[sampled_indices]
    y_train_1000 = y_train.loc[sampled_indices]
    nn1.fit(x_train_1000, y_train_1000)
    # Evaluate exactly once per run: a second predict over the test set would
    # only repeat the same work.
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.array(accuracies).std())
print(np.array(accuracies).mean())

0.003440348819523978
0.8869999999999999
CPU times: total: 20.7 s
Wall time: 6.9 s


## 100

In [49]:
%%time
# Ten independent repetitions with 100 randomly drawn training rows.
accuracies = []
for i in range(10):
    sampled_indices = x_train.index.to_series().sample(n=100)
    x_train_100 = x_train.loc[sampled_indices]
    y_train_100 = y_train.loc[sampled_indices]
    nn1.fit(x_train_100, y_train_100)
    # Evaluate exactly once per run: a second predict over the test set would
    # only repeat the same work.
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.array(accuracies).std())
print(np.array(accuracies).mean())

0.025161679196746806
0.7128300000000001
CPU times: total: 1.89 s
Wall time: 4.16 s


# K-means clustering

The training data is split into 10 clusters, since there are 10 digits. From each cluster I then take the same number of points (a tenth of the target size), keeping the ones closest to the cluster centre.

In [11]:
from sklearn.cluster import KMeans

In [64]:
def clustering(x_train, y_train, n, k=10, random_state=None):
    """Select prototypes as the points nearest to each of ``k`` k-means centroids.

    The training data is partitioned into ``k`` clusters and, from every
    cluster, the ``n // k`` points closest to the cluster centre are kept.
    A cluster holding fewer than ``n // k`` points contributes all of its
    points, so fewer than ``n`` prototypes may be returned.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature matrix, one row per sample. All columns are used as features.
    y_train : pandas.Series
        Labels; attached to ``x_train`` by index alignment.
    n : int
        Total number of prototypes requested.
    k : int, default 10
        Number of k-means clusters (10 matches the number of digit classes).
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans`` so a run can be made reproducible.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The selected feature rows and their labels (Series named ``'labels'``),
        ordered cluster by cluster, nearest point first.
    """
    # Use the real column count rather than assuming 784 pixel columns.
    n_features = x_train.shape[1]

    training = x_train.copy()
    training['labels'] = y_train

    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(x_train)
    labels = kmeans.labels_

    per_cluster = int(n // k)
    features = x_train.to_numpy()

    # Collect row positions and slice the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every iteration.
    chosen = []
    for i in range(k):
        members = np.flatnonzero(labels == i)
        # Euclidean distance of each cluster member to its centroid.
        distances = np.linalg.norm(features[members] - kmeans.cluster_centers_[i], axis=1)
        chosen.append(members[np.argsort(distances)[:per_cluster]])

    prototypes = training.iloc[np.concatenate(chosen)]
    return prototypes.iloc[:, :n_features], prototypes['labels']

### M = 10000

In [68]:
%%time
# Ten repetitions of k-means prototype selection (10 clusters, M = 10000),
# each followed by a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    x_train_c10000, y_train_c10000 = clustering(x_train, y_train, 10000)
    nn1.fit(x_train_c10000, y_train_c10000)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
# Mean first, then spread.
print(np.mean(accuracies))
print(np.std(accuracies))

0.8690900000000001
0.008921261121612787
CPU times: total: 1min 41s
Wall time: 39.2 s


### M = 5000

In [69]:
%%time
# Ten repetitions of k-means prototype selection (10 clusters, M = 5000).
# Variables are named after the actual subset size, like the random-sampling cells.
accuracies = []
for i in range(10):
    x_train_c5000, y_train_c5000 = clustering(x_train, y_train, 5000)
    nn1.fit(x_train_c5000, y_train_c5000)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.array(accuracies).mean())
print(np.array(accuracies).std())

0.82537
0.008432680475388594
CPU times: total: 1min 8s
Wall time: 35.5 s


### M = 1000

In [70]:
%%time
# Ten repetitions of k-means prototype selection (10 clusters, M = 1000).
# Variables are named after the actual subset size, like the random-sampling cells.
accuracies = []
for i in range(10):
    x_train_c1000, y_train_c1000 = clustering(x_train, y_train, 1000)
    nn1.fit(x_train_c1000, y_train_c1000)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.array(accuracies).mean())
print(np.array(accuracies).std())

0.7515
0.0065820969303102585
CPU times: total: 37.6 s
Wall time: 27.7 s


### M = 100

In [56]:
    # NOTE(review): this cell sits under the "### M = 100" heading but requests
    # 10000 prototypes. The value displayed below it (0.9691) equals the
    # full-training-set baseline shown earlier, so the output is presumably
    # stale — confirm or remove.
    x_train_c10000, y_train_c10000 = clustering(x_train, y_train, 10000)
    nn1.fit(x_train_c10000, y_train_c10000)
    (accuracy_score(nn1.predict(x_test), y_test))

0.9691

In [71]:
%%time
# Ten repetitions of k-means prototype selection (10 clusters, M = 100).
# Variables are named after the actual subset size, like the random-sampling cells.
accuracies = []
for i in range(10):
    x_train_c100, y_train_c100 = clustering(x_train, y_train, 100)
    nn1.fit(x_train_c100, y_train_c100)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.array(accuracies).mean())
print(np.array(accuracies).std())

0.64463
0.01562811888872105
CPU times: total: 28 s
Wall time: 26.2 s


## K means clustering 2

Split the training data into M clusters and keep, from each cluster, the single point closest to its centre.

### M = 10000

In [12]:
def clustering2(x_train, y_train, M, random_state=None):
    """Select M prototypes: the training point nearest to each of M k-means centroids.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature matrix, one row per sample. All columns are used as features.
    y_train : pandas.Series
        Labels; attached to ``x_train`` by index alignment.
    M : int
        Number of clusters, and therefore the number of prototypes requested.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans`` so a run can be made reproducible.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        One feature row per non-empty cluster and the matching labels
        (Series named ``'labels'``), in cluster-id order.
    """
    # Use the real column count rather than assuming 784 pixel columns.
    n_features = x_train.shape[1]

    kmeans = KMeans(n_clusters=M, random_state=random_state)
    kmeans.fit(x_train)
    labels = kmeans.labels_

    training = x_train.copy()
    training['labels'] = y_train

    # Group row positions by cluster with one stable sort instead of building
    # a fresh boolean mask over the whole frame for each of the M clusters.
    order = np.argsort(labels, kind='stable')
    bounds = np.searchsorted(labels[order], np.arange(M + 1))

    features = x_train.to_numpy()
    nearest_positions = []
    for i in range(M):
        members = order[bounds[i]:bounds[i + 1]]
        if members.size == 0:
            # An empty cluster has no representative to contribute.
            continue
        # Euclidean distance of each cluster member to its centroid.
        distances = np.linalg.norm(features[members] - kmeans.cluster_centers_[i], axis=1)
        nearest_positions.append(members[np.argmin(distances)])

    # Slice once at the end; concatenating inside the loop would copy the
    # growing result on every iteration.
    prototypes = training.iloc[nearest_positions]
    return prototypes.iloc[:, :n_features], prototypes['labels']

In [13]:
%%time
# Ten repetitions: one prototype per cluster with M = 10000 clusters,
# then a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = clustering2(x_train, y_train, 10000)
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
# Mean first, then spread.
print(np.mean(accuracies))
print(np.std(accuracies))

0.9575400000000001
0.001065082156455538
CPU times: total: 2h 29min 40s
Wall time: 52min 17s


In [3]:
# The 2h 29min 40s of CPU time reported above, expressed in seconds.
(149) * 60 + 40

8980

### M = 5000


In [73]:
# One-off selection run for M = 5000; the loop in the next cell reassigns `a` and `b`.
a, b = clustering2(x_train, y_train, 5000)

In [14]:
%%time
# Ten repetitions: one prototype per cluster with M = 5000 clusters,
# then a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = clustering2(x_train, y_train, 5000)
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
# Mean first, then spread.
print(np.mean(accuracies))
print(np.std(accuracies))

0.95037
0.0012688971589533977
CPU times: total: 1h 10min 11s
Wall time: 23min 20s


### M = 1000

In [75]:
# One-off selection run for M = 1000; the loop in the next cell reassigns `a` and `b`.
a, b = clustering2(x_train, y_train, 1000)

In [15]:
%%time
# Ten repetitions: one prototype per cluster with M = 1000 clusters,
# then a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = clustering2(x_train, y_train, 1000)
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
# Mean first, then spread.
print(np.mean(accuracies))
print(np.std(accuracies))

0.9258200000000001
0.001718604084715267
CPU times: total: 26min 17s
Wall time: 5min 39s


### M = 100

In [77]:
# One-off selection run for M = 100; the loop in the next cell reassigns `a` and `b`.
a, b = clustering2(x_train, y_train, 100)

In [16]:
%%time
# Ten repetitions: one prototype per cluster with M = 100 clusters,
# then a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = clustering2(x_train, y_train, 100)
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
# Mean first, then spread.
print(np.mean(accuracies))
print(np.std(accuracies))

0.84719
0.005584344187100215
CPU times: total: 7min 39s
Wall time: 1min 30s


In [6]:
# The 7min 39s of CPU time reported above, expressed in seconds.
7*60+39

459

# CNN (Condensed Nearest Neighbour)

In [33]:
# from sklearn.neighbors import NearestNeighbors
# def condensed_nearest_neighbor(X, y):

#     # Initialize S with one random instance of each class
#     classes = np.unique(y)
#     indices = [np.random.choice(np.where(y == cls)[0]) for cls in classes]
#     S_X = X[indices]
#     S_y = y[indices]

#     # Create a nearest neighbor model
#     nbrs = NearestNeighbors(n_neighbors=1)

#     # Iterate until no misclassified instances are found
#     misclassified = True
#     while misclassified:
#         misclassified = False
#         nbrs.fit(S_X)

#         for i, (x, label) in enumerate(zip(X, y)):
#             # Find the nearest neighbor in S
#             distances, neighbors = nbrs.kneighbors([x])
#             nearest_neighbor_label = S_y[neighbors[0][0]]

#             # If misclassified, add to S
#             if label != nearest_neighbor_label:
#                 S_X = np.vstack([S_X, x])
#                 S_y = np.append(S_y, label)
#                 misclassified = True

#     return S_X, S_y

In [34]:
# a, b = condensed_nearest_neighbor(x_train.to_numpy(), y_train.to_numpy())

# Active Learning Loop

Initialize:
1. Set M = desired size of condensed set.
2. Randomly select a small subset from the training set and use it to train an initial model.

Active Learning Loop:
While the size of the condensed set is less than M:
    1. Use the model to predict on the entire (or a large subset of the) training set.
    2. Identify the samples where the model's predictions are most uncertain.
    3. Add a certain number of these uncertain samples to the condensed set.
    4. Retrain the model on the updated condensed set.
    5. Repeat until the condensed set reaches the size M or no more uncertain samples are found.

Output:
- The condensed set.


In [96]:
import numpy as np
from sklearn.linear_model import LogisticRegression

def active_learning_prototype_selection(X, y, M, initial_subset_size=50, batch_size=None):
    """Build a condensed training set of size M by uncertainty sampling.

    A logistic-regression model is trained on a small random seed set. While
    the condensed set holds fewer than ``M`` points, the model scores every
    remaining point, the points it is least confident about are added, and
    the model is retrained.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer lists).
    y : numpy.ndarray
        Labels, same length as ``X``.
    M : int
        Desired size of the condensed set; capped at ``len(X)``.
    initial_subset_size : int, default 50
        Size of the random seed set; capped at ``M`` so the result never
        exceeds the requested size. When it equals ``M`` the result is a
        plain random sample and no uncertainty sampling takes place.
    batch_size : int or None, default None
        Number of uncertain points added per round. ``None`` adds all missing
        points in a single round.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Features and labels of the condensed set, in selection order.

    Raises
    ------
    ValueError
        If ``M`` or ``batch_size`` is smaller than 1.
    """
    if M < 1:
        raise ValueError("M must be at least 1")
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    n_samples = len(X)
    M = min(M, n_samples)
    initial_subset_size = min(initial_subset_size, M)

    # Seed the condensed set with a small random subset.
    condensed_indices = np.random.choice(n_samples, size=initial_subset_size, replace=False).tolist()
    # The boolean mask keeps the remaining-pool lookup cheap and in row order.
    is_selected = np.zeros(n_samples, dtype=bool)
    is_selected[condensed_indices] = True

    model = LogisticRegression(max_iter=1000)
    model.fit(X[condensed_indices], y[condensed_indices])

    while len(condensed_indices) < M:
        remaining_indices = np.flatnonzero(~is_selected)
        if remaining_indices.size == 0:
            break

        probas = model.predict_proba(X[remaining_indices])
        # Least-confidence criterion: the lower the top class probability, the
        # more uncertain the prediction. This works for any number of classes;
        # with two classes it ranks points exactly like |p - 0.5|.
        confidence = probas.max(axis=1)

        missing = M - len(condensed_indices)
        n_add = missing if batch_size is None else min(batch_size, missing)
        newly_selected = remaining_indices[np.argsort(confidence)[:n_add]]

        is_selected[newly_selected] = True
        condensed_indices.extend(newly_selected.tolist())

        # Retrain only if another selection round will follow.
        if len(condensed_indices) < M:
            model.fit(X[condensed_indices], y[condensed_indices])

    return X[condensed_indices], y[condensed_indices]

## 10000

In [99]:
%%time
# NOTE(review): initial_subset_size is passed equal to M, so the selection
# function's `while len(condensed_indices) < M` loop never executes and the
# returned set is simply a random sample of size M.
accuracies = []
for i in range(10):
    a, b = active_learning_prototype_selection(
        x_train.to_numpy(),
        y_train.to_numpy(),
        10000,
        10000,
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.9479599999999999
0.0018111874557869396
CPU times: total: 1min 43s
Wall time: 58 s


## 5000

In [100]:
%%time
# NOTE(review): initial_subset_size is passed equal to M, so the selection
# function's `while len(condensed_indices) < M` loop never executes and the
# returned set is simply a random sample of size M.
accuracies = []
for i in range(10):
    a, b = active_learning_prototype_selection(
        x_train.to_numpy(),
        y_train.to_numpy(),
        5000,
        5000,
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.9359599999999999
0.002036271101793681
CPU times: total: 43 s
Wall time: 15.5 s


## 1000

In [101]:
%%time
# NOTE(review): initial_subset_size is passed equal to M, so the selection
# function's `while len(condensed_indices) < M` loop never executes and the
# returned set is simply a random sample of size M.
accuracies = []
for i in range(10):
    a, b = active_learning_prototype_selection(
        x_train.to_numpy(),
        y_train.to_numpy(),
        1000,
        1000,
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.8841699999999999
0.0029475583115521386
CPU times: total: 7.11 s
Wall time: 4.35 s


## 100

In [102]:
%%time
# NOTE(review): initial_subset_size is passed equal to M, so the selection
# function's `while len(condensed_indices) < M` loop never executes and the
# returned set is simply a random sample of size M.
accuracies = []
for i in range(10):
    a, b = active_learning_prototype_selection(
        x_train.to_numpy(),
        y_train.to_numpy(),
        100,
        100,
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.70295
0.01197591332633968
CPU times: total: 875 ms
Wall time: 2.59 s


# K-means revisited

In [117]:
# Fit a 10-cluster k-means on the full training features; the cells below
# inspect which digit labels fall into each cluster.
kmeans = KMeans(n_clusters=10)
kmeans.fit(x_train)

In [103]:
kmeans.labels_

array([9, 7, 5, ..., 9, 0, 8])

In [106]:
# Digit-label breakdown of the training samples assigned to k-means cluster 0.
y_train[kmeans.labels_ == 0].value_counts()

class
6    4591
0     237
2     161
5     140
4     127
3      61
8      57
1      11
9       8
7       2
Name: count, dtype: int64

In [107]:
# Digit-label breakdown of the training samples assigned to k-means cluster 1.
y_train[kmeans.labels_ == 1].value_counts()

class
7    2824
9    1707
4    1536
5     265
8     230
2      61
3      32
0      27
1       7
6       1
Name: count, dtype: int64

In [108]:
# Digit-label breakdown of the training samples assigned to k-means cluster 2.
y_train[kmeans.labels_ == 2].value_counts()

class
9    2473
7    2385
4    1632
5     312
8     214
3     161
2      37
0      14
1       9
6       1
Name: count, dtype: int64

In [109]:
# Digit-label breakdown of the training samples assigned to k-means cluster 3.
y_train[kmeans.labels_ == 3].value_counts()

class
1    3713
3     373
2     323
8     276
6     234
7     223
9     139
5     119
4      98
0       0
Name: count, dtype: int64

In [110]:
# Digit-label breakdown of the training samples assigned to k-means cluster 4.
y_train[kmeans.labels_ == 4].value_counts()

class
2    4192
3     209
6      80
8      43
0      39
7      30
4      14
5      14
1       9
9       5
Name: count, dtype: int64

In [111]:
# Digit-label breakdown of the training samples assigned to k-means cluster 5.
y_train[kmeans.labels_ == 5].value_counts()

class
4    2188
9    1382
6     592
7     546
2     230
5     186
8     134
0      89
3      65
1       4
Name: count, dtype: int64

In [112]:
# Digit-label breakdown of the training samples assigned to k-means cluster 6.
y_train[kmeans.labels_ == 6].value_counts()

class
3    3888
5    1782
8    1372
2     391
0     233
9      85
6      31
1       6
7       3
4       0
Name: count, dtype: int64

In [113]:
# Digit-label breakdown of the training samples assigned to k-means cluster 7.
y_train[kmeans.labels_ == 7].value_counts()

class
0    4676
6      73
5      63
2      55
3      33
8      29
9      29
7      13
4       4
1       0
Name: count, dtype: int64

### 10000

In [114]:
# Digit-label breakdown of the training samples assigned to k-means cluster 8.
y_train[kmeans.labels_ == 8].value_counts()

class
1    2977
5     712
8     392
2     369
7     236
4     231
6     190
3      84
9      77
0      29
Name: count, dtype: int64

In [115]:
# Digit-label breakdown of the training samples assigned to k-means cluster 9.
y_train[kmeans.labels_ == 9].value_counts()

class
8    3104
5    1828
3    1225
0     579
2     139
6     125
9      44
4      12
1       6
7       3
Name: count, dtype: int64

In [118]:
# Share of each digit in the training labels, normalised by the actual number
# of labels rather than a hard-coded 60000.
y_train.value_counts() / len(y_train)

class
1    0.112367
7    0.104417
3    0.102183
2    0.099300
9    0.099150
0    0.098717
6    0.098633
8    0.097517
4    0.097367
5    0.090350
Name: count, dtype: float64

1. Manually cluster the datapoints with n = {number of labels} clusters and see which labels are often clustered correctly and which labels tend to get clustered with others

2. Retrieve the ratio of each label in the entire original dataset:
    for each label: ratio of label = occurrences of label / total number of datapoints
3. Calculate number of data points for each label needed in M, according to the ratio from step (2)
    for each label: number of datapoints = ratio of label * M
4. For the labels that mostly get clustered with their own kind in step 1, reduce their number of points in M by half, and add those points to the labels that mostly get clustered together

5. For each label that mostly clustered with itself, perform k means clustering on data points of the label with k = number of datapoints going to M and return the points closest to the centroid

6. For labels that get clustered together, perform k means clustering on datapoints of those labels with k = number of datapoints going to M and return the points closest to the centroid

7. combine datapoints from step 5 and 6

In [17]:
def kmeans_subset_selection(data, labels, M, manual_clusters, adjustment_factor=0.5):
    """Select roughly M prototypes with per-label budgets and k-means.

    Each label first gets a budget proportional to its frequency in ``labels``.
    Labels listed as "self-clustered" keep only ``adjustment_factor`` of their
    budget; the points freed this way are shared among the "combined" labels.
    Self-clustered labels are then covered by a random sample plus the points
    nearest to 10 k-means centroids. Each combined group is covered by one
    k-means run with as many clusters as its total budget, keeping the point
    nearest to every centroid.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix, one row per sample. All columns are used as features.
    labels : pandas.Series
        Labels; attached to ``data`` by index alignment.
    M : int
        Target size of the selected subset. Budgets are truncated to integers,
        so the result can be slightly smaller than ``M``.
    manual_clusters : dict
        ``'self_clustered'``: list of labels that k-means separates well on
        their own. ``'combined_clustered'``: list of lists of labels that tend
        to share clusters.
    adjustment_factor : float, default 0.5
        Fraction of its proportional budget that a self-clustered label keeps.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Selected feature rows and their labels (Series named ``'labels'``).
    """
    # Use the real column count rather than assuming 784 pixel columns.
    n_features = data.shape[1]
    training = data.copy()
    training['labels'] = labels

    # Step 2: ratio of each label in the dataset.
    unique_labels, counts = np.unique(labels, return_counts=True)
    total_points = len(labels)
    label_ratios = {label: count / total_points for label, count in zip(unique_labels, counts)}

    # Step 3: proportional budget of each label within M.
    label_counts = {label: int(ratio * M) for label, ratio in label_ratios.items()}

    # Step 4a: shrink the budget of self-clustered labels and track what is freed.
    total_reduction = 0
    for label in manual_clusters['self_clustered']:
        original_count = label_counts[label]
        reduced_count = int(original_count * adjustment_factor)
        total_reduction += original_count - reduced_count
        label_counts[label] = reduced_count

    # Step 4b: share the freed points among the combined labels. The remainder
    # is handed out one point per label, in order, until it is exhausted.
    combined_labels = [label for group in manual_clusters['combined_clustered'] for label in group]
    if combined_labels:
        extra_points_per_label, remainder = divmod(total_reduction, len(combined_labels))
        for position, label in enumerate(combined_labels):
            label_counts[label] += extra_points_per_label + (1 if position < remainder else 0)

    # Collect the selected pieces and concatenate once at the end; growing a
    # DataFrame inside the loops copies it on every iteration.
    pieces = []

    # Step 5: self-clustered labels get about 90% random rows plus the rows
    # nearest to 10 centroids.
    # NOTE(review): the random sample and the centroid-nearest rows are drawn
    # independently, so the same row can be selected twice.
    for label in manual_clusters['self_clustered']:
        label_data = training[np.isin(labels, label)]
        n1 = label_counts[label]
        sampled_indices = label_data.index.to_series().sample(n=int(n1 * 0.9))
        pieces.append(label_data.loc[sampled_indices])

        per_centroid = n1 // 100
        if per_centroid == 0:
            # Nothing would be taken from the centroids, so skip the fit.
            continue
        k = 10
        label_features = label_data.iloc[:, :n_features]
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(label_features)
        klabels = kmeans.labels_
        feature_values = label_features.to_numpy()
        for i in range(k):
            members = np.flatnonzero(klabels == i)
            distances = np.linalg.norm(feature_values[members] - kmeans.cluster_centers_[i], axis=1)
            pieces.append(label_data.iloc[members[np.argsort(distances)[:per_centroid]]])

    # Step 6: each combined group gets one cluster per budgeted point and
    # keeps the row nearest to every centroid.
    for cluster in manual_clusters['combined_clustered']:
        combined_data = training[np.isin(labels, cluster)]
        k = sum(label_counts[label] for label in cluster)
        if k <= 0:
            print('something wrong')
            continue
        combined_features = combined_data.iloc[:, :n_features]
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(combined_features)
        klabels = kmeans.labels_
        feature_values = combined_features.to_numpy()
        nearest_positions = []
        for i in range(k):
            members = np.flatnonzero(klabels == i)
            if members.size == 0:
                continue
            distances = np.linalg.norm(feature_values[members] - kmeans.cluster_centers_[i], axis=1)
            nearest_positions.append(members[np.argmin(distances)])
        pieces.append(combined_data.iloc[nearest_positions])

    # Step 7: combine the selected rows.
    prototypes = pd.concat(pieces, axis=0) if pieces else training.iloc[:0]
    return prototypes.iloc[:, :n_features], prototypes['labels']

In [22]:
# Single selection run (M = 10000) so the resulting prototype frame can be inspected.
a, b = kmeans_subset_selection(
    x_train,
    y_train,
    10000,
    {
        'self_clustered': ['1', '2', '6', '0'],
        'combined_clustered': [['7', '9', '4'], ['3', '5', '8']],
    },
)

In [23]:
a

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
28894,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 10000

In [28]:
%%time
# Ten repetitions of the label-aware k-means selection with M = 10000,
# each followed by a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = kmeans_subset_selection(
        x_train,
        y_train,
        10000,
        {
            'self_clustered': ['1', '2', '6', '0'],
            'combined_clustered': [['7', '9', '4'], ['3', '5', '8']],
        },
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.9507099999999999
0.001266056870760541
CPU times: total: 24min 40s
Wall time: 19min 15s


## 5000

In [29]:
%%time
# Ten repetitions of the label-aware k-means selection with M = 5000,
# each followed by a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = kmeans_subset_selection(
        x_train,
        y_train,
        5000,
        {
            'self_clustered': ['1', '2', '6', '0'],
            'combined_clustered': [['7', '9', '4'], ['3', '5', '8']],
        },
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.94259
0.0017980266961310685
CPU times: total: 12min 58s
Wall time: 7min 16s


## 1000

In [30]:
%%time
# Ten repetitions of the label-aware k-means selection with M = 1000,
# each followed by a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = kmeans_subset_selection(
        x_train,
        y_train,
        1000,
        {
            'self_clustered': ['1', '2', '6', '0'],
            'combined_clustered': [['7', '9', '4'], ['3', '5', '8']],
        },
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.90231
0.0028693030512652257
CPU times: total: 5min 48s
Wall time: 1min 41s


## 100

In [31]:
%%time
# Ten repetitions of the label-aware k-means selection with M = 100,
# each followed by a 1-NN fit and a test-set evaluation.
accuracies = []
for i in range(10):
    a, b = kmeans_subset_selection(
        x_train,
        y_train,
        100,
        {
            'self_clustered': ['1', '2', '6', '0'],
            'combined_clustered': [['7', '9', '4'], ['3', '5', '8']],
        },
    )
    nn1.fit(a, b)
    accuracies.append(accuracy_score(nn1.predict(x_test), y_test))
print(np.mean(accuracies))
print(np.std(accuracies))

0.74251
0.015885304529658858
CPU times: total: 1min 30s
Wall time: 44.7 s
