In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import time
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the liver-disease table. A forward slash is used as the separator:
# it works on every OS, whereas the previous "archive_3\L..." spelling
# contained "\L", which Python reports as an invalid escape sequence.
data = pd.read_csv("archive_3/Liver_disease_data.csv")

In [3]:
def encode(name, frame=None):
    """Replace a column with integer category codes, in place.

    Parameters
    ----------
    name : str
        Label of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``data`` frame
        is used, which matches the original one-argument behaviour.

    Returns
    -------
    None
        The column ``name`` of the frame is overwritten with the codes
        produced by ``pandas.factorize`` (codes are assigned in order of
        first appearance).
    """
    if frame is None:
        frame = data
    # factorize returns (codes, uniques); only the codes are stored.
    # The earlier `data.drop(name, axis=1)` call discarded its result and
    # therefore did nothing, so it has been removed.
    codes, _ = pd.factorize(frame[name])
    frame[name] = codes

In [4]:
# Expand the "Diagnosis" response column into one indicator column per class
data = pd.get_dummies(data=data, columns=["Diagnosis"])

In [5]:
# Keep every column except the 'Diagnosis_0' indicator
keepColumns = data.columns != 'Diagnosis_0'
dataset = data.loc[:, keepColumns]

In [6]:
# Also remove the 'Diagnosis_1' indicator so only feature columns remain
keepColumns = dataset.columns != 'Diagnosis_1'
dataset = dataset.loc[:, keepColumns]

In [7]:
# One-hot targets as a NumPy array: column 0 is Diagnosis_0, column 1 is Diagnosis_1
labelColumns = ["Diagnosis_0", "Diagnosis_1"]
labels = data[labelColumns].values

In [8]:
# Standardise the features: fit the scaler on the feature table,
# then replace the table with its scaled version
scaler = StandardScaler()
scaler.fit(dataset)
dataset = scaler.transform(dataset)

In [9]:
# Convert features and one-hot labels to float32 tensors
dataset, labels = (
    torch.tensor(array, dtype=torch.float32) for array in (dataset, labels)
)

In [10]:
class LogisticRegressionModel(nn.Module):
    """Multinomial logistic regression: one linear layer followed by softmax.

    NOTE(review): by default ``forward`` returns class probabilities.
    ``nn.CrossEntropyLoss`` applies log-softmax internally and expects raw
    logits, so feeding it the default output normalises the scores twice.
    Call ``model(x, return_logits=True)`` when the result goes into such a
    loss; the default is kept so existing callers see the same output.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    num_classes : int
        Number of output classes.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x, return_logits=False):
        """Return class scores for a 2-D batch ``x``.

        Parameters
        ----------
        x : torch.Tensor
            Input batch; softmax is taken over dimension 1.
        return_logits : bool, optional
            If True, return the output of the linear layer without softmax.
            Defaults to False (probabilities).

        Returns
        -------
        torch.Tensor
            Probabilities whose rows sum to one, or raw logits when
            ``return_logits`` is True.
        """
        logits = self.linear(x)
        if return_logits:
            return logits
        return F.softmax(logits, dim=1)

In [11]:
# Seed the RNG before the model is built so that the weight initialisation,
# and with it every loss/accuracy figure below, is reproducible on re-run.
torch.manual_seed(0)
model = LogisticRegressionModel(dataset.shape[1],2)
# NOTE(review): the model outputs softmax probabilities, while
# nn.CrossEntropyLoss expects raw logits, so this loss sees doubly
# normalised scores. Left unchanged so the reported losses stay comparable
# across cells.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.1)

In [12]:
# Number of full-batch passes used by the training loop in the next cell
epochs = 50

In [13]:
# Full-batch training on the whole dataset.
# The wall-clock timer covers the entire loop, per-epoch printing included.
startTime = time.time()
for epoch in range(epochs):
    model.train()
    # Clear stale gradients, then forward / backward / update
    optimizer.zero_grad()
    outputs = model(dataset)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")
elapsedMs = (time.time() - startTime) * 1000
print(f"Training took: {elapsedMs} milliseconds")

Epoch 1/50, Loss: 0.7383216023445129
Epoch 2/50, Loss: 0.6518785357475281
Epoch 3/50, Loss: 0.5923824310302734
Epoch 4/50, Loss: 0.5563733577728271
Epoch 5/50, Loss: 0.5354049801826477
Epoch 6/50, Loss: 0.5225387811660767
Epoch 7/50, Loss: 0.5137298703193665
Epoch 8/50, Loss: 0.5070165395736694
Epoch 9/50, Loss: 0.501541018486023
Epoch 10/50, Loss: 0.4969373643398285
Epoch 11/50, Loss: 0.49303877353668213
Epoch 12/50, Loss: 0.4897485375404358
Epoch 13/50, Loss: 0.48698893189430237
Epoch 14/50, Loss: 0.4846881330013275
Epoch 15/50, Loss: 0.4827801585197449
Epoch 16/50, Loss: 0.48120632767677307
Epoch 17/50, Loss: 0.4799148738384247
Epoch 18/50, Loss: 0.47885942459106445
Epoch 19/50, Loss: 0.477999210357666
Epoch 20/50, Loss: 0.47730016708374023
Epoch 21/50, Loss: 0.4767358899116516
Epoch 22/50, Loss: 0.4762864112854004
Epoch 23/50, Loss: 0.4759362041950226
Epoch 24/50, Loss: 0.47567105293273926
Epoch 25/50, Loss: 0.4754762053489685
Epoch 26/50, Loss: 0.4753357768058777
Epoch 27/50, Loss

In [14]:
# Switch the model to evaluation mode; the cell displays the returned module
model.eval()

LogisticRegressionModel(
  (linear): Linear(in_features=10, out_features=2, bias=True)
)

In [15]:
# Accuracy on the data the model was trained on: the arg-max output class
# is compared with the class marked in the one-hot label
with torch.no_grad():
    outputs = model(dataset)
    predicted = outputs.argmax(dim=1)
    expected = labels.argmax(dim=1)
    correct = (predicted == expected).sum().item()
    accuracy = correct / len(dataset)
    print(f"Accuracy: {accuracy}")

Accuracy: 0.8341176470588235


In [16]:
# Choose the subset of the data we want the model to forget:
# here, the first three rows of the dataset (with their labels)
forgetSet, forgetLabels = dataset[:3], labels[:3]

In [17]:
# Every row after the first three forms the retain set
retainSet, retainLabels = dataset[3:], labels[3:]

In [18]:
# Starting point for the unlearning process: pairwise dot products between
# the model outputs on the forget set (rows) and on the retain set (columns)
print(torch.matmul(model(forgetSet), model(retainSet).T))

tensor([[0.1110, 0.7012, 0.9332,  ..., 0.9837, 0.9561, 0.9016],
        [0.1052, 0.7042, 0.9396,  ..., 0.9908, 0.9628, 0.9075],
        [0.8254, 0.3317, 0.1377,  ..., 0.0955, 0.1186, 0.1641]],
       grad_fn=<MmBackward0>)


In [19]:
def findAccuracy(model, retainSet, state, labels=None):
    """Print and return the classification accuracy of ``model`` on a set.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate; it is switched to evaluation mode.
    retainSet : torch.Tensor
        Inputs passed to the model.
    state : str
        Word inserted into the printed message (e.g. "before" / "after").
    labels : array-like, optional
        One-hot targets aligned with ``retainSet``. Defaults to the
        notebook-level ``retainLabels``, which is what the original
        three-argument version always used.

    Returns
    -------
    float
        Fraction of rows whose arg-max prediction matches the arg-max of
        the one-hot label (``nan`` for an empty set).
    """
    if labels is None:
        labels = retainLabels
    model.eval()
    with torch.no_grad():
        # arg-max instead of a fixed 0.5 threshold: a threshold only
        # identifies the predicted class when there are exactly two classes
        predicted = model(retainSet).argmax(dim=1)
    expected = torch.as_tensor(labels).argmax(dim=1)
    total = len(expected)
    if total == 0:
        accuracy = float("nan")
    else:
        accuracy = (predicted == expected).sum().item() / total
    print(f"Accuracy {state} unlearning: {accuracy}")
    return accuracy

This is the original loss on the forget set

In [20]:
# Loss of the trained model on the forget set, before any unlearning step
forgetLossBefore = criterion(model(forgetSet), forgetLabels)
print("Previous Loss on Forget Set: ", forgetLossBefore.item())

Previous Loss on Forget Set:  0.33569732308387756


This is the previous loss on the retain set

In [21]:
# Loss of the trained model on the retain set, before any unlearning step
retainLossBefore = criterion(model(retainSet), retainLabels)
print("Previous loss on the retain set: ", retainLossBefore.item())

Previous loss on the retain set:  0.47379621863365173


In [22]:
# Set-up for unlearning the forget set.
# NOTE(review): this SGD optimizer is reassigned in a later cell before any
# visible call to its .step() - confirm whether it is still needed.
optimizer = optim.SGD(params=model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [23]:
# Put the model in training mode and record its outputs on the forget set;
# these outputs fix the shape of the uniform target built in the next cell
model.train()
forgetOutputs = model(forgetSet)

In [24]:
# Uniform target distribution: each class gets probability 1 / (number of classes)
numClasses = forgetOutputs.shape[1]
uniformLabel = torch.ones_like(forgetOutputs) / numClasses

In [25]:
# Display the uniform target to check its values and shape
uniformLabel

tensor([[0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000]])

In [26]:
def kl_loss_sym(x, y):
    """Batch-mean KL divergence between target ``y`` and ``softmax(x)``.

    ``x`` is passed through log-softmax over its last dimension, so it is
    treated as unnormalised scores; ``y`` is the target distribution given
    as probabilities. Despite the name, only this one direction of the
    divergence is computed.
    """
    log_probs = F.log_softmax(x, dim=-1)
    return F.kl_div(log_probs, y, reduction='batchmean')

In [27]:
# Adam optimizer over the model parameters, used by the KL step in the next cell
optimizer_loss = optim.Adam(model.parameters(), lr = 0.01)

In [28]:
# One optimisation step that pulls the forget-set outputs towards the
# uniform target, followed by a print of the updated outputs.
# NOTE(review): kl_loss_sym applies log-softmax to its first argument, and
# the model output is already a softmax - confirm this is intended.
for i in range(1):
    model.train()
    optimizer_loss.zero_grad()
    outputs = model(forgetSet)
    loss = kl_loss_sym(outputs, uniformLabel)
    loss.backward()
    optimizer_loss.step()
    print(model(forgetSet))

tensor([[0.0153, 0.9847],
        [0.0092, 0.9908],
        [0.8890, 0.1110]], grad_fn=<SoftmaxBackward0>)


In [29]:
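# After this initial step, we optimise a similarity-based objective built from the dot products between the forget-set and retain-set outputs (raw dot products, not a normalised cosine similarity)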


In [29]:
# Fresh Adam optimizers with a larger learning rate; both wrap the same
# model parameters
optimizer = optim.Adam(model.parameters(), lr=0.1)
optimizer_loss = optim.Adam(model.parameters(), lr=0.1)

In [30]:
# This will keep track of the total time for the unlearning
startTime = time.time()
epochs = 10
for epoch in range(epochs):
    model.train()
    # The retain-set outputs enter the objective as constants. Computing
    # them under no_grad yields the same values as `.detach()` without
    # building an autograd graph over the whole retain set first.
    with torch.no_grad():
        outputRetain = model(retainSet)
    outputForget = model(forgetSet)
    optimizer_loss.zero_grad()
    # Objective: negative mean log-softmax (taken across retain samples) of
    # the forget x retain dot products. These are raw dot products of the
    # output vectors, not a normalised cosine similarity.
    loss = (-1 * nn.LogSoftmax(dim=-1)((outputForget @ outputRetain.T))).mean()
    loss.backward()
    optimizer_loss.step()
    # Monitoring only: no gradients are needed for these two losses
    with torch.no_grad():
        print("Loss on forget is: ",criterion(model(forgetSet),forgetLabels).item())
        print("Loss on the overall dataset: ",criterion(model(dataset),labels).item())
print(f"Unlearning took: {(time.time() - startTime) * 1000} milliseconds")

Loss on forget is:  0.42367151379585266
Loss on the overall dataset:  0.4756835997104645
Loss on forget is:  0.569916844367981
Loss on the overall dataset:  0.4814296364784241
Loss on forget is:  0.6464361548423767
Loss on the overall dataset:  0.48517605662345886
Loss on forget is:  0.7528764605522156
Loss on the overall dataset:  0.4894867241382599
Loss on forget is:  0.7690455317497253
Loss on the overall dataset:  0.4908082187175751
Loss on forget is:  0.6944684386253357
Loss on the overall dataset:  0.4896646738052368
Loss on forget is:  0.574553906917572
Loss on the overall dataset:  0.48814478516578674
Loss on forget is:  0.5115971565246582
Loss on the overall dataset:  0.48874956369400024
Loss on forget is:  0.5311587452888489
Loss on the overall dataset:  0.49225711822509766
Loss on forget is:  0.6125810146331787
Loss on the overall dataset:  0.4976508617401123
Unlearning took: 135.30707359313965 milliseconds


In [31]:
# After unlearning: dot products between the current forget-set outputs
# (rows) and the retain-set outputs kept from the last loop iteration (columns)
torch.matmul(model(forgetSet), outputRetain.T)

tensor([[0.5091, 0.5157, 0.4840,  ..., 0.4771, 0.5129, 0.4791],
        [0.4970, 0.4949, 0.5052,  ..., 0.5075, 0.4958, 0.5069],
        [0.6197, 0.7073, 0.2891,  ..., 0.1978, 0.6708, 0.2236]],
       grad_fn=<MmBackward0>)

In [32]:
# Show each forget-set output next to its one-hot label
outputs = model(forgetSet)
outputRows = outputs.detach().numpy()
labelRows = forgetLabels.detach().numpy()
for row, target in zip(outputRows, labelRows):
    print(row, target)

[0.5228833  0.47711676] [0. 1.]
[0.4925024  0.50749755] [0. 1.]
[0.8024237  0.19757628] [1. 0.]


As you can see, the loss on the forget set is now noticeably higher than before (about 0.34 → 0.61), while the loss on the retain set is almost the same (about 0.47 → 0.50).

In [33]:
# Loss on the forget set after the unlearning loop
forgetLossAfter = criterion(model(forgetSet), forgetLabels)
print("Loss on forget is: ", forgetLossAfter.item())

Loss on forget is:  0.6125810146331787


In [34]:
# Loss on the retain set after the unlearning loop
retainLossAfter = criterion(model(retainSet), retainLabels)
print("Loss on retain is: ", retainLossAfter.item())

Loss on retain is:  0.497447669506073


We will now find out how much retraining a model from scratch on the retain set helps.

In [35]:
# Baseline: a brand-new model trained on the retain set only.
# Note that `model`, `criterion`, `optimizer` and `epochs` are all rebound here.
model = LogisticRegressionModel(dataset.shape[1], 2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)
epochs = 50
# The timer covers the whole loop, per-epoch printing included
startTime = time.time()
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(retainSet)
    loss = criterion(outputs, retainLabels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")
elapsedMs = (time.time() - startTime) * 1000
print(f"Retraining from scratch took {elapsedMs} milliseconds")

Epoch 1/50, Loss: 0.6899567246437073
Epoch 2/50, Loss: 0.6234439611434937
Epoch 3/50, Loss: 0.5785881876945496
Epoch 4/50, Loss: 0.5519060492515564
Epoch 5/50, Loss: 0.5355179905891418
Epoch 6/50, Loss: 0.5237887501716614
Epoch 7/50, Loss: 0.5145766735076904
Epoch 8/50, Loss: 0.5071172118186951
Epoch 9/50, Loss: 0.5010887980461121
Epoch 10/50, Loss: 0.49628525972366333
Epoch 11/50, Loss: 0.49250027537345886
Epoch 12/50, Loss: 0.48950543999671936
Epoch 13/50, Loss: 0.48707568645477295
Epoch 14/50, Loss: 0.4850287139415741
Epoch 15/50, Loss: 0.48325061798095703
Epoch 16/50, Loss: 0.4816906452178955
Epoch 17/50, Loss: 0.4803381860256195
Epoch 18/50, Loss: 0.47919756174087524
Epoch 19/50, Loss: 0.47827091813087463
Epoch 20/50, Loss: 0.4775495231151581
Epoch 21/50, Loss: 0.47701165080070496
Epoch 22/50, Loss: 0.47662392258644104
Epoch 23/50, Loss: 0.47634634375572205
Epoch 24/50, Loss: 0.47613751888275146
Epoch 25/50, Loss: 0.4759620726108551
Epoch 26/50, Loss: 0.47579455375671387
Epoch 27/

Hence, the unlearning here was twice as fast as the retraining, and just about as accurate.