# Setup

In [None]:
import sys
from google.colab import drive
# Mount Google Drive into the Colab runtime (remounting if already mounted).
drive.mount('/content/drive', force_remount=True)

# Project root on Drive and the data directory beneath it.
path_to_root = '/content/drive/My Drive/Colab Notebooks/BatuEl_Dissertation'
data_path = f'{path_to_root}/data'

# Put the project root on the import path so the `reprshift` imports in the
# following cells resolve.
sys.path.append(path_to_root)
print("Drive mounted.")

Mounted at /content/drive
Drive mounted.


In [None]:
import torch
import tqdm
from reprshift.learning.algorithms import ERM, Algorithm
from reprshift.models.hparams import hparams_f
from reprshift.models.networks import Featurizer, Classifier
from reprshift.dataset.datasets import MultiNLI, CivilComments
from reprshift.dataset.dataloaders import InfiniteDataLoader, FastDataLoader

# LfF Algorithm (Learning from Failure)

In [None]:
import torch
from reprshift.models.networks import Featurizer, Classifier
from reprshift.learning.optimization import get_bert_optim
from transformers import get_scheduler

In [None]:
# TODO: decide whether a non-linear classifier head is the right choice (maybe?)
def NonLinearClassifier(in_features, out_features):
    """Build a three-layer MLP classification head.

    The layer widths are ``in_features -> in_features // 2 ->
    in_features // 4 -> out_features`` with a ReLU after each of the first
    two linear layers.

    Args:
        in_features: Width of the input feature vector.
        out_features: Number of output units (logits).

    Returns:
        A ``torch.nn.Sequential`` holding the five modules in order.
    """
    half_width = in_features // 2
    quarter_width = in_features // 4
    layers = [
        torch.nn.Linear(in_features, half_width),
        torch.nn.ReLU(),
        torch.nn.Linear(half_width, quarter_width),
        torch.nn.ReLU(),
        torch.nn.Linear(quarter_width, out_features),
    ]
    return torch.nn.Sequential(*layers)

# Based on: https://github.com/YyzHarry/SubpopBench
class LfF(Algorithm):
    """
    Learning from Failure (LfF) [https://arxiv.org/pdf/2007.02561.pdf]

    Holds two models that are trained together in ``update``:

    * ``biased_network`` -- a ``Featurizer`` followed by the MLP head built by
      ``NonLinearClassifier``; trained with the generalized cross-entropy
      loss (``GCE2``).
    * ``pred_model`` -- an ``ERM`` instance; trained with per-sample
      cross-entropy re-weighted by ``ce_b / (ce_b + ce_d)``, where ``ce_b``
      and ``ce_d`` are the cross-entropies of the biased and the prediction
      model respectively.

    Hyper-parameters read from ``hparams``: ``last_layer_dropout``, ``LfF_q``,
    ``lr``, ``weight_decay``, ``num_warmup_steps``, ``num_training_steps``.
    """
    def __init__(self, num_classes, num_attributes, hparams):
        super().__init__(num_classes, num_attributes, hparams)

        self.pred_model = ERM(num_classes, num_attributes, hparams)
        self.biased_featurizer = Featurizer(hparams['last_layer_dropout'])
        # Hard-coded input width of the biased head.
        # NOTE(review): presumably the featurizer's hidden size
        # (e.g. featurizer.config.hidden_size) -- confirm against Featurizer.
        cls_in_features = 768
        # Biased head: the MLP built by NonLinearClassifier (an earlier comment
        # described it as linear, which does not match the code).
        self.biased_classifier = NonLinearClassifier(cls_in_features, num_classes)
        self.biased_network = torch.nn.Sequential(self.biased_featurizer, self.biased_classifier)
        # Exponent q of the generalized cross-entropy loss.
        # NOTE(review): self.hparams is assumed to be set by Algorithm.__init__.
        self.q = self.hparams['LfF_q']
        self._init_model()

    def _init_model(self):
        """Create the optimizers and LR schedulers for both models."""
        lr = self.hparams['lr']
        weight_decay = self.hparams['weight_decay']
        num_warmup_steps = self.hparams['num_warmup_steps']
        num_training_steps = self.hparams['num_training_steps']
        self.pred_model._init_model(lr, weight_decay, num_warmup_steps, num_training_steps)

        self.clip_grad = True
        self.biased_network.zero_grad()

        # The biased network gets its own optimizer and linear schedule.
        self.optimizer_b = get_bert_optim(self.biased_network, lr, weight_decay)
        self.lr_scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer_b,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
        )

    # implemented from equation
    def GCE(self, logits, targets):
        """Per-sample generalized cross-entropy ``(1 - p_y ** q) / q``.

        Args:
            logits: Tensor of shape (batch, classes).
            targets: 1-D tensor of target class indices.

        Returns:
            Tensor of shape (batch,) with the unreduced loss.
        """
        p = torch.nn.functional.softmax(logits, dim=1)
        Yg = torch.gather(p, 1, torch.unsqueeze(targets, 1))
        # squeeze(1) rather than squeeze(): keeps the batch dimension even
        # when the batch holds a single sample.
        loss = (1 - Yg.squeeze(1)**self.q) / self.q
        return loss

    # copied from the authors' repo
    def GCE2(self, logits, targets):
        """Per-sample cross-entropy scaled by the detached weight ``p_y ** q * q``.

        Because the weight is detached, gradients flow only through the
        cross-entropy term.

        Args:
            logits: Tensor of shape (batch, classes).
            targets: 1-D tensor of target class indices.

        Returns:
            Tensor of shape (batch,) with the unreduced loss.
        """
        p = torch.nn.functional.softmax(logits, dim=1)
        Yg = torch.gather(p, 1, torch.unsqueeze(targets, 1))
        loss_weight = (Yg.squeeze(1).detach()**self.q) * self.q
        loss = torch.nn.functional.cross_entropy(logits, targets, reduction='none') * loss_weight
        return loss

    def update(self, minibatch, step):
        """Run one joint optimisation step on both models.

        Args:
            minibatch: Tuple ``(indices, x, y, attributes)``; only ``x`` and
                ``y`` are used here.
            step: Current training step (unused by this method).

        Returns:
            Dict of Python floats with keys ``'loss'`` (sum of both terms),
            ``'loss_pred'`` and ``'loss_gce'``.
        """
        _, all_x, all_y, _ = minibatch
        pred_logits = self.pred_model.predict(all_x)
        biased_logits = self.biased_network(all_x)
        loss_gce = self.GCE2(biased_logits, all_y).mean()
        ce_b = torch.nn.functional.cross_entropy(biased_logits, all_y, reduction='none')
        ce_d = torch.nn.functional.cross_entropy(pred_logits, all_y, reduction='none')
        # Relative difficulty weight: large when the biased model's loss on a
        # sample dominates. Detached so it acts as a constant in backward;
        # 1e-8 guards against division by zero.
        weights = (ce_b / (ce_b + ce_d + 1e-8)).detach()

        self.optimizer_b.zero_grad()
        self.pred_model.optimizer.zero_grad()

        loss_pred = (ce_d * weights).mean()
        loss = loss_pred + loss_gce
        loss.backward()

        if self.clip_grad:
            torch.nn.utils.clip_grad_norm_(self.biased_network.parameters(), 1.0)
            torch.nn.utils.clip_grad_norm_(self.pred_model.parameters(), 1.0)
        self.optimizer_b.step()
        self.pred_model.optimizer.step()

        self.lr_scheduler.step()
        self.pred_model.lr_scheduler.step()

        self.biased_network.zero_grad()
        self.pred_model.zero_grad()

        return {'loss': loss.item(), 'loss_pred': loss_pred.item(), 'loss_gce': loss_gce.item()}

    def return_feats(self, x):
        """Return the features produced by the prediction model's featurizer."""
        return self.pred_model.featurizer(x)

    def predict(self, x):
        """Return the prediction model's output for ``x``."""
        return self.pred_model.predict(x)


# Data

In [None]:
# Fetch the hyper-parameter set for the 'LfF' algorithm from the project's
# hparams_f helper.
hparams = hparams_f('LfF')
# Bare expression so the notebook displays the values as the cell output.
hparams

{'batch_size': 16,
 'last_layer_dropout': 0.5,
 'optimizer': 'adamw',
 'weight_decay': 0.0001,
 'lr': 1e-05,
 'group_balanced': False,
 'num_training_steps': 30001,
 'num_warmup_steps': 0,
 'LfF_q': 0.7}

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Passed as `weights` to the data loader; None means no weights are supplied.
train_weights = None
batch_size = hparams['batch_size']

In [None]:
# Dataset to train on; supported values: 'CivilComments', 'MultiNLI'.
DATASET = 'CivilComments'  # 'CivilComments' , 'MultiNLI'

if DATASET == 'MultiNLI':
    NUM_CLASSES = 3
    NUM_ATTRIBUTES = 2
    train_dataset = MultiNLI(data_path, 'tr', hparams)
    models_path = path_to_root + '/models/models_mnli'
elif DATASET == 'CivilComments':
    NUM_CLASSES = 2
    NUM_ATTRIBUTES = 8
    train_dataset = CivilComments(data_path, 'tr', hparams, granularity="fine")
    models_path = path_to_root + '/models/models_civilcomments'
else:
    # Fail here instead of printing and continuing: the later cells need
    # train_dataset, models_path, NUM_CLASSES and NUM_ATTRIBUTES and would
    # otherwise die with an unrelated NameError.
    raise ValueError(f'Dataset Not Implemented: {DATASET!r}')

print(DATASET)

CivilComments


In [None]:
# Build a training loader over train_dataset with a single worker process.
# Note: the training cell constructs its own InfiniteDataLoader per seed and
# rebinds `train_loader`.
train_loader = InfiniteDataLoader(  dataset=train_dataset,
                                    weights=train_weights,
                                    batch_size=batch_size,
                                    num_workers=1)
# True division: this is a float number of batches per pass over the dataset.
steps_per_epoch = len(train_dataset) / batch_size

  self.pid = os.fork()


# Model

In [None]:
algorithm_name = 'LfF'
# One training run is performed per seed.
random_seeds = [0, 1, 2]
# Directory prefix under which checkpoints and loss logs are written.
state_dict_PATH = models_path + '/07_lff/'


def init_state_dict_path(random_seed):
    """Return the path of the epoch-0 random-init state dict for ``random_seed``."""
    return models_path + f'/00_randominit/seed{random_seed}/sd_epoch0.pth'

# Training

In [None]:
import os

start_step = 1
n_steps = 30001  # same value as hparams['num_training_steps']
checkpoint_freq = 1000
# Per-step total loss, keyed by seed.
train_losses = {}

for seed in random_seeds:
    print('Training Seed:' , seed)
    algorithm = LfF(num_classes=NUM_CLASSES, num_attributes=NUM_ATTRIBUTES, hparams=hparams)
    algorithm.to(device)

    ### Matching the Keys ###
    # The init checkpoint's keys are prefixed with 'pred_model.' so they line
    # up with the LfF state dict; every other entry keeps its freshly
    # constructed value. map_location lets a checkpoint saved on another
    # device load onto the current one.
    sd_init = torch.load(init_state_dict_path(seed), map_location=device)
    sd_algorithm = algorithm.state_dict()
    sd_algorithm.update({f'pred_model.{key}': value for key, value in sd_init.items()})
    algorithm.load_state_dict(sd_algorithm)
    #########################

    train_losses[seed] = []

    # Make sure the per-seed checkpoint directory exists; torch.save does not
    # create missing parent directories.
    seed_dir = os.path.join(state_dict_PATH, f'seed{seed}')
    os.makedirs(seed_dir, exist_ok=True)

    train_loader = InfiniteDataLoader(
        dataset=train_dataset,
        weights=train_weights,
        batch_size=batch_size,
        num_workers=1,
    )
    train_minibatches_iterator = iter(train_loader)

    for step in tqdm.tqdm(range(start_step, n_steps)):
        ### Training Step ###
        i, x, y, a = next(train_minibatches_iterator)
        minibatch_device = (i, x.to(device), y.to(device), a.to(device))
        algorithm.train()
        step_vals = algorithm.update(minibatch_device, step)
        train_losses[seed].append(step_vals['loss'])

        ### Checkpointing ###
        # Save every checkpoint_freq steps and on the final step.
        if (step % checkpoint_freq == 0) or (step == n_steps - 1):
            epoch = step // checkpoint_freq
            algorithm_state_dict_PATH = os.path.join(seed_dir, f'sd_epoch{epoch}.pth')
            torch.save(algorithm.state_dict(), algorithm_state_dict_PATH)

    # Note: the whole train_losses dict is saved, so each seed's file also
    # contains the losses of the seeds trained before it.
    loss_PATH = state_dict_PATH + f'Loss_{algorithm_name}_{seed}.pth'
    torch.save(train_losses, loss_PATH)

Training Seed: 0


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  self.pid = os.fork()
 27%|██▋       | 8002/30000 [2:57:17<8:07:23,  1.33s/it] 


KeyboardInterrupt: 