In [1]:

%load_ext autoreload
%autoreload 2

In [2]:
from transformers import RobertaTokenizer, RobertaModel

# Load pre-trained model and tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
try_model = RobertaModel.from_pretrained(model_name)

# Prepare the input text
text = ["This is an example sentence.", "this is another example sentence"]
encoded_input = tokenizer(text, return_tensors="pt")

# Feed the input into the model
output = try_model(**encoded_input)

# Extract the CLS token embedding
cls_embedding = output.last_hidden_state[:, 0, :]

print(cls_embedding)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [1]:
import torch
import argparse
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from load_wiki_dataset import wikiData
from RoBERTa import CustomRobertaModel
from losses import align_loss, uniform_loss
from transformers import RobertaTokenizer, RobertaModel

# Check for GPU availability and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=8, help='Batch size')
    parser.add_argument('--learning_rate', type=float, default=1e-4, help='Learning rate')
    parser.add_argument('--data_path', type=str, default='../data/wiki1m_for_simcse.txt', help='Path to the dataset')
    return parser.parse_args()

def train_wiki(args,model):
    # Load dataset
    with open(args.data_path, 'r', encoding='UTF-8') as f:
        input_text = f.readlines()

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')  # ADD THIS LINE

    wiki = wikiData(input_text, tokenizer)  # PASS tokenizer TO wikiData
    train_params = {'batch_size': args.batch_size, 'shuffle': True, 'num_workers': 0}
    trainloader = DataLoader(wiki, **train_params)
    
    # Initialize model
    model.to(device)
    model.train()

    # Set up the optimizer
    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    for epoch in range(args.epochs):
            epoch_loss = 0
            for batch in tqdm(trainloader, desc=f"Epoch {epoch + 1}/{args.epochs}"):
                optimizer.zero_grad()
                batch = {k: v.to(device) for k, v in batch.items()}  # move batch to device
                
                loss, _ = model(batch)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                # print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(trainloader)}")
    torch.save(model.state_dict(), f'our_loss_{epoch + 1}wiki_model.pth')
    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(trainloader)}")

    return model

In [10]:
args = argparse.Namespace()
args.epochs = 1
args.batch_size = 8
args.learning_rate = 1e-4
args.data_path = "../data/wiki1m_for_simcse.txt"

In [11]:
from RoBERTa import CustomRobertaModel
raw_model= CustomRobertaModel()



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [12]:
simcse_1wiki_model = train_wiki(args, raw_model)
    # Save the trained model if needed
simcse_1wiki_model.save_pretrained('./simcse_1wiki_model')
torch.save(simcse_1wiki_model.state_dict(), 'simcse_1wiki_model.pth')

Epoch 1/1: 100%|██████████| 125000/125000 [5:39:37<00:00,  6.13it/s]  


Epoch 1 Loss: -3.7516470831336677


In [24]:
from RoBERTa import CustomRobertaModel

In [25]:
wiki_model2 = CustomRobertaModel()
wiki_model2.load_state_dict(torch.load('1wiki.pth'))


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

<All keys matched successfully>

In [7]:
from transformers import AutoModel,RobertaTokenizer # Load the trained model 

wiki_model = AutoModel.from_pretrained('./trained_model_wiki')

Some weights of the model checkpoint at ./trained_model_wiki were not used when initializing RobertaModel: ['roberta.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_down.0.weight', 'roberta_m.roberta.encoder.layer.8.attention.self.value.bias', 'roberta.roberta.encoder.layer.4.output.dense.weight', 'roberta_m.roberta.encoder.layer.8.output.LayerNorm.weight', 'roberta_m.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_down.0.weight', 'roberta.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_down.0.weight', 'roberta_m.roberta.encoder.layer.9.attention.self.key.bias', 'roberta_m.roberta.encoder.layer.4.intermediate.dense.weight', 'roberta_m.roberta.encoder.layer.4.attention.self.query.weight', 'roberta.roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.roberta.encoder.layer.9.attention.output.dense.bias', 'roberta_m.roberta.encoder.layer.5.output.dense.weight', 'roberta.roberta.encoder.layer.8.attention.self.value.weight', 'roberta_m.robert

In [3]:
from load_allsides_dataset import allsidesData
from torch.utils.data import DataLoader
import torch
import argparse
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from load_wiki_dataset import wikiData
from losses import align_loss, uniform_loss
from transformers import RobertaTokenizer, RobertaModel
def train_allsides(args, model_to_train):
    # Load dataset
    with open(args.data_path, 'r', encoding='UTF-8') as f:
        input_text = f.readlines()

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')  # ADD THIS LINE

    allsides = allsidesData(input_text, tokenizer)  # PASS tokenizer TO wikiData
    train_params = {'batch_size': args.batch_size, 'shuffle': True, 'num_workers': 0}
    trainloader = DataLoader(allsides, **train_params)
    
    # Initialize model
    model = model_to_train
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()

    # Set up the optimizer
    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    for epoch in range(args.epochs):
            epoch_loss = 0
            for batch in tqdm(trainloader, desc=f"Epoch {epoch + 1}/{args.epochs}"):
                optimizer.zero_grad()
                batch = {k: v.to(device) for k, v in batch.items()}  # move batch to device
                # print(model(batch))
                loss, cls = model(batch)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                # print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(trainloader)}")
    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(trainloader)}")

    return model

In [32]:
import argparse

args = argparse.Namespace()
args.epochs = 1
args.batch_size = 8
args.learning_rate = 1e-4
args.data_path = "../data/allsides.jsonl"

In [33]:
allsides_wiki_model = train_allsides(args,wiki_model2)
    # Save the trained model if needed
allsides_wiki_model.save_pretrained('./allsides_wiki_model')

Epoch 1/1: 100%|██████████| 65720/65720 [3:01:13<00:00,  6.04it/s]  


Epoch 1 Loss: -3.609909778521399


In [34]:
torch.save(allsides_wiki_model.state_dict(), 'allsides_wiki_model.pth')


In [37]:
args = argparse.Namespace()
args.epochs = 1
args.batch_size = 8
args.learning_rate = 1e-4
args.data_path = "../data/wiki1m_for_simcse.txt"

In [42]:
allsides_wiki2_model = train_wiki(args,allsides_wiki_model)
    # Save the trained model if needed
allsides_wiki2_model.save_pretrained('./allsides_wiki2_model')
torch.save(allsides_wiki2_model.state_dict(), 'allsides_wiki2_model.pth')


Epoch 1/1:  70%|██████▉   | 86938/125000 [3:55:10<1:42:57,  6.16it/s]


KeyboardInterrupt: 

In [3]:
from RoBERTa import CustomRobertaModel
wiki_model = CustomRobertaModel()
wiki_model.load_state_dict(torch.load('1wiki.pth'))
allsides_wiki_model = CustomRobertaModel()
allsides_wiki_model.load_state_dict(torch.load('allsides_wiki_model.pth'))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

<All keys matched successfully>

In [4]:
from RoBERTa import CustomRobertaModel
simcse_1wiki_model = CustomRobertaModel()
simcse_1wiki_model.load_state_dict(torch.load('simcse_1wiki_model.pth'))


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

<All keys matched successfully>

In [5]:
args = argparse.Namespace()
args.epochs = 1
args.batch_size = 8
args.learning_rate = 1e-4
args.data_path = "../data/wiki1m_for_simcse.txt"

In [8]:
simcse_2wiki_model = train_wiki(args,simcse_1wiki_model)
    # Save the trained model if needed
simcse_2wiki_model.save_pretrained('./simcse_2wiki_model')
torch.save(simcse_2wiki_model.state_dict(), 'simcse_2wiki_model.pth')

Epoch 1/1: 100%|██████████| 125000/125000 [5:35:28<00:00,  6.21it/s]  


Epoch 1 Loss: -3.8066049778327944


NameError: name 'allsides_wiki2_model' is not defined

In [9]:
simcse_2wiki_model.save_pretrained('./simcse_2wiki_model')
torch.save(simcse_2wiki_model.state_dict(), 'simcse_2wiki_model.pth')

In [2]:
from RoBERTa import CustomRobertaModel
simcse_2wiki_model = CustomRobertaModel()
simcse_2wiki_model.load_state_dict(torch.load('simcse_2wiki_model.pth'))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

RuntimeError: Error(s) in loading state_dict for CustomRobertaModel:
	Unexpected key(s) in state_dict: "roberta.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_up.bias", "roberta.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_down.0.weight", "roberta.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_down.0.bias", "roberta.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_up.weight", "roberta.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.0.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.1.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.2.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.3.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.4.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.5.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.6.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.7.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.8.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.9.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.10.output.adapters.my_adapter.adapter_up.bias", "roberta_m.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_down.0.weight", "roberta_m.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_down.0.bias", "roberta_m.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_up.weight", "roberta_m.roberta.encoder.layer.11.output.adapters.my_adapter.adapter_up.bias". 

In [6]:
simcse_3wiki_model = train_wiki(args,simcse_2wiki_model)
    # Save the trained model if needed
simcse_3wiki_model.save_pretrained('./simcse_3wiki_model')
torch.save(simcse_3wiki_model.state_dict(), 'simcse_3wiki_model.pth')

Epoch 1/1: 100%|██████████| 125000/125000 [5:38:20<00:00,  6.16it/s]  


Epoch 1 Loss: -3.821558924482346


In [2]:
from RoBERTa import CustomRobertaModel
real_simcse_wiki_model = CustomRobertaModel()


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [3]:
args = argparse.Namespace()
args.epochs = 2
args.batch_size = 8
args.learning_rate = 1e-4
args.data_path = "../data/wiki1m_for_simcse.txt"

In [4]:
real_simcse_2wiki_model  = train_wiki(args, real_simcse_wiki_model)
torch.save(real_simcse_2wiki_model.state_dict(), 'real_simcse_2wiki_model.pth')

Epoch 1/2: 100%|██████████| 125000/125000 [8:50:44<00:00,  3.93it/s]   
Epoch 2/2: 100%|██████████| 125000/125000 [8:40:10<00:00,  4.01it/s]   


Epoch 2 Loss: 1.5896223703017234


In [2]:
from RoBERTa import CustomRobertaModel
real_simcse_wiki_model = CustomRobertaModel()
args = argparse.Namespace()
args.epochs = 1
args.batch_size = 8
args.learning_rate = 1e-4
args.data_path = "../data/wiki1m_for_simcse.txt"
real_simcse_1wiki_model  = train_wiki(args, real_simcse_wiki_model)
torch.save(real_simcse_1wiki_model.state_dict(), 'real_simcse_1wiki_model.pth')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Epoch 1 Loss: 1.6381619698782004


In [5]:
try_moreal_simcse_wiki_model = CustomRobertaModel()
try_moreal_simcse_wiki_model.load_state_dict(torch.load('real_simcse_2wiki_model.pth'))



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

<All keys matched successfully>