In [4]:
from pynvml import *
from huggingface_hub import notebook_login
import pandas as pd
from datasets import Dataset


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
# Read the CSV file using pandas
df = pd.read_csv('May13VitCRun_300gen3kpop.csv')
# # Convert pandas DataFrame to Hugging Face dataset
dataset = Dataset.from_pandas(df)

In [3]:
# Calculate the maximum and minimum length of the components
max_length = df['target'].str.len().max()
min_length = df['kids'].str.len().min()

print("Kids Maximum Length:", max_length)
print("Kids Minimum Length:", min_length)

Kids Maximum Length: 158
Kids Minimum Length: 3


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Text to tokenize
text = "[ 1 0 ,   3 2 ,   1 1 ,   1 0 ,   1 6 ,   1 1"

# Tokenize the text
tokens = tokenizer.tokenize(text)

# Print the tokens
print(tokens)

['[', 'Ġ1', 'Ġ0', 'Ġ,', 'Ġ', 'Ġ', 'Ġ3', 'Ġ2', 'Ġ,', 'Ġ', 'Ġ', 'Ġ1', 'Ġ1', 'Ġ,', 'Ġ', 'Ġ', 'Ġ1', 'Ġ0', 'Ġ,', 'Ġ', 'Ġ', 'Ġ1', 'Ġ6', 'Ġ,', 'Ġ', 'Ġ', 'Ġ1', 'Ġ1']


In [7]:
from Tokenizer import Tokenizer

SFtokenizer = Tokenizer()
print(SFtokenizer.encode("[C>][Branch][:0chiral][Ring2][:0chiral][Ring1][C][=Branch][=C][Branch][C][=Branch][=O][pop][O][Ring1][=Branch][pop]"))

12 20 143 10 15 10 140 0 139 10 14 2 10 140 0 139 10 14 1 10 20 10 16 15 10 16 20 10 15 10 20 10 16 15 10 16 30 10 13 10 30 10 14 1 10 16 15 10 13 11


In [None]:
testSelfies = "[C][Branch][:0chiral][Ring2][:0chiral][Ring1][C][=Branch][=C][Branch][C][=Branch][=O][pop][O][Ring1][=Branch][pop]"
print(SFtokenizer.encode(testSelfies))
print(tokenizer.tokenize(testSelfies))
print(tokenizer.tokenize(SFtokenizer.encode(testSelfies)))

In [8]:
import re
# Tokenize the dataset
copy = df.copy()
copy['target'] = copy['target'].apply(SFtokenizer.encode)
copy['kids'] = copy['kids'].apply(SFtokenizer.encode)
copy['p_score'] = copy['p_score'].apply(lambda x: re.sub(r"^\d|\.", "", str(x)).replace("", " ")[1:])
copy['p_score'] = copy['p_score'].apply(lambda x: re.sub(r"(?<=\d)(?=\d)", " ", str(x)))
dataset = Dataset.from_pandas(copy)
dataset = dataset.train_test_split(test_size=.2)

In [None]:
dataset['train'][1]

In [9]:
#One column dataset
copy["TrainingData"] = copy["target"] + ' S '+ copy["p_score"] + ' K ' + copy["kids"]
dataset = Dataset.from_pandas(copy)
dataset = dataset.train_test_split(test_size=.2)

In [10]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["TrainingData"]])

encoded_dataset = dataset.map(preprocess_function, batched=True, num_proc=4, remove_columns=dataset["train"].column_names)

Map (num_proc=4):   0%|          | 0/210845 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (649 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (611 > 512). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/52712 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


In [11]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = encoded_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/210845 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/52712 [00:00<?, ? examples/s]

In [61]:
lm_datasetdf = lm_dataset.to_pandas()
lm_datasetdf.to_csv('lm_datasetdf.csv')

AttributeError: 'DatasetDict' object has no attribute 'to_pandas'

In [13]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [29]:
#!pip install git+https://github.com/huggingface/accelerate
# uninstall transformers
!pip uninstall  transformers -y
! pip install transformers datasets

Found existing installation: transformers 4.30.0.dev0
Uninstalling transformers-4.30.0.dev0:
  Successfully uninstalled transformers-4.30.0.dev0
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Using cached transformers-4.29.1-py3-none-any.whl (7.1 MB)
Installing collected packages: transformers
Successfully installed transformers-4.29.1


In [1]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

In [2]:
import wandb

wandb.init(
    project="VitCRun",
    config={
        "learning_rate": 2e-5,
        "num_train_epochs": 10,
    }
)


[34m[1mwandb[0m: Currently logged in as: [33malxfgh[0m ([33mlexer[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [58]:
%pip uninstall -y transformers accelerate
%pip install transformers==4.28.0 
%pip install accelerate -U

Found existing installation: transformers 4.28.0
Uninstalling transformers-4.28.0:
  Successfully uninstalled transformers-4.28.0
Found existing installation: accelerate 0.19.0
Uninstalling accelerate-0.19.0:
  Successfully uninstalled accelerate-0.19.0
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==4.28.0
  Using cached transformers-4.28.0-py3-none-any.whl (7.0 MB)
Installing collected packages: transformers
Successfully installed transformers-4.28.0
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Using cached accelerate-0.19.0-py3-none-any.whl (219 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [17]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="my_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=True,
    report_to='wandb',  # Enables reporting to W&B.
    run_name='test_run',  # Name of the W&B run.
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

Cloning https://huggingface.co/alxfgh/my_model into local empty directory.


  0%|          | 0/1228870 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [18]:
print_gpu_utilization()

GPU memory occupied: 3438 MB.
