In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("sagawa/ReactionT5v2-retrosynthesis")
model = AutoModelForSeq2SeqLM.from_pretrained("sagawa/ReactionT5v2-retrosynthesis")
model.resize_token_embeddings(len(tokenizer))


inp = tokenizer('CCN(CC)CCNC(=S)NC1CCCc2cc(C)cnc21', return_tensors='pt')
output = model.generate(**inp, num_beams=1, num_return_sequences=1, return_dict_in_generate=True, output_scores=True)
output = tokenizer.decode(output['sequences'][0], skip_special_tokens=True).replace(' ', '').rstrip('.')
output # 'CCN(CC)CCN=C=S.Cc1cnc2c(c1)CCCC2N'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/36.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/795M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

'CCN(CC)CCN=C=S.Cc1cnc2c(c1)CCCC2N'

In [2]:
import pandas as pd

# Lade deine Ursprungsdatei
df = pd.read_csv('/content/data_train.csv', header=None, names=['raw'])

# Beispiel: Trennung anhand des Strings '>>' (bitte an dein Format anpassen)
df[['PRODUCT', 'REACTANT']] = df['raw'].str.split('>>', expand=True)

# Optional: Whitespace entfernen
df['PRODUCT'] = df['PRODUCT'].str.strip()
df['REACTANT'] = df['REACTANT'].str.strip()

# Speichern als neue CSV mit nur den zwei Spalten
df[['PRODUCT', 'REACTANT']].to_csv('train_processed.csv', index=False)


In [11]:
from sklearn.model_selection import train_test_split
import os

#data_dir = '/content/ReactionT5v2/data'
data_dir = '/content/'


df = pd.read_csv('train_processed.csv')
df_products = pd.read_csv('/content/product_smiles_test.csv', header=None)

df_products.columns = ["PRODUCT"]

train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.to_csv(os.path.join(data_dir, 'train_data.csv'), index=False)
valid_df.to_csv(os.path.join(data_dir, 'valid_data.csv'), index=False)
df_products.to_csv(os.path.join(data_dir, 'test_data.csv'), index=False)

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict

train_df = pd.read_csv("train_data.csv")
valid_df = pd.read_csv("valid_data.csv")

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(valid_df)
})


In [5]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples["PRODUCT"], max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(examples["REACTANT"], max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
for i in range(5):
    print(tokenized_datasets["train"][i])


{'PRODUCT': 'CC(=N)C=C.NNN=C', 'REACTANT': 'CC(=N)N=C.NNC=C', 'input_ids': [26, 6, 15, 12, 5, 3, 15, 3, 221, 25, 12, 12, 12, 15, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [6]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [14]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to=[],
    predict_with_generate=True,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Seq2SeqTrainer(


In [None]:
import torch
torch.cuda.empty_cache()


In [16]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0257,0.023164
2,0.0205,0.020871
3,0.0174,0.01971
4,0.0146,0.019912
5,0.0122,0.020957
6,0.0105,0.022076
7,0.0091,0.023038
8,0.0081,0.024377
9,0.007,0.025653
10,0.0062,0.026629


TrainOutput(global_step=40000, training_loss=0.013471230643987656, metrics={'train_runtime': 23117.6992, 'train_samples_per_second': 13.842, 'train_steps_per_second': 1.73, 'total_flos': 9.756856615226573e+16, 'train_loss': 0.013471230643987656, 'epoch': 10.0})

In [18]:
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/tokenizer.json')

In [10]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.019990971311926842, 'eval_runtime': 154.0342, 'eval_samples_per_second': 51.937, 'eval_steps_per_second': 6.492, 'epoch': 5.0}


In [19]:
import torch
from tqdm import tqdm

df = pd.read_csv("test_data.csv")
smiles_list = df["PRODUCT"].tolist()

model = AutoModelForSeq2SeqLM.from_pretrained("./trained_model")
tokenizer = AutoTokenizer.from_pretrained("./trained_model")

# falls CUDA verfügbar:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

predictions = []
batch_size = 32

for i in tqdm(range(0, len(smiles_list), batch_size)):
    batch = smiles_list[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=4,
            early_stopping=True
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(decoded)

df["REACTANT"] = predictions

df.to_csv("retrosynthesis_results.csv", index=False)

  0%|          | 0/313 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 313/313 [05:20<00:00,  1.02s/it]


In [29]:
df["REACTANT"].to_csv("submission.csv", index=False, header=False)
