# Data Preprocessing

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import numpy as np
import sentencepiece as spm
import scienceplots
import pandas as pd
from rdkit import Chem
# Global plot settings, followed by the SciencePlots style sheets.
# NOTE(review): 'text.usetex' is switched on here, yet the 'no-latex' style
# applied right after presumably switches it off again -- confirm which of the
# two is actually intended.
plt.rcParams['text.usetex'] = True
plt.rcParams['font.size'] = 14
plt.style.use(['science', 'no-latex'])

In [2]:
# Dataset root and the three split sub-folders underneath it.
data_path = "Liu_Kheyer_Retrosynthesis_Data"
train_path, validation_path, test_path = (
    os.path.join(data_path, split) for split in ("train", "validation", "test"))

# The files have no header row, so each frame gets a single column named 0.
# Reactants are read from the *targets* file and products from the *sources* file.
train_reactants_df = pd.read_csv(
    os.path.join(train_path, "train_targets.txt"), header=None)
train_products_df = pd.read_csv(
    os.path.join(train_path, "train_sources.txt"), header=None)

In [3]:
def process_dfs(react_df, prod_df):
    """Merge the raw reactant and product frames into one reaction frame.

    Both inputs are expected to carry their text in a column labelled ``0``;
    neither input is modified.

    The ``<RX_...>`` tag found in each product string is moved into its own
    ``reaction_type`` column and deleted from ``products``. Only the tag itself
    is removed, so any whitespace that surrounded it stays in ``products``.

    Returns a frame with the columns ``reactants``, ``products`` and
    ``reaction_type``, joined side by side on the index.
    """
    rx_tag = r"(\<RX_.*\>)"

    reactants = react_df.rename(columns={0: "reactants"})
    products = prod_df.rename(columns={0: "products"})

    # Keep a handle on the untouched product strings so the tag can be both
    # extracted from and stripped out of the same source text.
    tagged = products["products"]
    products["reaction_type"] = tagged.str.extract(rx_tag)
    products["products"] = tagged.str.replace(rx_tag, "", regex=True)

    return pd.concat([reactants, products], axis=1)


def prepare_whole_reaction_padded(df):
    """Add the two reaction-string columns to ``df`` in place.

    ``df`` must have ``reactants`` and ``products`` columns. All spaces are
    removed and the two sides are joined as ``reactants>>products``:

    * ``full_input_format_delimited`` -- that string wrapped in ``[BOS]`` and
      ``[EOS]``.
    * ``full_input_format`` -- the bare string.

    Nothing is returned; the frame passed in is modified.
    """
    # Strip the spaces once and reuse the result for both columns.
    reactants = df["reactants"].str.replace(" ", "")
    products = df["products"].str.replace(" ", "")
    reaction = reactants + ">>" + products

    df["full_input_format_delimited"] = "[BOS]" + reaction + "[EOS]"
    df["full_input_format"] = reaction


# Build the training frame, then add the reaction-string columns to it in place.
df = process_dfs(train_reactants_df, train_products_df)
prepare_whole_reaction_padded(df)

# Settings for the reaction fingerprints; only the size is changed here.
params = Chem.rdChemReactions.ReactionFingerprintParams()
params.fpSize = 2048

# Parse every undelimited reaction string into an RDKit reaction object ...
df["rxn"] = df["full_input_format"].apply(
    lambda rxn_smiles: Chem.rdChemReactions.ReactionFromSmarts(
        rxn_smiles, useSmiles=True))
# ... and derive one difference fingerprint per reaction.
df["fingerprint"] = df["rxn"].apply(
    lambda rxn: Chem.rdChemReactions.CreateDifferenceFingerprintForReaction(
        rxn, params))

In [4]:
# Train a SentencePiece model on the vocabulary file. The model is written
# under the prefix "m" and read back from m.model just below.
# [BOS], [EOS], [PAD] and "." are registered as user-defined symbols.
# NOTE(review): --bos_id=-1 / --eos_id=-1 presumably disable the built-in
# BOS/EOS pieces so that only the user-defined [BOS]/[EOS] exist -- confirm
# against the SentencePiece trainer options.
# NOTE(review): the trainer's result is consumed through the model file, so
# spt_1 looks unused -- confirm before relying on its value.
spt_1 = spm.SentencePieceTrainer.train(
    "--input=Liu_Kheyer_Retrosynthesis_Data/vocab2.txt --model_prefix=m  --user_defined_symbols=[BOS],[EOS],[PAD],. --vocab_size=56 --bos_id=-1 --eos_id=-1")
sp = spm.SentencePieceProcessor()
sp.load('m.model')
# Sanity check: the printed size should equal the requested --vocab_size.
print(sp.get_piece_size())

56


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=Liu_Kheyer_Retrosynthesis_Data/vocab2.txt --model_prefix=m  --user_defined_symbols=[BOS],[EOS],[PAD],. --vocab_size=56 --bos_id=-1 --eos_id=-1
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: Liu_Kheyer_Retrosynthesis_Data/vocab2.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 56
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: [BOS]
  user_defined_symbols: [EOS]
  user_defined_symbols: [PAD]
  user_defined_symbols: .
  required_cha

Now we can tokenize the training data:

In [5]:
# Look the padding id up by piece name. Encoding "[PAD]" and taking index 1
# only works while the tokenizer emits exactly one extra piece in front of the
# symbol; a direct vocabulary lookup does not depend on that.
PAD_TOKEN = sp.piece_to_id("[PAD]")


def tokenize_and_pad(df, max_len=0, tokenizer=None, pad_token=None):
    """Encode ``full_input_format_delimited`` and right-pad every row to one length.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``full_input_format_delimited`` column. As a side
        effect an ``input_ids`` column holding the *unpadded* ids is added to
        (or overwritten on) this frame.
    max_len : int, default 0
        Target sequence length. ``0`` means "use the longest encoded row in
        ``df``", so no row is dropped. Any other value is used as given and
        rows whose encoding is longer are removed.
    tokenizer : object, optional
        Anything exposing ``encode_as_ids(str) -> list of int``. Defaults to
        the notebook-level ``sp`` processor.
    pad_token : int, optional
        Id appended as padding. Defaults to the notebook-level ``PAD_TOKEN``.

    Returns
    -------
    tuple
        ``(ids, max_val, df_new)`` where ``ids`` is the array built from the
        padded rows, ``max_val`` is the length every row was padded to, and
        ``df_new`` is a filtered copy of ``df`` whose ``input_ids`` are padded
        and which carries an extra ``input_counts`` column (number of pad ids
        added per row).
    """
    if tokenizer is None:
        tokenizer = sp
    if pad_token is None:
        pad_token = PAD_TOKEN

    # Encode before looking at lengths, so the data-derived length never
    # depends on an ``input_ids`` column left behind by an earlier call.
    # The first id of each encoding is discarded -- presumably the whitespace
    # marker the tokenizer puts in front of the text; TODO confirm.
    df["input_ids"] = df["full_input_format_delimited"].apply(
        tokenizer.encode_as_ids).apply(lambda ids: ids[1:])
    lengths = df["input_ids"].str.len()

    # Either take the caller's length or derive it from the longest row.
    if max_len == 0:
        max_val = int(lengths.max()) if len(lengths) else 0
        print(max_val)
    else:
        max_val = max_len

    # Drop rows that do not fit into max_val ids.
    df_new = df[lengths <= max_val].copy()
    print("removed ", df.shape[0]-df_new.shape[0], " samples")

    # Number of pad ids each remaining row needs.
    df_new["input_counts"] = max_val-df_new["input_ids"].str.len()

    # Right-pad with pad_token (nothing is added in front).
    df_new["input_ids"] = df_new["input_ids"].apply(lambda ids: np.pad(
        ids, (0, max_val-len(ids)), mode='constant', constant_values=pad_token))

    return np.asarray(df_new["input_ids"].values.tolist()), max_val, df_new


# Tokenize the training reactions with a fixed length of 200 ids.
train_inputs, train_input_len, df_with_max_200 = tokenize_and_pad(df, 200)

# Turn the fingerprint of every kept row into a float64 NumPy vector, in the
# row order of df_with_max_200.
# NOTE(review): the zero-length buffer is presumably resized by
# ConvertToNumpyArray to the fingerprint size -- confirm against RDKit docs.
fingerprints = []
for fp in df_with_max_200["fingerprint"]:
    dense = np.zeros((0, ), dtype=np.float64)
    Chem.DataStructs.ConvertToNumpyArray(fp, dense)
    fingerprints.append(dense)

# Displayed check: sorted padded lengths next to the number of fingerprints.
df_with_max_200["input_ids"].str.len().sort_values(), len(fingerprints)

removed  474  samples


(0        200
 26676    200
 26677    200
 26678    200
 26679    200
         ... 
 13352    200
 13353    200
 13354    200
 13347    200
 40028    200
 Name: input_ids, Length: 39555, dtype: int64,
 39555)

Now let's create the pipeline for validation and test data:

In [6]:
def data_to_Ids_pipeline(reactants_df, products_df, max_len=0):
    """Run the whole preprocessing chain on one data split.

    The two raw frames are merged with ``process_dfs``, the reaction-string
    columns are added by ``prepare_whole_reaction_padded``, and the result is
    tokenized and padded by ``tokenize_and_pad``; ``max_len`` is forwarded to
    that last step unchanged.

    Returns ``(frame, ids, length)``: the frame returned by
    ``tokenize_and_pad``, the id array, and the padded length. Note the order
    differs from what ``tokenize_and_pad`` itself returns.
    """
    merged = process_dfs(reactants_df, products_df)
    prepare_whole_reaction_padded(merged)  # adds its columns in place
    token_ids, padded_len, kept = tokenize_and_pad(merged, max_len)
    return kept, token_ids, padded_len


# Validation split: products are read from the *sources* file and reactants
# from the *targets* file, then both go through the pipeline at 200 ids.
valid_products_df = pd.read_csv(
    os.path.join(validation_path, "valid_sources.txt"), header=None)
valid_reactants_df = pd.read_csv(
    os.path.join(validation_path, "valid_targets.txt"), header=None)
valid_df, valid_inputs, valid_input_len = data_to_Ids_pipeline(
    valid_reactants_df, valid_products_df, max_len=200)

# Test split: same file layout, same length.
test_products_df = pd.read_csv(
    os.path.join(test_path, "test_sources.txt"), header=None)
test_reactants_df = pd.read_csv(
    os.path.join(test_path, "test_targets.txt"), header=None)
test_df, test_inputs, test_input_len = data_to_Ids_pipeline(
    test_reactants_df, test_products_df, max_len=200)

# Report the shape of both id arrays.
print("Shape of validation Ids inputs:", valid_inputs.shape)
print("Shape of Test Ids inputs:", test_inputs.shape)

removed  69  samples
removed  48  samples
Shape of validation Ids inputs: (4935, 200)
Shape of Test Ids inputs: (4956, 200)
