# Importing

In [11]:
import pandas as pd
import re 
from sklearn.model_selection import train_test_split
import numpy as np

# Data Preparation

In [None]:
df = pd.read_csv('D:\Kuliah\Tugas Akhir\LSTM-MBA\data\GSARPC3.csv')

df['target'] = df['categories'].apply(lambda x: 1 if x == "inhibitor" else 0)
print("Distribusi label:")
print(df['target'].value_counts(normalize=True))

df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['target']
)


Distribusi label:
target
0    0.504717
1    0.495283
Name: proportion, dtype: float64


# Membuat Pattern Untuk senyawa



Hal ini dilakukan agar bisa men-tokenisasi karakter kimia seperti "Br", "Cl", "Si", "Se".

In [6]:
pattern = re.compile(
    r"(\[[^\]]*\]|Br|Cl|Si|Se|@@?|==?|##?|/|\\|\(|\)|\.|\+|\-|%\d\d|%\d|\d|[A-Z][a-z]?|[a-z])"
)

def tokenize(smiles):
    return pattern.findall(smiles)

# Proses Tokenisasi

In [16]:
all_tokens = []
for s in df_train['smiles']:
    toks = tokenize(s)
    all_tokens.extend(toks)

vocab = sorted(set(all_tokens))

token_to_int = {tok: i+2 for i, tok in enumerate(vocab)}  
token_to_int['<PAD>'] = 0
token_to_int['<UNK>'] = 1   

vocab_size = len(token_to_int)
print("vocab_size:", vocab_size)


vocab_size: 28


# Penerapan Smiles to Sequence

In [17]:
MAX_LEN = min(250, max(len(tokenize(s)) for s in df_train['smiles']) + 5)

def smiles_to_seq(smiles_list, max_len):
    seqs = []
    for s in smiles_list:
        toks = tokenize(s)
        seq = [token_to_int.get(t, token_to_int['<UNK>']) for t in toks]
        
        if len(seq) > max_len:
            seq = seq[:max_len]
        else:
            seq += [0] * (max_len - len(seq))  
        
        seqs.append(seq)
    return np.array(seqs, dtype=np.int32)

X_train = smiles_to_seq(df_train['smiles'].tolist(), MAX_LEN)
X_test  = smiles_to_seq(df_test['smiles'].tolist(), MAX_LEN)

y_train = df_train['target'].values.astype(np.float32)
y_test  = df_test['target'].values.astype(np.float32)

print(f"X_train: {X_train.shape} | vocab: {vocab_size} | max_len: {MAX_LEN}")


X_train: (508, 168) | vocab: 28 | max_len: 168


In [19]:
np.savez(
    "data_preprocessed.npz",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    vocab_size=vocab_size
)

Dengan metode ini, data senyawa dalam format SMILES dapat dibaca oleh model LSTM