In [33]:
import pandas as pd
import numpy as np
import os

# Folder that holds the CSV files, relative to the notebook's working directory.
PATH = "./dataset/"

# Resolve both file locations first, then read each table into a DataFrame.
train_csv = os.path.join(PATH, "train_set.csv")
test_csv = os.path.join(PATH, "test_set.csv")
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [34]:
# Show the first rows of the training table as this cell's output.
train_df.head()

Unnamed: 0,index,SMILES,Reorg_g,Reorg_ex
0,train_0,CC[C@H]1CCCCN1C(=O)[C@@H](C)OC(=O)c1c(C)oc(-n2...,0.631486,0.53506
1,train_1,O[C@@H](CNC1CC1)CN1CCc2sccc2C1,0.825901,1.116781
2,train_2,N#CCCNC(=O)[C@@]1(O)CCSC1,1.463943,0.964848
3,train_3,COC[C@H]1CN(c2ccc(OCC[C@@H](C)O)cc2)C(=O)O1,0.166669,0.161458
4,train_4,N#Cc1c(-c2ccccc2OCC(N)=O)[nH]c(C(N)=O)c1N,0.31382,0.338862


In [35]:
def char_grams(text : str, n : int = 3, jump_size : int = 2):
    """Split ``text`` into character n-grams taken every ``jump_size`` characters.

    Args:
        text: String to slice.
        n: Length of every returned gram.
        jump_size: Distance between the start positions of consecutive grams.

    Returns:
        List of the length-``n`` substrings starting at 0, ``jump_size``,
        ``2 * jump_size``, ... Only full-length windows are produced, so
        trailing characters that do not fill a window are dropped and a text
        shorter than ``n`` gives an empty list.
    """
    grams = []
    last_start = len(text) - n
    for start in range(0, last_start + 1, jump_size):
        grams.append(text[start:start + n])
    return grams

# Attach a '3_gram' column (3-character grams, start positions 2 apart) to
# both tables, using the same settings for each.
for frame in (train_df, test_df):
    frame['3_gram'] = frame['SMILES'].apply(char_grams, args = (3, 2))

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Hold out 20% of the labelled rows for validation; the seed keeps the split fixed.
df_train, df_test = train_test_split(train_df, test_size = 0.2, random_state = 42)

# Inputs are the raw SMILES strings, targets are the two regression columns.
x_train, y_train = df_train['SMILES'], df_train[['Reorg_g','Reorg_ex']]
x_test, y_test = df_test['SMILES'], df_test[['Reorg_g','Reorg_ex']]

# Character 4-gram counts; the vocabulary is learned from the training split only.
# NOTE(review): CountVectorizer lower-cases its input by default, which merges
# upper- and lower-case characters of the SMILES strings - confirm that is intended.
vector = CountVectorizer(analyzer='char_wb', ngram_range=(4,4))
vector.fit(x_train)

x_train_cv = vector.transform(x_train)
x_test_cv = vector.transform(x_test)
# Pass the SMILES column, not the whole DataFrame: iterating a DataFrame yields
# its column names, so transform(test_df) would vectorize the column labels
# instead of the molecules.
x_sub_cv = vector.transform(test_df['SMILES'])

# vocabulary_ maps every learned n-gram to its column index, so its size is the
# feature count. It replaces get_feature_names(), which recent scikit-learn
# releases no longer provide.
print("number of ngrams : ", len(vector.vocabulary_))

number of ngrams :  6289


In [23]:
# RandomForest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Forest settings. random_state pins the bootstrap/feature sampling so the
# losses printed below are reproducible across runs.
kwargs = {
    'n_estimators' : 128,
    'n_jobs' : 16,
    'random_state' : 42,
}

# Fit one multi-output forest on the n-gram count matrix.
model = RandomForestRegressor(**kwargs)
model.fit(x_train_cv, y_train)

y_train_pred = model.predict(x_train_cv)
y_test_pred = model.predict(x_test_cv)

# RMSE per target column, then averaged over the targets. This is the value
# mean_squared_error(..., squared = False) reported; the `squared` argument is
# deprecated in recent scikit-learn releases, so the root is taken explicitly.
train_loss = np.sqrt(mean_squared_error(y_train, y_train_pred, multioutput = 'raw_values')).mean()
test_loss = np.sqrt(mean_squared_error(y_test, y_test_pred, multioutput = 'raw_values')).mean()

print("train loss : {:.3f}".format(train_loss))
print("test loss : {:.3f}".format(test_loss))

train loss : 0.111
test loss : 0.299


In [None]:
# Extract BoW features and training
from src.utils import extract_alphabet_dict

# Combined SMILES table and its character alphabet. The counting below does not
# need the alphabet any more, but these names are kept because other cells may
# read them.
total_smiles = pd.concat([train_df[['SMILES','index']] , test_df[['SMILES', 'index']]], axis = 0).reset_index(drop = True)
char2idx,idx2char = extract_alphabet_dict(total_smiles)

alphabet = list(char2idx.keys())

# Every pattern is the window seq[i-n:i+1], i.e. n + 1 = 5 consecutive characters.
n = 4
# Number of most frequent training patterns that become feature columns.
top_k = 1024


def _pattern_rows(smiles_series, n):
    """Map each (n + 1)-character window to the row positions it occurs in.

    Only windows that actually occur are stored. Enumerating every possible
    combination of alphabet characters up front would need len(alphabet) ** 5
    dictionary entries per table. A position is appended once per occurrence,
    so a pattern found twice in one string lists that position twice.
    """
    rows = {}
    for idx, seq in enumerate(smiles_series):
        for i in range(n, len(seq)):
            rows.setdefault(seq[i-n:i+1], []).append(idx)
    return rows


def _count_columns(pattern_rows, patterns, n_rows):
    """Build one float count vector of length ``n_rows`` for every pattern.

    Patterns missing from ``pattern_rows`` (e.g. seen in train but not in test)
    get an all-zero vector.
    """
    counts = {}
    for pattern in patterns:
        # Explicit integer dtype so an empty position list is still valid input.
        positions = np.asarray(pattern_rows.get(pattern, []), dtype = np.intp)
        counts[pattern] = np.bincount(positions, minlength = n_rows).astype(float)
    return counts


train_alphabet_dict = _pattern_rows(train_df['SMILES'], n)
test_alphabet_dict = _pattern_rows(test_df['SMILES'], n)

# Keep the top_k patterns by number of occurrences in the training table.
# Ties keep first-seen order (sorted is stable). If fewer than top_k distinct
# patterns occur, only those are used - no all-zero padding columns are added.
key_columns = sorted(train_alphabet_dict, key = lambda x : len(train_alphabet_dict[x]), reverse = True)[:top_k]

train_np_dict = _count_columns(train_alphabet_dict, key_columns, len(train_df))
test_np_dict = _count_columns(test_alphabet_dict, key_columns, len(test_df))

# Attach all count columns in one concat instead of inserting them one by one,
# which fragments the DataFrame. Columns left over from an earlier run of this
# cell are dropped first so that re-running it gives the same result.
train_df = pd.concat(
    [train_df.drop(columns = key_columns, errors = 'ignore'),
     pd.DataFrame(train_np_dict, index = train_df.index)],
    axis = 1,
)
test_df = pd.concat(
    [test_df.drop(columns = key_columns, errors = 'ignore'),
     pd.DataFrame(test_np_dict, index = test_df.index)],
    axis = 1,
)

In [25]:
from src.preprocessing import get_mol_properties

# Rebind train_df to whatever the project helper get_mol_properties returns.
# NOTE(review): presumably the helper appends molecular descriptor columns -
# confirm against src/preprocessing.
# NOTE(review): only train_df goes through the helper in this cell; test_df does not.
train_df = get_mol_properties(train_df)
train_df.head()

In [28]:
# The two target columns are named once and reused in the exclusion list.
y_cols = ['Reorg_g','Reorg_ex']

# Every remaining column except the identifier, the gram list and the raw
# SMILES string is used as a model input.
non_feature_cols = y_cols + ['index','3_gram','SMILES']
x_cols = train_df.columns.drop(non_feature_cols).to_list()

print("x_cols : ", x_cols)
print("y_cols : ", y_cols)

x_cols :  ['MolMR', 'NHOHCount', 'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumValenceElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'NumAromaticRings', 'NumSaturatedRings', 'NumAliphaticRings', 'NumAromaticHeterocycles', 'NumAromaticCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticCarbocycles', 'RingCount', 'FractionCSP3', 'TPSA', 'LabuteASA']
y_cols :  ['Reorg_g', 'Reorg_ex']


In [30]:
# Same 80/20 split and seed as the n-gram experiment, now on the table that
# carries the extra feature columns listed in x_cols.
df_train, df_test = train_test_split(train_df, test_size = 0.2, random_state = 42)

x_train, y_train = df_train[x_cols], df_train[y_cols]
x_test, y_test = df_test[x_cols], df_test[y_cols]

# Forest settings. random_state pins the bootstrap/feature sampling so the
# losses printed below are reproducible across runs.
kwargs = {
    'n_estimators' : 128,
    'n_jobs' : 16,
    'random_state' : 42,
}

model = RandomForestRegressor(**kwargs)
model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# RMSE per target column, then averaged over the targets. This is the value
# mean_squared_error(..., squared = False) reported; the `squared` argument is
# deprecated in recent scikit-learn releases, so the root is taken explicitly.
train_loss = np.sqrt(mean_squared_error(y_train, y_train_pred, multioutput = 'raw_values')).mean()
test_loss = np.sqrt(mean_squared_error(y_test, y_test_pred, multioutput = 'raw_values')).mean()

print("train loss : {:.3f}".format(train_loss))
print("test loss : {:.3f}".format(test_loss))

train loss : 0.112
test loss : 0.294


In [32]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

# Settings forwarded to every per-target LGBMRegressor.
kwargs = {
    'n_estimators' : 1024,
}

# LGBMRegressor is wrapped in MultiOutputRegressor, which fits one independent
# regressor per target column. x_train / y_train / x_test / y_test are the
# splits created in the preceding cell.
model = MultiOutputRegressor(LGBMRegressor(**kwargs), n_jobs = 16)
model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# RMSE per target column, then averaged over the targets. This is the value
# mean_squared_error(..., squared = False) reported; the `squared` argument is
# deprecated in recent scikit-learn releases, so the root is taken explicitly.
train_loss = np.sqrt(mean_squared_error(y_train, y_train_pred, multioutput = 'raw_values')).mean()
test_loss = np.sqrt(mean_squared_error(y_test, y_test_pred, multioutput = 'raw_values')).mean()

print("train loss : {:.3f}".format(train_loss))
print("test loss : {:.3f}".format(test_loss))

train loss : 0.139
test loss : 0.299
