In [1]:
import pandas as pd
import numpy as np 
from rdkit import Chem
from rdkit.Chem import Descriptors

In [2]:
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [3]:
print('train shape:', train.shape)
print('test shape:', test.shape)
print('sample_submission shape:', sample_submission.shape)

train shape: (7973, 7)
test shape: (3, 2)
sample_submission shape: (3, 6)


In [4]:
print('train columns:', train.columns)
print('test columns:', test.columns)
print('sample_submission columns:', sample_submission.columns)


train columns: Index(['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')
test columns: Index(['id', 'SMILES'], dtype='object')
sample_submission columns: Index(['id', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')


In [5]:
train.isnull().sum().sort_values(ascending=False).head(20)


Tg         7462
Density    7360
Rg         7359
Tc         7236
FFV         943
id            0
SMILES        0
dtype: int64

In [6]:
test.isnull().sum().sort_values(ascending=False).head(20)

id        0
SMILES    0
dtype: int64

In [7]:
import selfies as sf

# Define function to safely encode to SELFIES
def smiles_to_selfies(smiles):
    try:
        return sf.encoder(smiles)
    except:
        return None  # or np.nan if you prefer

# Apply to the dataset
train["selfies"] = train["SMILES"].apply(smiles_to_selfies)

# Drop rows where conversion failed
train = train.dropna(subset=["selfies"])

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras.preprocessing.sequence import pad_sequences


train["selfies_tokens"] = train["selfies"].apply(list)
all_tokens = [tok for tokens in train["selfies_tokens"] for tok in tokens]
vocab = sorted(set(all_tokens))
token_to_idx = {token: i + 1 for i, token in enumerate(vocab)}  # +1 to reserve 0 for padding

# Convert tokens to indices
train["selfies_ids"] = train["selfies_tokens"].apply(lambda tokens: [token_to_idx[t] for t in tokens])

# Pad to fixed length
X = pad_sequences(train["selfies_ids"], maxlen=128, padding="post", truncating="post")

y = train['FFV'].values

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"FFV RMSE: {rmse:.4f}")

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
ffv_data