In [None]:
import pandas as pd
import numpy as np 
from rdkit import Chem
from rdkit.Chem import Descriptors

In [None]:
# Read the three input tables from the local data/ directory:
# the labelled training set, the unlabelled test set, and the submission template.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [None]:
# Report (rows, columns) for every loaded table, one line per table.
for caption, frame in (
    ('train shape:', train),
    ('test shape:', test),
    ('sample_submission shape:', sample_submission),
):
    print(caption, frame.shape)

In [None]:
# List the column labels of every loaded table so the schemas can be compared.
captions = ('train columns:', 'test columns:', 'sample_submission columns:')
frames = (train, test, sample_submission)
for caption, frame in zip(captions, frames):
    print(caption, frame.columns)


In [None]:
# Missing-value count per training column, worst 20 first.
(
    train.isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)


In [None]:
# Missing-value count per test column, worst 20 first.
(
    test.isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Drop only the rows that cannot be featurized (no SMILES string).
# A blanket dropna() would also discard every row that is missing *any* other
# column; a later cell selects the rows where 'FFV' is present, which only makes
# sense if target columns are allowed to be missing at this stage.
# Rebinding (instead of inplace=True) keeps the cell idempotent on re-run, and
# .copy() gives an independent frame that is safe to add columns to later.
train = train.dropna(subset=['SMILES']).copy()
print('train shape after dropping NaNs:', train.shape)

In [None]:


def featurize(smiles):
    """Compute five RDKit descriptors for a single SMILES string.

    Wildcard atoms (``*``) are capped with hydrogen before parsing.

    Parameters
    ----------
    smiles : str
        SMILES string, possibly containing ``*`` wildcard atoms.

    Returns
    -------
    list
        ``[MolWt, NumValenceElectrons, TPSA, MolLogP, NumRotatableBonds]``.
        A list of five ``np.nan`` is returned when ``smiles`` is not a string
        or when RDKit cannot parse the capped SMILES.
    """
    import re  # local import: keeps this cell runnable without editing the import cell

    nan_row = [np.nan] * 5

    # Missing values (None / float NaN) have no .replace(); treat as unparseable.
    if not isinstance(smiles, str):
        return nan_row

    # Hydrogen is only valid in SMILES as a bracket atom, so a bare "*" must
    # become "[H]" (a bare "H" makes the whole string unparseable). A "*" that
    # already sits inside a bracket atom, e.g. "[*]" or "[*:1]", only needs the
    # symbol swapped, exactly as before.
    capped = re.sub(
        r"\[[^\]]*\]|\*",
        lambda m: "[H]" if m.group(0) == "*" else m.group(0).replace("*", "H"),
        smiles,
    )

    mol = Chem.MolFromSmiles(capped)
    if mol is None:
        return nan_row
    return [
        Descriptors.MolWt(mol),
        Descriptors.NumValenceElectrons(mol),
        Descriptors.TPSA(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumRotatableBonds(mol),
    ]

# Featurize every SMILES once and build the descriptor table in a single
# DataFrame construction; wrapping each row in its own pd.Series inside
# .apply() is far slower on large frames. dtype=float keeps all five columns
# as floats (NaN-capable), and index=train.index keeps rows aligned with train.
descriptor_columns = ['MolWt', 'ValenceElectrons', 'TPSA', 'LogP', 'RotBonds']
train[descriptor_columns] = pd.DataFrame(
    train['SMILES'].map(featurize).tolist(),
    index=train.index,
    columns=descriptor_columns,
    dtype=float,
)


In [None]:
# Independent copy of the training rows that have an 'FFV' value.
ffv_data = train.loc[train['FFV'].notna()].copy()

In [None]:
# Show the FFV subset as this cell's output.
ffv_data