In [None]:
%%file test_generate_descriptors.py
!pip install pytest
!pip install rdkit
import pandas as pd
import numpy as np
import pytest
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator


def generate_descriptors(file_path, smiles_column_name, phase_column_name, status_column_name, disease_column_name):
    """
    Build a table of RDKit descriptors for the unique, parsable molecules of a CSV file.

    Rows are de-duplicated on the (SMILES, disease) pair, rows missing either
    of those two values are dropped, and rows whose SMILES string RDKit cannot
    parse are discarded before descriptors are calculated.

    :param file_path: path to the .csv table.
    :param smiles_column_name: name of the column holding SMILES strings.
    :param phase_column_name: name of the phase column (carried through unchanged).
    :param status_column_name: name of the status column (carried through unchanged).
    :param disease_column_name: name of the column holding disease categories.
    :return: a DataFrame containing an ``index`` column (row label of the record
        in the source table), the four selected input columns, and one column
        per RDKit descriptor, in the order RDKit lists the descriptors.
    """
    selected_columns = [smiles_column_name, phase_column_name, status_column_name, disease_column_name]
    input_data = pd.read_csv(file_path)[selected_columns]

    key_columns = [smiles_column_name, disease_column_name]
    # reset_index() without drop=True keeps the source row label as an
    # ``index`` column, which is part of the returned table.
    unique_data = (
        input_data.drop_duplicates(subset=key_columns)
        .dropna(subset=key_columns)
        .reset_index()
    )

    # Parse every SMILES exactly once: MolFromSmiles returns None for a string
    # it cannot parse, so the same list drives both filtering and calculation.
    molecules = [Chem.MolFromSmiles(smile) for smile in unique_data[smiles_column_name]]

    # A boolean Series (not a plain list) so that an empty table is still
    # treated as a row mask rather than as an empty column selection.
    is_valid = pd.Series(
        [mol is not None for mol in molecules],
        index=unique_data.index,
        dtype=bool,
    )
    # Drop the rows with unparsable SMILES strings.
    verify_data = unique_data[is_valid]

    # De-duplicate RDKit's (name, function) entries while preserving their
    # order; a set() gives no ordering guarantee, which made the descriptor
    # column order arbitrary.
    descriptors = [name for name, _ in dict.fromkeys(Descriptors.descList)]
    calculator = MolecularDescriptorCalculator(descriptors)

    descriptor_rows = [calculator.CalcDescriptors(mol) for mol in molecules if mol is not None]
    dataset_with_descriptors = pd.DataFrame(descriptor_rows, columns=descriptors)

    # Re-number the surviving rows so they align positionally with the
    # freshly built descriptor frame.
    descripted_data = pd.concat([verify_data.reset_index(drop=True), dataset_with_descriptors], axis=1)

    return descripted_data


@pytest.fixture
def sample_data(tmp_path):
    """Write a ten-row CSV of sample records into ``tmp_path`` and return its path.

    The SMILES/Disease columns cover a duplicated pair, missing values and a
    malformed SMILES string; Phase and Status are filler values.
    """
    # Seeded generator so the Phase/Status filler is identical on every run.
    rng = np.random.default_rng(0)

    # Build the CSV content.
    data = {
        'SMILES': [
            'C(CCl)NC(=O)N(CCCl)N=O',
            'C(CCl)NC(=O)N(CCCl)N=O',    # same SMILES and disease as row 0
            None,
            'CC(F)C(C)C',                # has no disease
            None,
            'F/C=C/F',
            'F/C=C\\F',                  # backslash escaped: '\F' is an invalid escape sequence
            ' CC(=O)N[C@@H](CS)C(=O)O',
            'CC(=O)N[C@H](CS)C(=O)O',
            'CC(CC1=CC=C=C1N',           # malformed: unclosed parenthesis
        ],

        'Disease': ['X', 'X', 'Y', None, None, 'cancer', 'cancer', 'hematology', 'hematology', 'parcinson'],

        'Phase': rng.choice([None, 1, 2, 3], 10),
        'Status': rng.choice(['A', 'B', None], 10),
    }
    df = pd.DataFrame(data)
    file_path = tmp_path / "test_data.csv"
    df.to_csv(file_path, index=False)
    return file_path

def test_generate_descriptors(sample_data):
    """Check filtering, de-duplication and output shape of generate_descriptors."""
    descriptors_df = generate_descriptors(sample_data, 'SMILES', 'Phase', 'Status', 'Disease')

    assert not descriptors_df['SMILES'].isnull().any(), "есть пропущенные смайлсы"
    assert not descriptors_df['Disease'].isnull().any(), "есть пропущенные заболевания"

    assert not descriptors_df.duplicated().any()

    # Four selected input columns + the 'index' column produced by
    # reset_index() + one column per unique RDKit descriptor entry.
    expected_columns = 5 + len(set(Chem.Descriptors.descList))
    assert len(descriptors_df.columns) == expected_columns, "размерность выходной таблицы не соответствует ожидаемой"
    expected_rows = 5
    assert descriptors_df.shape[0] == expected_rows

    smiles_values = descriptors_df['SMILES'].values

    # SMILES 0 must be present exactly once (row 1 duplicates it).
    assert descriptors_df['SMILES'].value_counts().get('C(CCl)NC(=O)N(CCCl)N=O', 0) == 1

    # SMILES 3 must be absent.
    assert 'CC(F)C(C)C' not in smiles_values

    # SMILES 5-8 must all be present.
    # The backslash is escaped: '\F' is an invalid escape sequence.
    smiles_5_to_8 = ['F/C=C/F', 'F/C=C\\F', ' CC(=O)N[C@@H](CS)C(=O)O', 'CC(=O)N[C@H](CS)C(=O)O']
    assert all(smile in smiles_values for smile in smiles_5_to_8)

    # SMILES 9 must be absent.
    assert 'CC(CC1=CC=C=C1N' not in smiles_values


Overwriting test_generate_descriptors.py


In [None]:
!python -m pytest test_generate_descriptors.py

platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.4.0
rootdir: /content
plugins: anyio-3.7.1
collected 1 item                                                                                   [0m

test_generate_descriptors.py [32m.[0m[32m                                                               [100%][0m

