In [None]:
%%file test_filter_by_variance.py
!pip install pytest
import pandas as pd
import numpy as np
import pytest
from sklearn import preprocessing


def filter_by_variance(dataframe, var_threshold, features_to_keep):
    """
    Remove descriptor columns whose variance after min-max scaling is too low.

    Every column that is neither listed in ``features_to_keep`` nor one of the
    metadata columns ('iso_smiles', 'phase', 'status', 'disease category') is
    treated as a descriptor. Descriptors are scaled with ``MinMaxScaler`` and
    those whose scaled variance is strictly below ``var_threshold`` are
    removed. Descriptors whose variance evaluates to NaN are retained, because
    the ``<`` comparison is False for NaN.

    :param dataframe: table in the format of DataFrame object; it must contain
        the four metadata columns and every column named in features_to_keep.
    :param var_threshold: the minimal level of tolerable feature variance,
        measured on the min-max-scaled values.
    :param features_to_keep: list of column names of the features to keep
        regardless of their variance (any iterable of names is accepted).
    :return: a new DataFrame with the same rows and index as the input. Columns
        are ordered as: retained descriptors (unscaled, original order), then
        features_to_keep, then 'iso_smiles', 'phase', 'status',
        'disease category'.
    :raises KeyError: if a metadata column or a column from features_to_keep
        is missing from dataframe.
    """
    # Listed in the order in which they appear at the end of the result.
    metadata_columns = ['iso_smiles', 'phase', 'status', 'disease category']

    # Unpacking accepts tuples and other iterables, not only lists;
    # dict.fromkeys removes repeated names while preserving their order.
    protected_columns = list(dict.fromkeys([*features_to_keep, *metadata_columns]))

    descriptors = dataframe.drop(columns=protected_columns)

    if descriptors.empty:
        # No descriptor columns or no rows: the scaler cannot be fitted and
        # there is nothing to filter out.
        retained = list(descriptors.columns)
    else:
        scaler = preprocessing.MinMaxScaler()
        scaled = pd.DataFrame(
            scaler.fit_transform(descriptors), columns=descriptors.columns
        )
        low_variance = (scaled.var() < var_threshold).to_numpy()
        retained = list(descriptors.columns[~low_variance])

    # Selecting columns straight from the input keeps its index and row order
    # and, unlike an index-based join, also works when index labels repeat.
    return dataframe[retained + protected_columns].copy()


@pytest.fixture
def df():
    """Build a small DataFrame with five numeric features and four metadata columns."""
    # A seeded generator keeps the sampled metadata identical between runs,
    # so a failing test can be reproduced.
    rng = np.random.default_rng(0)

    # Create the DataFrame used by the test
    data = {
        'feature1': [1, 10, 40, -10],
        'feature2': [1, 5, 10, 10],
        'feature3': [1, 1, 1, 0],
        'feature4': [5, 19, 18, 3],
        'feature5': [5, 19, None, 3],
        'disease category': rng.choice(['A', 'B'], 4),
        'phase': rng.choice(['I', 'II', 'III', None], 4),
        'status': rng.choice(['active', 'inactive'], 4),
        'iso_smiles': ['COO'] * 4,
    }
    return pd.DataFrame(data)

def test_filter_by_variance(df):
    """Check which columns filter_by_variance keeps for the fixture data."""
    var_threshold = 0.25
    features_to_keep = ['feature1']
    pass_filter = ['feature3', 'feature4', 'feature5']

    filtered_df = filter_by_variance(df, var_threshold, features_to_keep)

    for column in pass_filter:
        # Iterating over the expected names makes a missing column fail here
        # instead of being skipped silently.
        assert column in filtered_df.columns
        # The function filters on min-max-scaled variance, so the check uses
        # the same scale rather than the raw values.
        values = filtered_df[column]
        scaled = (values - values.min()) / (values.max() - values.min())
        assert scaled.var() >= var_threshold

    expected_columns = 8
    assert len(filtered_df.columns) == expected_columns, "размерность выходной таблицы не соответствует ожидаемой"
    expected_rows = 4
    assert filtered_df.shape[0] == expected_rows

    # 'feature5' contains a missing value that must survive the filtering.
    assert filtered_df.isnull().values.any()
    assert 'feature2' not in filtered_df.columns


Overwriting test_filter_by_variance.py


In [None]:
!python -m pytest test_filter_by_variance.py

platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.4.0
rootdir: /content
plugins: anyio-3.7.1
collected 1 item

test_filter_by_variance.py .                                                                 [100%]

