In [None]:
%%file test_filter_correlation.py
!pip install pytest
import pandas as pd
import numpy as np
import pytest
from scipy import stats

def filter_correlations(dataframe, max_corr_threshold, mean_corr_threshold, features_to_keep):
    """
    Drop descriptor columns that are strongly correlated with another descriptor.

    Every column that is neither listed in ``features_to_keep`` nor one of the
    metadata columns ('disease category', 'phase', 'status', 'iso_smiles') is
    treated as a descriptor. For each pair of descriptors whose absolute
    correlation exceeds ``max_corr_threshold``, the member with the larger mean
    absolute correlation (taken over its whole column of the correlation
    matrix, self-correlation included) is dropped, but only if that mean also
    exceeds ``mean_corr_threshold``. When both means are equal, the column that
    appears later in the frame is the drop candidate. If the candidate does not
    exceed ``mean_corr_threshold``, nothing is dropped for that pair.

    :param dataframe: table in the format of DataFrame object; must contain the
        four metadata columns and every column named in ``features_to_keep``.
    :param max_corr_threshold: pairwise absolute correlation above which two
        descriptors are considered "strongly" correlated.
    :param mean_corr_threshold: mean absolute correlation a descriptor must
        exceed before it may be dropped.
    :param features_to_keep: iterable (list, tuple or set) of column names that
        are never considered for removal.
    :return: a new DataFrame with the same rows and column order as
        ``dataframe``, minus the dropped descriptor columns.
    :raises KeyError: if a metadata column or a name in ``features_to_keep`` is
        missing from ``dataframe`` (raised by ``DataFrame.drop``).
    """
    metadata_columns = ['disease category', 'phase', 'status', 'iso_smiles']
    # list(...) accepts sets and tuples as well as lists.
    excluded_columns = list(features_to_keep) + metadata_columns
    descriptors = dataframe.drop(excluded_columns, axis=1)

    corr_matrix = descriptors.corr().abs()
    column_names = corr_matrix.columns
    corr_values = corr_matrix.to_numpy()
    # Per-column mean is loop-invariant, so compute it once (NaNs are skipped).
    mean_corr = corr_matrix.mean(axis=0).to_numpy()

    correlated_descriptors = []
    already_marked = set()

    for i in range(1, len(column_names)):
        for j in range(i):
            # A NaN correlation compares False and is therefore skipped.
            if not corr_values[i, j] > max_corr_threshold:
                continue

            # Pick the member of the pair that is more correlated overall.
            candidate = i if mean_corr[i] >= mean_corr[j] else j

            # Without this guard an index of 0 would be used as a fallback and
            # the first descriptor would be dropped although it is unrelated.
            if not mean_corr[candidate] > mean_corr_threshold:
                continue

            name = column_names[candidate]
            if name not in already_marked:
                already_marked.add(name)
                correlated_descriptors.append(name)

    # Only descriptor columns are ever dropped, so the kept features and the
    # metadata columns are still present in the result.
    return dataframe.drop(correlated_descriptors, axis=1)

@pytest.fixture
def df():
    """Build a four-row frame with five numeric features and four metadata columns.

    'feature1' and 'feature2' each contain one missing value. The categorical
    metadata columns are drawn with ``np.random.choice`` without a seed, so
    their contents differ between runs.
    """
    n_rows = 4

    feature_columns = {
        'feature1': [None, 8, -40,-10],
        'feature2': [1, None, -4, 7],
        'feature3': [0, -11, 11, 2],
        'feature4': [-13, -50, 18, 30],
        'feature5': [5, -19, 0, -4],
    }
    # Draw order (disease category, phase, status) is kept so the random
    # stream is consumed exactly as before.
    metadata_columns = {
        'disease category': np.random.choice(['A', 'B'], n_rows),
        'phase': np.random.choice(['I', 'II', 'III', None], n_rows),
        'status': np.random.choice(['active', 'inactive'], n_rows),
        'iso_smiles': ['COO'] * n_rows,
    }

    return pd.DataFrame({**feature_columns, **metadata_columns})

def test_filter_correlation(df):
    """Check that filtering drops 'feature3' and leaves rows and protected columns intact."""
    max_corr_threshold = 0.8
    mean_corr_threshold = 0.7

    features_to_keep = ['feature1']

    # Apply the function under test.
    filtered_df = filter_correlations(df, max_corr_threshold, mean_corr_threshold, features_to_keep)

    # Filtering removes columns only; the row count must be unchanged.
    assert df.shape[0] == filtered_df.shape[0]
    # Sanity check on the fixture itself: 5 features + 4 metadata columns.
    assert df.shape[1] == 9
    # Missing values are not imputed or removed by the filter.
    assert filtered_df.isnull().values.any()
    assert 'feature3' not in filtered_df

    # Protected features and metadata columns must survive the filtering.
    protected_columns = features_to_keep + ['disease category', 'phase', 'status', 'iso_smiles']
    for column in protected_columns:
        assert column in filtered_df


In [None]:
!python -m pytest test_filter_correlation.py

platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.4.0
rootdir: /content
plugins: anyio-3.7.1
collected 1 item                                                                                   [0m

test_filter_correlation.py [32m.[0m[32m                                                                 [100%][0m

