In [1]:
import pandas as pd

def analyze_columns_for_binary_target(df, column_names_list, target_column):
    """
    Iterates through a list of column names, performs analysis:
    1. Deletes rows with zeros in the considered column.
    2. Counts the non-zero values based on a binary target column (0 and 1).
    3. Calculates the deviation between the counts for target 1 and 0.
    4. Creates a table with column name, counts for each target, and deviation.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_names_list (list): A list of column names to analyze.
        target_column (str): The name of the binary target column (containing 0 and 1).

    Returns:
        pd.DataFrame: A table with columns 'Analyzed_Column', 'Count_Target_1',
                      'Count_Target_0', and 'Deviation_1_vs_0'.
                      Returns an empty DataFrame if no valid columns are found.
    """
    results_list = []
    for analyzed_column in column_names_list:
        if analyzed_column not in df.columns:
            print(f"Warning: Column '{analyzed_column}' not found in the DataFrame. Skipping.")
            continue

        if target_column not in df.columns:
            print(f"Warning: Target column '{target_column}' not found in the DataFrame. Skipping analysis for '{analyzed_column}'.")
            continue

        # Create a copy to avoid modifying the original DataFrame
        df_analyzed = df.copy()

        # Delete rows where the analyzed column has a zero value
        df_analyzed = df_analyzed[df_analyzed[analyzed_column] != 0]

        # Count values for target 1 and 0
        counts = df_analyzed.groupby(target_column)[analyzed_column].count()
        count_target_1 = counts.get(1, 0)  # Get count for class 1, default to 0 if not present
        count_target_0 = counts.get(0, 0)  # Get count for class 0, default to 0 if not present

        # Calculate the deviation
        deviation = count_target_1 - count_target_0

        # Calculate the percentage of target 1
        percentage = count_target_1/(count_target_1+count_target_0)

        # Calculate the porportion between target 1 and 0
        proportion_1_0 = count_target_1 / count_target_0

        # Create a row for the results table
        results_list.append({
            'Analyzed_Column': analyzed_column,
            'Count_target_1': count_target_1,
            'Count_target_0': count_target_0,
            'Deviation_1_vs_0': deviation, 
            'Percentage_taget_1': percentage,
            'Proportion_between_1_and_0': proportion_1_0
        })

    if results_list:
        results_df = pd.DataFrame(results_list)
        # Sort the DataFrame in descendent order based on the Relative_porportion column
        results_df_sorted = results_df.sort_values(by='Proportion_between_1_and_0', ascending=False)
        return results_df_sorted
    else:
        return pd.DataFrame(columns=['Analyzed_Column', 'Count_Target_1',
                                     'Count_Target_0', 'Deviation_1_vs_0',
                                     'Percentage_taget_1', 'Proportion_between_1_and_0'])

In [2]:
# Upload the dataset 
df = pd.read_csv('data_IUPACs.csv', index_col=[0])

# Create the list with first 24 functional groups selected as the most important features for the RFC 
columns_to_analyze = ['phenyl',
                    'thiourea',
                    'chlorophenyl',
                    'naphthalen',
                    'pyridin', 
                    'bromo',
                    'carbohydrazide',
                    'ethyl',
                    'methylidene',        
                    'trifluoromethyl',   
                    'benzothiazol'
                    'amino',
                    'bromophenyl',
                    'methylphenyl',
                    'chloro',
                    'phenol',
                    'quinolin',           
                    'dimethylphenyl',    
                    'pyridine',        
                    'pyrimidin',
                    'methyl',
                    'piperazin',
                    'dichlorophenyl',
                    'sulfanyl',
                    'sulfamoyl',
                    'carbamothioyl', 
                    'trichloro']

target_column_name = 'target'

analysis_table = analyze_columns_for_binary_target(df, columns_to_analyze, target_column_name)

print(analysis_table)

    Analyzed_Column  Count_target_1  Count_target_0  Deviation_1_vs_0  \
22        sulfamoyl              69              13                56   
24        trichloro              42              12                30   
13           phenol             126              55                71   
10      bromophenyl             195              88               107   
3        naphthalen             159              83                76   
23    carbamothioyl              85              46                39   
5             bromo             254             150               104   
20   dichlorophenyl             231             145                86   
19        piperazin             235             162                73   
14         quinolin             180             127                53   
8       methylidene             315             226                89   
9   trifluoromethyl             454             340               114   
15   dimethylphenyl             238             183

In [3]:
# imidazo 
def find_word_in_column_names(df, search_word):
    """
    Looks for a specific word within the column names of a Pandas DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        search_word (str): The word to search for.

    Returns:
        list: A list of column names that contain the search word.
              Returns an empty list if the word is not found.
    """
    matching_columns = []
    for col_name in df.columns:
        if search_word in col_name:
            matching_columns.append(col_name)
    return matching_columns

word_to_find = 'sulfamoyl'
found_columns = find_word_in_column_names(df, word_to_find)

if found_columns:
    print(f"Columns containing the word '{word_to_find}':")
    for col in found_columns:
        print(col)
else:
    print(f"The word '{word_to_find}' was not found in any column names.")

Columns containing the word 'sulfamoyl':
benzylsulfamoyl
butylsulfamoyl
cyclohexylsulfamoyl
cyclopentylsulfamoyl
cyclopropylsulfamoyl
dibutylsulfamoyl
diethylsulfamoyl
dimethylsulfamoyl
dimethylsulfamoylamino
dipropylsulfamoyl
enylsulfamoyl
ethylsulfamoyl
methoxyethylsulfamoyl
methylsulfamoyl
phenylethylsulfamoyl
phenylsulfamoyl
propylsulfamoyl
sulfamoyl
sulfamoylanilino
sulfamoylbenzoate
sulfamoylbenzoicacid
sulfamoylphenyl
ylethylsulfamoyl
ylmethylsulfamoyl
ylsulfamoyl


In [5]:
# Select the data frame 
word_to_find = "sulfamoyl"
# filtered_df = filter_dataframe_columns_by_word(df, word_to_find)
df_filtered = df.filter(like=word_to_find, axis=1)

df_filtered.head()

Unnamed: 0,benzylsulfamoyl,butylsulfamoyl,cyclohexylsulfamoyl,cyclopentylsulfamoyl,cyclopropylsulfamoyl,dibutylsulfamoyl,diethylsulfamoyl,dimethylsulfamoyl,dimethylsulfamoylamino,dipropylsulfamoyl,...,phenylsulfamoyl,propylsulfamoyl,sulfamoyl,sulfamoylanilino,sulfamoylbenzoate,sulfamoylbenzoicacid,sulfamoylphenyl,ylethylsulfamoyl,ylmethylsulfamoyl,ylsulfamoyl
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Instantite df with lables 
df_targets = df['target']
df_targets.shape

(19499,)

In [8]:
# Unite the filter df with the lables 
df = pd.concat([df_targets, df_filtered], axis=1)
df.shape

(19499, 26)

In [9]:
# Values of Active and Inactive compounds with a functional group containing imidazo
df['target'].value_counts()

target
0    11593
1     7906
Name: count, dtype: int64