In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path


# Result directories of the three generative methods, relative to this notebook.
polygon_path = Path("../../polygon/results/")
mtmol_path = Path("../../MTMol-GPT/results/")
deeplig_path = Path("../../DeepLig/results/")

# Disease name -> the two target identifiers studied for it. The identifiers
# are joined with "_" further down to build activity column names.
diseases = dict(
    schizophrenia=["_5HT2A", "D2R"],
    alzheimer=["AChE", "MAOB"],
    parkinson=["D2R", "D3R"],
)


def _load_sample_activity(results_dir):
    """Read ``<disease>_sample_activity.csv`` from ``results_dir`` for every disease.

    Returns a dict mapping each key of ``diseases`` to the DataFrame read
    from the corresponding CSV file.
    """
    return {
        disease: pd.read_csv(results_dir / f"{disease}_sample_activity.csv")
        for disease in diseases
    }


# One {disease: DataFrame} mapping per method.
dataframes_polygon = _load_sample_activity(polygon_path)
dataframes_mtmol = _load_sample_activity(mtmol_path)
dataframes_deeplig = _load_sample_activity(deeplig_path)

def dataframes_statistics(dataframes):
    """Count missing predictions and repeated SMILES for each disease.

    Parameters
    ----------
    dataframes : dict
        Maps a disease name to a DataFrame that has a ``SMILES`` column and
        at least two columns overall.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First frame: columns ``Disease`` and ``Invalid SMILES``, where the
        count is the number of rows whose *second* column is NaN.
        Second frame: columns ``Disease`` and ``Duplicates``, where the count
        is the number of rows repeating an earlier ``SMILES`` value.
    """
    invalid_rows = []
    duplicate_rows = []

    for disease, frame in dataframes.items():
        # The second column is used as the validity indicator.
        # NOTE(review): presumably it holds a prediction that is left empty
        # when the SMILES could not be processed - confirm against the CSVs.
        prediction_column = frame.columns[1]
        n_invalid = int(frame[prediction_column].isna().sum())

        # Only the 2nd, 3rd, ... occurrence of a SMILES counts as a duplicate.
        n_duplicates = frame.duplicated(subset='SMILES').sum()

        invalid_rows.append({'Disease': disease, 'Invalid SMILES': n_invalid})
        duplicate_rows.append({'Disease': disease, 'Duplicates': n_duplicates})

    return pd.DataFrame(invalid_rows), pd.DataFrame(duplicate_rows)

# Validity / duplicate counts per method (one row per disease in each table).
polygon_invalid, polygon_duplicates = dataframes_statistics(dataframes_polygon)
mtmol_invalid, mtmol_duplicates = dataframes_statistics(dataframes_mtmol)
deeplig_invalid, deeplig_duplicates = dataframes_statistics(dataframes_deeplig)

# Tag each table with its method so the rows stay identifiable once stacked.
polygon_invalid = polygon_invalid.assign(Method='POLYGON')
mtmol_invalid = mtmol_invalid.assign(Method='MTMol-GPT')
deeplig_invalid = deeplig_invalid.assign(Method='DeepLig')
combined_invalid = pd.concat([polygon_invalid, mtmol_invalid, deeplig_invalid])

polygon_duplicates = polygon_duplicates.assign(Method='POLYGON')
mtmol_duplicates = mtmol_duplicates.assign(Method='MTMol-GPT')
deeplig_duplicates = deeplig_duplicates.assign(Method='DeepLig')
combined_duplicates = pd.concat([polygon_duplicates, mtmol_duplicates, deeplig_duplicates])

# Reshape to disease (rows) x method (columns) for side-by-side comparison.
pivot_invalid = combined_invalid.pivot(
    index='Disease', columns='Method', values='Invalid SMILES'
)
pivot_duplicates = combined_duplicates.pivot(
    index='Disease', columns='Method', values='Duplicates'
)

# axis=None makes the colour gradient span the whole table rather than
# being computed separately per row or per column.
display(
    pivot_invalid.style.background_gradient(cmap='YlOrRd', axis=None).format(precision=0),
    pivot_duplicates.style.background_gradient(cmap='YlOrRd', axis=None).format(precision=0),
)

Method,DeepLig,MTMol-GPT,POLYGON
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alzheimer,0,286,0
parkinson,0,261,0
schizophrenia,7,286,0


Method,DeepLig,MTMol-GPT,POLYGON
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alzheimer,9976,5152,5875
parkinson,275,5352,5770
schizophrenia,0,5152,5776


In [3]:
# Maps internal scenario/target identifiers to the labels used for display.
# Both "_5HT2A" and "5HT2A" resolve to the same label.
# NOTE(review): this mapping is not referenced in the cells visible here;
# presumably it is consumed by later plotting cells - confirm before removing.
gen_molecules_labels = {
    "Unconditional": "Unconditional",
    "D2R": "D2R",
    "D3R": "D3R",
    "_5HT2A": "5-HT2AR",
    "5HT2A": "5-HT2AR",
    "D2R__5HT2A_SUM": "D2R + 5-HT2AR (Sum)",
    "AChE": "AChE",
    "MAOB": "MAO-B",
    "AChE_MAOB_SUM": "AChE + MAO-B (Sum)",
    "D2R_D3R_SUM": "D2R + D3R (Sum)",
}

# Suffix of the activity column analysed below: columns are looked up as
# "<target1>_<target2>_<ACT_TYPE>". The value "Activity" switches the summary
# tables to percent formatting; any other value is shown with two decimals.
ACT_TYPE = "pXC50"

def dataframes_pivots(dataframes, scenario):
    """Summarise the combined-target activity of every disease for one method.

    For each disease the column ``"<target1>_<target2>_<ACT_TYPE>"`` (targets
    taken from the module-level ``diseases`` mapping) is reduced to its mean
    and standard deviation.

    Parameters
    ----------
    dataframes : dict
        Maps a disease name (a key of ``diseases``) to its DataFrame.
    scenario : str
        Label of the method; used as the index value of every summary row.

    Returns
    -------
    pivots : pandas.DataFrame
        All per-disease summary rows stacked in iteration order (empty when
        ``dataframes`` is empty). Columns: ``Mean (pXC50)`` and
        ``Std Deviation (pXC50)``.
    pivots_combinations : dict
        Maps each target combination (e.g. ``"AChE_MAOB"``) to its
        single-row summary frame with the same two columns.
    """
    pivots_combinations = dict()

    for disease, df in dataframes.items():
        combination = "_".join(diseases[disease])
        activity_column = f"{combination}_{ACT_TYPE}"
        # assign() returns a new frame, so the caller's data is not modified.
        activities = df.assign(Scenario=scenario)

        # String aggregator names select pandas' own mean/std (sample std,
        # ddof=1). Passing np.mean / np.std is deprecated in pandas >= 2.1 and
        # is slated to call the NumPy functions directly, which would silently
        # change the standard deviation to the population form (ddof=0).
        pivot = activities.pivot_table(
            index='Scenario', values=activity_column, aggfunc=['mean', 'std']
        )
        # Flatten ('mean', <activity column>) -> 'mean' before renaming.
        pivot.columns = pivot.columns.droplevel(1)
        pivot = pivot.rename(
            columns={'mean': 'Mean (pXC50)', 'std': 'Std Deviation (pXC50)'}
        )
        # "Scenario" is constant, so the table has a single row and needs no sorting.

        pivots_combinations[combination] = pivot

    # Concatenate once at the end instead of growing a frame inside the loop.
    if pivots_combinations:
        pivots = pd.concat(list(pivots_combinations.values()))
    else:
        pivots = pd.DataFrame()

    return pivots, pivots_combinations

pivots_polygon, pivots_combinations_polygon = dataframes_pivots(dataframes_polygon, "POLYGON")
pivots_mtmol, pivots_combinations_mtmol = dataframes_pivots(dataframes_mtmol, "MTMol-GPT")
pivots_deeplig, pivots_combinations_deeplig = dataframes_pivots(dataframes_deeplig, "DeepLig")

for combination in pivots_combinations_polygon:
    print(f"Combination: {combination}")
    # Stack the single-row summaries of the three methods into one table.
    pivot = pd.concat(
        [
            per_method[combination]
            for per_method in (
                pivots_combinations_polygon,
                pivots_combinations_mtmol,
                pivots_combinations_deeplig,
            )
        ],
        axis=0,
    )
    # Percent formatting only for the "Activity" type; two decimals otherwise.
    if ACT_TYPE == "Activity":
        format_str = "{:.2%}"
    else:
        format_str = "{:.2f}"
    styled_pivot = pivot.style.format(format_str)
    styled_pivot = styled_pivot.set_caption(f"{combination} {ACT_TYPE} statistics")
    # axis=0 shades each column independently, comparing methods per statistic.
    display(styled_pivot.background_gradient(cmap='Blues', axis=0))

Combination: _5HT2A_D2R


Unnamed: 0_level_0,Mean (pXC50),Std Deviation (pXC50)
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1
POLYGON,6.79,0.17
MTMol-GPT,6.93,0.69
DeepLig,5.92,0.58


Combination: AChE_MAOB


Unnamed: 0_level_0,Mean (pXC50),Std Deviation (pXC50)
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1
POLYGON,6.5,0.3
MTMol-GPT,5.14,0.55
DeepLig,5.88,0.01


Combination: D2R_D3R


Unnamed: 0_level_0,Mean (pXC50),Std Deviation (pXC50)
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1
POLYGON,6.3,0.47
MTMol-GPT,7.15,0.7
DeepLig,4.72,0.09


In [None]:
# Activity-level bins applied to pXC50 values:
#   pXC50 < 6        -> Low
#   6 <= pXC50 < 7   -> Medium
#   7 <= pXC50 < 8   -> High
#   pXC50 >= 8       -> Ultra High
# Left-to-right order of the activity-level columns in the tables below.
column_order = ["Low", "Medium", "High", "Ultra High"]
# Alternative three-bin scheme (disabled) that merges the two top bins:
# column_order = ["Low", "Medium", "High/Ultra High"]

def activity_level(pXC50):
    """Return the activity-level label for a single pXC50 value.

    Bins: ``< 6`` -> "Low", ``[6, 7)`` -> "Medium", ``[7, 8)`` -> "High",
    ``>= 8`` -> "Ultra High".

    Parameters
    ----------
    pXC50 : float
        Scalar activity value; may be missing (NaN or None).

    Returns
    -------
    str or None
        The bin label, or ``None`` when the value is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # "Ultra High".
    if pd.isna(pXC50):
        return None
    if pXC50 < 6:
        return "Low"
    if pXC50 < 7:
        return "Medium"
    if pXC50 < 8:
        return "High"
    # To merge the two top bins, return "High/Ultra High" for both of the
    # last two cases and switch column_order to its three-bin variant.
    return "Ultra High"


def activity_groups_percentage(dataframes, scenario):
    """Compute the share of molecules in each activity level, per disease.

    For every disease the column ``"<target1>_<target2>_<ACT_TYPE>"`` (targets
    taken from the module-level ``diseases`` mapping) is binned with
    ``activity_level`` and the non-missing values are counted per bin.

    Parameters
    ----------
    dataframes : dict
        Maps a disease name (a key of ``diseases``) to its DataFrame.
    scenario : str
        Label of the method; becomes the index value of each result row.

    Returns
    -------
    dict
        Maps each target combination to a one-row DataFrame indexed by
        ``scenario`` whose columns follow ``column_order`` and hold the
        fraction (0-1) of counted molecules in that level.
    """
    fractions_by_combination = {}

    for disease, frame in dataframes.items():
        combination = "_".join(diseases[disease])
        activity_column = f"{combination}_{ACT_TYPE}"

        # Work on a derived frame so the caller's data stays untouched.
        labelled = frame.assign(Scenario=scenario).reset_index(drop=True)
        labelled["Activity Level"] = labelled[activity_column].apply(activity_level)

        # 'count' ignores missing activity values, so those rows do not
        # contribute to any level or to the denominator.
        counts = labelled.pivot_table(
            index='Scenario',
            columns='Activity Level',
            values=activity_column,
            aggfunc='count',
        )
        counts = counts.fillna(0).astype(int)

        # Row-wise normalisation: each row sums to 1.
        fractions = counts.div(counts.sum(axis=1), axis=0)

        # Fixed column order; levels that never occurred appear as 0.0.
        fractions_by_combination[combination] = fractions.reindex(
            columns=column_order, fill_value=0.0
        )

    return fractions_by_combination

# NOTE(review): only POLYGON and MTMol-GPT are compared here; DeepLig, which
# the earlier cells include, is not part of this table.
acitivity_groups_polygon = activity_groups_percentage(dataframes_polygon, "POLYGON")
acitivity_groups_mtmol = activity_groups_percentage(dataframes_mtmol, "MTMol-GPT")

for combination in acitivity_groups_polygon:
    print(f"Combination: {combination}")
    # One row per method, one column per activity level.
    pivot = pd.concat(
        [
            per_method[combination]
            for per_method in (acitivity_groups_polygon, acitivity_groups_mtmol)
        ],
        axis=0,
    )
    styled_pivot = pivot.style.format("{:.1%}")
    styled_pivot = styled_pivot.set_caption(f"{combination} activity level statistics")
    # axis=1 shades each row independently, showing where a method's molecules concentrate.
    display(styled_pivot.background_gradient(cmap='Blues', axis=1))


Combination: _5HT2A_D2R


Activity Level,Low,Medium,High,Ultra High
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
POLYGON,0.0%,89.6%,10.4%,0.0%
MTMol-GPT,7.5%,49.8%,34.3%,8.3%


Combination: AChE_MAOB


Activity Level,Low,Medium,High,Ultra High
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
POLYGON,5.6%,91.9%,2.5%,0.0%
MTMol-GPT,95.5%,4.5%,0.0%,0.0%


Combination: D2R_D3R


Activity Level,Low,Medium,High,Ultra High
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
POLYGON,29.0%,57.9%,13.0%,0.0%
MTMol-GPT,5.1%,36.5%,46.1%,12.2%
