In [1]:
# Datasets to process, keyed by dataset name. Each value is handed to the
# setup-file loaders as `date_tag` by the cells that iterate over this mapping.
# The previous note on this cell also named 'clintox' as a selectable dataset;
# it has no entry here.
dataset_info = dict(
    Lipophilicity='1225',
    esol='1225',
    Mutagenicity='1225',
    hERG='1225',
    BBBP='1225',
)
# NOTE(review): `least_count` is not referenced by any cell visible in this
# notebook view — confirm whether it is still needed.
least_count = 15

In [4]:
import pandas as pd
from DataLoader import get_setup_files_with_folds
import numpy as np
import math
def _mean_and_std(values):
    """Return ``(mean, population standard deviation)`` of ``values``.

    The standard deviation divides by ``len(values)`` (not ``len - 1``).
    An empty input yields ``(nan, nan)`` instead of raising
    ``ZeroDivisionError``, so a single split without motifs cannot abort the
    whole statistics run.
    """
    values = list(values)
    if not values:
        return math.nan, math.nan
    mean = sum(values) / len(values)
    variance = sum((x - mean) ** 2 for x in values) / len(values)
    return mean, math.sqrt(variance)


# One record per (dataset, algorithm, fold); converted to a DataFrame once at the end.
results = []

for dataset_name, date_tag in dataset_info.items():
    for algo in ['RBRICS', 'MGSSL']:
        for fold in range(5):
            (
                lookup,
                motif_list,
                motif_counts,
                motif_lengths,
                motif_class_count,
                graph_to_motifs,
                test_data_lookup,
                test_graph_to_motifs,
                train_mask_data,
                val_mask_data,
                test_mask_data,
            ) = get_setup_files_with_folds(dataset_name, date_tag, fold, algo)

            # Mean / spread of the values stored in motif_lengths and motif_counts.
            mean_motif_length, std_dev_length = _mean_and_std(motif_lengths.values())
            mean_motif_freq, std_dev_motif_freq = _mean_and_std(motif_counts.values())

            # NOTE(review): the two "TotalUniqueMotifs" figures add up every element
            # stored under each graph. Whether that is a count of unique motifs
            # depends on what graph_to_motifs holds — confirm against DataLoader.
            total_unique_motifs_train_val = sum(
                val for motifs in graph_to_motifs.values() for val in motifs
            )
            total_unique_motifs_test = sum(
                val for motifs in test_graph_to_motifs.values() for val in motifs
            )

            results.append({
                'Dataset': dataset_name,
                'DateTag': date_tag,
                'Algorithm': algo,
                'Fold': fold,
                'TotalGraphs_TrainVal': len(lookup),
                'VocabSize': len(motif_list),
                'TotalGraphs_Test': len(test_data_lookup),
                'GraphsWithMotifs_TrainVal': len(graph_to_motifs),
                'TotalUniqueMotifs_TrainVal': total_unique_motifs_train_val,
                'GraphsWithMotifs_Test': len(test_graph_to_motifs),
                'TotalUniqueMotifs_Test': total_unique_motifs_test,
                'HighestVocabSize': len(motif_counts),
                'AllPossibleOccurrences': sum(motif_counts.values()),
                'Mean Length': mean_motif_length,
                'Std length': std_dev_length,
                'Mean Frequency': mean_motif_freq,
                'Std Frequency': std_dev_motif_freq,
            })

# Create a DataFrame from the results
df = pd.DataFrame(results)

# Display the DataFrame
print(df)


          Dataset DateTag Algorithm  Fold  TotalGraphs_TrainVal  VocabSize   
0   Lipophilicity    1225    RBRICS     0                  3780        219  \
1   Lipophilicity    1225    RBRICS     1                  3780        223   
2   Lipophilicity    1225    RBRICS     2                  3780        223   
3   Lipophilicity    1225    RBRICS     3                  3780        219   
4   Lipophilicity    1225    RBRICS     4                  3780        220   
5   Lipophilicity    1225     MGSSL     0                  3780         77   
6   Lipophilicity    1225     MGSSL     1                  3780         75   
7   Lipophilicity    1225     MGSSL     2                  3780         77   
8   Lipophilicity    1225     MGSSL     3                  3780         79   
9   Lipophilicity    1225     MGSSL     4                  3780         77   
10           esol    1225    RBRICS     0                  1014         77   
11           esol    1225    RBRICS     1                  1014 

In [3]:
# NOTE(review): blocks until a line is entered on stdin; the entered text is only
# echoed as the cell output (an empty string in the recorded run). This looks like
# a manual pause left over from interactive use — confirm it is still wanted,
# since it stalls an unattended "Run All".
input()

 


''

In [7]:
# Show the column labels of `df` (used to pick the columns for the grouped summary).
df.columns

Index(['Dataset', 'DateTag', 'Algorithm', 'Fold', 'TotalGraphs_TrainVal',
       'VocabSize', 'TotalGraphs_Test', 'GraphsWithMotifs_TrainVal',
       'TotalUniqueMotifs_TrainVal', 'GraphsWithMotifs_Test',
       'TotalUniqueMotifs_Test', 'HighestVocabSize', 'AllPossibleOccurrences',
       'Mean Length', 'Std length', 'Mean Frequency', 'Std Frequency'],
      dtype='object')

In [16]:
# Columns carried into the summary frame. The two Std columns and 'Fold' are
# selected here but are not part of the aggregation below.
selected_columns = [
    'Dataset',
    'Algorithm',
    'Fold',
    'VocabSize',
    'Mean Length',
    'Std length',
    'Mean Frequency',
    'Std Frequency',
]
df_selected = df.loc[:, selected_columns]

# Average the per-fold rows for every (Dataset, Algorithm) pair.
grouped = (
    df_selected
    .groupby(['Dataset', 'Algorithm'])[['VocabSize', 'Mean Length', 'Mean Frequency']]
    .mean()
)

# Turn the (Dataset, Algorithm) group keys back into ordinary columns.
result = grouped.reset_index()

print(result)

         Dataset Algorithm  VocabSize  Mean Length  Mean Frequency
0           BBBP     MGSSL       41.2     4.388966       18.110118
1           BBBP    RBRICS      173.8     4.784498        7.315293
2  Lipophilicity     MGSSL       77.0     7.015723       18.242272
3  Lipophilicity    RBRICS      220.8     5.774342       12.760917
4   Mutagenicity     MGSSL      197.6     7.416999       10.380603
5   Mutagenicity    RBRICS      556.4     6.398680        6.510290
6           esol     MGSSL       37.4     4.614762        7.163092
7           esol    RBRICS       75.8     4.131801        3.750084
8           hERG     MGSSL      163.6     8.869530       24.963594
9           hERG    RBRICS      504.8     6.302284       19.237789


In [17]:
# Write the grouped summary held in `result` to CSV, omitting the row index.
result.to_csv('dataset_statistics_grouped.csv', index=False)

In [5]:
# Write the statistics frame held in `df` to CSV, omitting the row index.
df.to_csv('dataset_statistics.csv', index=False)

In [5]:
import pandas as pd

# Headers of the per-dataset summary table (one row per dataset, no fold or
# algorithm dimension).
columns = [
    "Dataset Name",
    "Date Tag",
    "Total Graphs in Train Val",
    "Vocab Size",
    "Total Graphs in Test",
    "Graphs With Motifs in Train Val",
    "Total Unique Motifs in Train Val",
    "Graphs With Motifs in Test",
    "Total Unique Motifs in Test",
    "Highest Vocab Size",
    "All Possible Occurrences",
]

# NOTE(review): `get_setup_files` is not imported by any cell visible in this
# notebook view, and the recorded run of this cell ended in
# "NameError: name 'get_setup_files' is not defined" — confirm where it lives.
# NOTE(review): this cell rebinds `df`, replacing whatever frame an earlier cell
# stored under that name.
data = []
for dataset_name, date_tag in dataset_info.items():
    # Load the setup objects for this dataset.
    (
        lookup,
        motif_list,
        motif_counts,
        motif_class_count,
        graph_to_motifs,
        test_data_lookup,
        test_graph_to_motifs,
    ) = get_setup_files(dataset_name, date_tag)

    # One table row, in the same order as `columns`.
    data.append([
        dataset_name,
        date_tag,
        len(lookup),
        len(motif_list),
        len(test_data_lookup),
        len(graph_to_motifs),
        sum(val for motifs in graph_to_motifs.values() for val in motifs),
        len(test_graph_to_motifs),
        sum(val for motifs in test_graph_to_motifs.values() for val in motifs),
        len(motif_counts),
        sum(motif_counts.values()),
    ])

# Assemble the table and show it.
df = pd.DataFrame(data, columns=columns)
print(df)


NameError: name 'get_setup_files' is not defined