#### Imports

In [None]:
from glob import glob
from itertools import product
import pandas as pd
import numpy as np
from polyphys.manage import organizer
from polyphys.manage.parser import \
    SumRuleCyl, TransFociCyl, TransFociCub, HnsCub
from polyphys.analyze import measurer
import polyphys.api as api
from polyphys.probe import logger
import warnings
warnings.filterwarnings('ignore')

# Choose these two before running this notebook.
# Other project keys defined below: 'HnsCub', 'SumRuleCyl', 'TransFociCyl'.
project = 'TransFociCub'
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'


def _equil_properties(props, stats=('mean', 'var', 'sem')):
    """Return the '<prop>-<stat>' names, grouped property by property."""
    return [prop + '-' + stat for prop in props for stat in stats]


# Per-project settings (the "project hierarchy") that the cells below look up
# through `project_details[project][...]`.
# NOTE(review): 'TransFociCub' and 'HnsCub' request only `np.mean` in
# 'equil_measures', yet their 'equil_properties' also list '-var' and '-sem'
# names -- confirm this is what the `api` functions expect.
project_details = {
    'SumRuleCyl': dict(
        group='bug',
        geometry='cylindrical',
        topology='linear',
        parser=SumRuleCyl,
        space_pat='N*D*ac*',
        hierarchy='N*',
        space_hierarchy='N*',
        attributes=[
            'space', 'ensemble_long', 'ensemble', 'nmon', 'dcyl', 'dcrowd',
            'phi_c_bulk',
        ],
        time_varying_props=[
            'asphericityTMon', 'fsdTMon', 'gyrTMon', 'rfloryTMon',
            'shapeTMon', 'transSizeTMon',
        ],
        equil_measures=[np.mean, np.var, measurer.sem],
        equil_attributes=[
            'space', 'ensemble_long', 'ensemble', 'nmon', 'dcyl', 'dcrowd',
            'phi_c_bulk', 'phi_c_bulk_round',
        ],
        equil_properties=_equil_properties([
            'asphericityMon', 'fsdMon', 'gyrMon', 'rfloryMon', 'shapeMon',
            'transSizeMon',
        ]),
        rhosPhisNormalizedScaled=[('Mon', 'dmon'), ('Crd', 'dcrowd')],
    ),
    'TransFociCyl': dict(
        group='bug',
        geometry='cylindrical',
        topology='ring',
        parser=TransFociCyl,
        space_pat='ns*nl*al*D*ac*',
        hierarchy='eps*',
        space_hierarchy='ns*',
        attributes=[
            'space', 'ensemble_long', 'ensemble', 'nmon_small', 'nmon_large',
            'dmon_large', 'dcyl', 'dcrowd', 'phi_c_bulk',
        ],
        time_varying_props=[
            'asphericityTMon', 'fsdTMon', 'gyrTMon', 'shapeTMon',
        ],
        equil_measures=[np.mean, np.var, measurer.sem],
        equil_attributes=[
            'ensemble_long', 'ensemble', 'space', 'dcyl', 'dmon_large',
            'nmon_large', 'nmon_small', 'dcrowd', 'phi_c_bulk',
            'phi_c_bulk_round',
        ],
        equil_properties=_equil_properties([
            'asphericityMon', 'fsdMon', 'gyrMon', 'shapeMon',
        ]),
        rhosPhisNormalizedScaled=[
            ('Mon', 'dmon_small'), ('Crd', 'dcrowd'), ('Foci', 'dmon_large'),
        ],
    ),
    'TransFociCub': dict(
        group='bug',
        geometry='cubic',
        topology='ring',
        parser=TransFociCub,
        space_pat='ns*nl*al*ac*',
        hierarchy='al*',
        space_hierarchy='ns*',
        attributes=[
            'space', 'ensemble_long', 'ensemble', 'nmon_small', 'nmon_large',
            'dmon_large', 'dcrowd', 'phi_c_bulk',
        ],
        time_varying_props=['asphericityTMon', 'gyrTMon', 'shapeTMon'],
        equil_measures=[np.mean],
        equil_attributes=[
            'ensemble_long', 'ensemble', 'space', 'dmon_large', 'nmon_large',
            'nmon_small', 'dcrowd', 'phi_c_bulk', 'phi_c_bulk_round',
        ],
        equil_properties=_equil_properties([
            'asphericityMon', 'gyrMon', 'shapeMon',
        ]),
        rhosPhisNormalizedScaled=[
            ('Mon', 'dmon_small'), ('Crd', 'dcrowd'), ('Foci', 'dmon_large'),
        ],
    ),
    'HnsCub': dict(
        group='nucleoid',
        geometry='cubic',
        topology='ring',
        parser=HnsCub,
        space_pat='N*epshm*nh*ac*',
        hierarchy='N*',
        space_hierarchy='N*',
        attributes=[
            'space', 'ensemble_long', 'ensemble', 'eps_hm', 'nmon', 'nhns',
            'dcrowd', 'phi_c_bulk',
        ],
        time_varying_props=['asphericityTMon', 'gyrTMon', 'shapeTMon'],
        equil_measures=[np.mean],
        equil_attributes=[
            'ensemble_long', 'ensemble', 'space', 'eps_hm', 'nmon', 'nhns',
            'dcrowd', 'phi_c_bulk', 'phi_c_bulk_round',
        ],
        equil_properties=_equil_properties([
            'asphericityMon', 'gyrMon', 'shapeMon',
        ]),
        rhosPhisNormalizedScaled=[
            ('Mon', 'dmon'), ('Crd', 'dcrowd'), ('Hns', 'dhns'),
        ],
    ),
}

# allInOne *whole* and *ensAvg* stamps per project

## ensemble-averaged stamps per project (ensAvg phase)

In [None]:
# Read the stamps CSV of every '<space>-<group>-ensAvg' directory and write
# them, stacked row-wise, to one per-project CSV.
ens_avg_suffix = project_details[project]['group'] + '-ensAvg'
space_dbs = glob(analysis_db + project_details[project]['space_pat'])
ens_avg_space_dbs = [
    db + "/" for db in space_dbs if db.endswith(ens_avg_suffix)
]
stamps_pattern = project_details[project]['space_hierarchy'] + 'stamps*'
allInOne_stamps = [
    # Only the first file matching the stamps pattern is read per space.
    pd.read_csv(glob(space_db + "/" + stamps_pattern + '.csv')[0])
    for space_db in ens_avg_space_dbs
]
allInOne_stamps = pd.concat(allInOne_stamps, axis=0).reset_index(drop=True)
output = "".join([analysis_db, "allInOne-", project, "-stamps-ensAvg.csv"])
allInOne_stamps.to_csv(output, index=False)

## whole stamps per project (ens phase)

In [None]:
# Read the stamps CSV of every '<space>-<group>-ens' directory and write them,
# stacked row-wise, to one per-project CSV.
ens_suffix = project_details[project]['group'] + '-ens'
space_dbs = glob(analysis_db + project_details[project]['space_pat'])
# Despite its name, this list holds the '-ens' directories in this cell.
ens_avg_space_dbs = [
    db + "/" for db in space_dbs if db.endswith(ens_suffix)
]
stamps_pattern = project_details[project]['space_hierarchy'] + 'stamps*'
allInOne_stamps = [
    # Only the first file matching the stamps pattern is read per space.
    pd.read_csv(glob(space_db + "/" + stamps_pattern + '.csv')[0])
    for space_db in ens_avg_space_dbs
]
allInOne_stamps = pd.concat(allInOne_stamps, axis=0).reset_index(drop=True)
output = "".join([analysis_db, "allInOne-", project, "-stamps-ens.csv"])
allInOne_stamps.to_csv(output, index=False)

## Logs per project

In [None]:
# Per-project settings for locating and parsing the LAMMPS log files.
# 'hierarchy' is the "/<file pattern>" appended to each space directory;
# 'product_idx' is forwarded to `logger.LammpsLog`.
log_details = {
    'SumRuleCyl': {
        'space_pat': 'N*D*ac*',
        'hierarchy': '/N*.log',  # dir/file
        'parser': SumRuleCyl,
        'group': 'bug',
        'geometry': 'cylindrical',
        'topology': 'linear',
        'product_idx': 1
    },
    'TransFociCyl': {
        'space_pat': 'ns*nl*al*D*ac*',
        'hierarchy': '/eps*.log',  # dir/file
        'parser': TransFociCyl,
        'group': 'bug',
        'geometry': 'cylindrical',
        'topology': 'ring',
        'product_idx': 2
    },
    'TransFociCub': {
        'space_pat': 'ns*nl*al*ac*',
        'hierarchy': '/al*.log',  # dir/file
        'parser': TransFociCub,
        'group': 'bug',
        'geometry': 'cubic',
        'topology': 'ring',
        'product_idx': 2
    },
    # Keyed by the same project name used everywhere else in this notebook,
    # so that `log_details[project]` works when `project = 'HnsCub'`.
    'HnsCub': {
        'space_pat': 'N*epshm*nh*ac*',
        'hierarchy': '/N*.log',  # dir/file
        'parser': HnsCub,
        'group': 'nucleoid',
        'geometry': 'cubic',
        'topology': 'ring',
        'product_idx': 2
    }
}
# Former key of the HnsCub entry, kept as an alias for backward compatibility.
log_details['HnsCubWhole'] = log_details['HnsCub']

# Alternative location of the logs:
# "/Users/amirhsi_mini/OneDrive - University of Waterloo/PhD Research/Jupyter/Datasets/logs/"
# NOTE(review): this path is specific to 'TransFociCub'; update it by hand
# when `project` is changed.
log_db = '/Users/amirhsi_mini/research_data/TransFociCub/logs/'
space_dbs = glob(log_db + log_details[project]['space_pat'] + '-logs')
space_dbs.sort()
# Space directories that are parsed with the 'segment' lineage; every other
# space is parsed with the 'whole' lineage.
space_with_segment_lineage = [
    'N500D10.0ac0.6-logs',
    'N500D10.0ac0.8-logs',
    'N500D10.0ac1.0-logs',
    'N2000D30.0ac4.0-logs',
    'N2000D30.0ac6.0-logs'
]

In [None]:
# Parse every LAMMPS log of every space directory, tag the extracted tables
# with the attributes inferred from the log filename, and write one per-project
# "allInOne" file for each of the thermo, run-stat, and wall-time tables.
thermos = []
run_stats = []
wall_times = []
save_to = './'
# Bulk attributes copied from the parsed filename onto the thermo table only.
bulk_attr_names = ['phi_m_bulk', 'rho_m_bulk', 'phi_c_bulk', 'rho_c_bulk']

for space_db in space_dbs:
    print(space_db)
    space = space_db.split("/")[-1]
    lineage = 'segment' if space in space_with_segment_lineage else 'whole'
    logs = glob(space_db + log_details[project]['hierarchy'])
    logs = organizer.sort_filenames(logs, fmts=['.log'])  # sorted
    logs = [log[0] for log in logs]
    for log in logs:
        log_info = log_details[project]['parser'](
            log,
            lineage,
            log_details[project]['geometry'],
            log_details[project]['group'],
            log_details[project]['topology']
        )
        product_idx = log_details[project]['product_idx']
        # Segments after the first one are read with index 0 instead of the
        # project default.
        if lineage == 'segment' and log_info.segment_id > 1:
            product_idx = 0
        try:
            log_data = logger.LammpsLog(log, product_idx)
        except (logger.BrokenLogError, IndexError):
            print("broken log: ", log_info.filepath.split("/")[-1])
            # Skip this log. Without this, `log_data` would either be
            # undefined or still hold the previous log, whose tables would
            # then be appended again under this log's attributes.
            continue
        log_data.extract_thermo()
        log_data.extract_run_stat()
        thermo = log_data.thermo
        run_stat = log_data.run_stat
        wall_time = log_data.wall_time
        # Lineage-specific and bulk attributes go to the thermo table only.
        for attr_name in log_info._lineage_attributes[lineage].keys():
            thermo[attr_name] = getattr(log_info, attr_name)
        for attr_name in bulk_attr_names:
            thermo[attr_name] = getattr(log_info, attr_name)
        # Genealogy names identify the log in all three tables.
        for lineage_name in log_info.genealogy:
            attr_value = getattr(log_info, lineage_name)
            thermo[lineage_name] = attr_value
            run_stat[lineage_name] = attr_value
            wall_time[lineage_name] = attr_value
        thermos.append(thermo)
        run_stats.append(run_stat)
        wall_times.append(wall_time)

output = "-".join([project, "allInOne", "thermo"])
thermos = pd.concat(thermos)
thermos.reset_index(inplace=True, drop=True)
thermos.to_parquet(
    save_to + output + ".parquet.brotli", index=False, compression='brotli'
)
output = "-".join([project, "allInOne", "runStat"])
run_stats = pd.concat(run_stats)
run_stats.reset_index(inplace=True, drop=True)
run_stats.to_csv(save_to + output + ".csv", index=False)
output = "-".join([project, "allInOne", "wallTimeStat"])
wall_times = pd.concat(wall_times)
wall_times.reset_index(inplace=True, drop=True)
wall_times.to_csv(save_to + output + ".csv", index=False)

# ensAvg timeseries and their associated measures

## Measures of chain size timeseries properties per space

### allInOne ensAvg ACFs of the chain-size properties per space

In [None]:
%%time
# Wall time: 60 s for TransFoci
# Wall time: 4 min for SumRule
# For each ensAvg space, merge all property-measure time series column-wise
# and save them as one "chainSize-acf" parquet file per space.
phase = 'ensAvg'
details = project_details[project]
db_suffix = details['group'] + '-' + phase
space_dbs = glob(analysis_db + details['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)
# list of unique property_measures, discovered from the first space:
filepath = ens_avg_space_dbs[0] + '*' + details['hierarchy'] + '.csv'
_, uniq_props_measures = organizer.unique_property(
    filepath, 2, ["-" + phase], drop_properties=['stamps'])
print(uniq_props_measures)
# Positional arguments shared by every `space_tseries` call.
tseries_args = (
    details['parser'], details['hierarchy'], details['attributes'],
    details['group'], details['geometry'], details['topology'],
)
for ens_avg_space_db in ens_avg_space_dbs:
    space = ens_avg_space_db.split('/')[-2].split('-')[0]
    ens_avgs = pd.concat(
        [
            organizer.space_tseries(
                ens_avg_space_db,
                property_,
                *tseries_args,
                is_save=False  # if True, save per property per space
            )
            for property_ in uniq_props_measures
        ],
        axis=1
    )
    # drop duplicated columns:
    ens_avgs = ens_avgs.loc[:, ~ens_avgs.columns.duplicated()]
    output_name = analysis_db + "-".join(
        [space, details['group'], "chainSize-acf.parquet.brotli"]
    )
    ens_avgs.to_parquet(output_name, index=False, compression='brotli')

### **allInOne** the chain-size properties per **space**

In [None]:
%%time
# Wall time: 2 min for TransFoci
# Wall time: 7 min for SumRule
# For each ensAvg space, merge the chain-size time series (property names
# with any "-acf..." part removed) column-wise and save one "chainSize"
# parquet file per space.
phase = 'ensAvg'
details = project_details[project]
db_suffix = details['group'] + '-' + phase
space_dbs = glob(analysis_db + details['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)
# list of unique property_measures, discovered from the first space:
filepath = ens_avg_space_dbs[0] + '*' + details['hierarchy'] + '.csv'
_, uniq_props_measures = organizer.unique_property(
    filepath, 2, ["-" + phase], drop_properties=['stamps'])
props_tseries = list({prop.split("-acf")[0] for prop in uniq_props_measures})
print(props_tseries)
# Positional arguments shared by every `space_tseries` call.
tseries_args = (
    details['parser'], details['hierarchy'], details['attributes'],
    details['group'], details['geometry'], details['topology'],
)
for ens_avg_space_db in ens_avg_space_dbs:
    space = ens_avg_space_db.split('/')[-2].split('-')[0]
    ens_avgs = pd.concat(
        [
            organizer.space_tseries(
                ens_avg_space_db,
                property_,
                *tseries_args,
                is_save=False  # if True, save per property per space
            )
            for property_ in props_tseries
        ],
        axis=1
    )
    # drop duplicated columns:
    ens_avgs = ens_avgs.loc[:, ~ens_avgs.columns.duplicated()]
    output_name = analysis_db + "-".join(
        [space, details['group'], "chainSize.parquet.brotli"]
    )
    ens_avgs.to_parquet(output_name, index=False, compression='brotli')

## TransFoci Project: Pair distance time-series per project

In [None]:
%%time
# Stack the pair-distance time series of all ensAvg spaces row-wise (per
# property), join the properties column-wise, and save one per-project
# parquet file.
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'
phase = 'ensAvg'
details = project_details[project]
db_suffix = details['group'] + '-' + phase
space_dbs = glob(analysis_db + details['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)
tseries_foci_props = ['pairDistTFoci']
# Positional arguments shared by every `space_tseries` call.
tseries_args = (
    details['parser'], details['hierarchy'], details['attributes'],
    details['group'], details['geometry'], details['topology'],
)
project_ens_avgs = []
for prop in tseries_foci_props:
    prop_ens_avgs = []
    for ens_avg_space_db in ens_avg_space_dbs:
        space = ens_avg_space_db.split('/')[-2].split('-')[0]
        prop_ens_avgs.append(
            organizer.space_tseries(
                ens_avg_space_db,
                prop,
                *tseries_args,
                is_save=False  # if True, save per property per space
            )
        )
    prop_ens_avgs = pd.concat(prop_ens_avgs, axis=0)
    # drop duplicated columns:
    prop_ens_avgs = prop_ens_avgs.loc[:, ~prop_ens_avgs.columns.duplicated()]
    prop_ens_avgs.reset_index(inplace=True, drop=True)
    project_ens_avgs.append(prop_ens_avgs)
project_ens_avgs = pd.concat(project_ens_avgs, axis=1)
project_ens_avgs = \
    project_ens_avgs.loc[:, ~project_ens_avgs.columns.duplicated()]
project_ens_avgs.reset_index(inplace=True, drop=True)
output = analysis_db + '-'.join(
    ['allInOne', project, details['group'], 'pairDistT.parquet.brotli']
)
project_ens_avgs.to_parquet(output, index=False, compression='brotli')

# Equilibrium timeseries properties per space AND per project

## Whole equilibrium properties allInOne

In [None]:
%%time
# Wall time: 23 s for TransFoci
# Wall time: 10 min for SumRule
# Compute the equilibrium measures of the time-varying properties for every
# space of the project, then ensemble-average them.
details = project_details[project]
space_paths = glob(analysis_db + details['space_pat'])
# Unique space names: the directory name up to its first '-'.
spaces = list({path.split('/')[-1].split('-')[0] for path in space_paths})
save_space = True
equili_props_wholes = api.all_in_one_equil_tseries(
    project,
    analysis_db,
    details['group'],
    spaces,
    details['time_varying_props'],
    details['equil_measures'],
    save_space=save_space,
    divisor=0.025,
    round_to=3,
    save_to=analysis_db,
)
ens_avg = api.all_in_one_equil_tseries_ens_avg(
    project,
    equili_props_wholes,
    details['group'],
    details['equil_properties'],
    details['equil_attributes'],
    save_to=analysis_db
)


# Distributions

## Persistence lengths: HnsCub
- Currently the **probe** phase is **HnsCub** project

In [None]:
# Locate the ensAvg spaces and list the unique properties whose names start
# with "bondCosine".
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'
phase = 'ensAvg'
ext = '.csv'
property_ext = phase + '-mean' + ext
details = project_details[project]
db_suffix = details['group'] + '-' + phase
space_dbs = glob(analysis_db + details['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
# Properties are discovered from the files of the first space only.
filepath = ens_avg_space_dbs[0] + details['hierarchy'] + ext
uniq_props, _ = organizer.unique_property(
    filepath, 2, ["-" + phase], drop_properties=['stamps'])
bond_props = list(
    {prop for prop in uniq_props if prop.startswith("bondCosine")}
)
print(bond_props)

In [None]:
# For each ensAvg space, join the bond-property histograms column-wise; then
# stack all spaces row-wise and write one per-project CSV.
details = project_details[project]
polymer_topo = details['topology']
# Positional arguments shared by every `space_hists` call.
hists_args = (
    details['parser'], details['hierarchy'], details['attributes'],
    details['group'], details['geometry'], details['topology'],
)
project_all_in_one = []
for ens_avg_space_db in ens_avg_space_dbs:
    space = ens_avg_space_db.split('/')[-2].split('-')[0]
    space_info = details['parser'](
        space,
        'space',
        details['geometry'],
        details['group'],
        details['topology'],
        ispath=False
    )
    # Bin centers per topology: 1..nmon-1 for 'linear', 1..nmon for 'ring'.
    bonds_per_topology = {
        'linear': np.arange(1, space_info.nmon, 1),
        'ring': np.arange(1, space_info.nmon+1, 1)
    }
    ens_avgs = []
    for property_ in bond_props:
        ens_avgs.append(
            organizer.space_hists(
                ens_avg_space_db,
                property_,
                *hists_args,
                bin_center=bonds_per_topology[polymer_topo],
                normalize=False,
                is_save=False
            )
        )
    ens_avgs = pd.concat(ens_avgs, axis=1)
    # drop duplicated columns:
    ens_avgs = ens_avgs.loc[:, ~ens_avgs.columns.duplicated()]
    ens_avgs.reset_index(inplace=True, drop=True)
    project_all_in_one.append(ens_avgs)
project_all_in_one = pd.concat(project_all_in_one, axis=0)
project_all_in_one = \
    project_all_in_one.loc[:, ~project_all_in_one.columns.duplicated()]
project_all_in_one.reset_index(inplace=True, drop=True)
output = analysis_db + '-'.join(
    ['allInOne', project, details['group'], 'BondCosCorrVecMon.csv']
)
project_all_in_one.to_csv(output, index=False)

## Clusters and bonds per project: TransFoci

- Applicable to any project in which clustering happens, such as **HnsCub**, **TransFociCub**, and **TransFociCyl**.
- The histograms of **Clusters** and **bonds** can **not** be combined in **one** dataset.
- Since **per project** datasets are small, we create **one** per project dataset for each property.

In [None]:
# Write one per-project parquet file for each of the bonds and clusters
# histograms, stacking all ensAvg spaces row-wise.
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'
phase = 'ensAvg'
details = project_details[project]
db_suffix = details['group'] + '-' + phase
space_dbs = glob(analysis_db + details['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)

nmon_large = 5
# Bin centers: 0..nmon_large-1 for bonds, 1..nmon_large for clusters.
hist_t_foci_bin_centers = {
    'bondsHistFoci': np.arange(nmon_large),
    'clustersHistFoci': np.arange(1, nmon_large + 1)
}
# Positional arguments shared by every `space_hists` call.
hists_args = (
    details['parser'], details['hierarchy'], details['attributes'],
    details['group'], details['geometry'], details['topology'],
)
# Separate dataset for bonds and clusters per project
for prop, bin_center in hist_t_foci_bin_centers.items():
    ens_avgs = []
    for ens_avg_space_db in ens_avg_space_dbs:
        space = ens_avg_space_db.split('/')[-2].split('-')[0]
        ens_avgs.append(
            organizer.space_hists(
                ens_avg_space_db,
                prop,
                *hists_args,
                bin_center=bin_center,
                normalize=True,
                is_save=False
            )
        )
    ens_avgs = pd.concat(ens_avgs, axis=0)
    # drop duplicated columns:
    ens_avgs = ens_avgs.loc[:, ~ens_avgs.columns.duplicated()]
    ens_avgs.reset_index(inplace=True, drop=True)
    output = analysis_db + "-".join(
        ['allInOne', project, details['group'], prop + ".parquet.brotli"]
    )
    ens_avgs.to_parquet(output, index=False, compression='brotli')

## TransFoci and HnsCub Projects: Pair Distance Statistics per project: **bug** or **nucleoid** groups

- Applicable to any project in which pair distance matters, such as **HnsCub**, **TransFociCub**, and **TransFociCyl**.
- These **properties** can be **combined** in one file per project.
- Since **per project** datasets are small, we create **one** per project dataset for **all** properties.

In [None]:
# Build one per-project dataset holding both pair-distance properties:
# spaces are stacked row-wise per property, properties are joined column-wise.
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'
phase = 'ensAvg'
details = project_details[project]
db_suffix = details['group'] + '-' + phase
space_dbs = glob(analysis_db + details['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)

hist_foci_props = ['pairDistHistFoci', 'pairDistRdfFoci']
# Positional arguments shared by every `space_hists` call.
hists_args = (
    details['parser'], details['hierarchy'], details['attributes'],
    details['group'], details['geometry'], details['topology'],
)
# One per-project database for both properties since they are small and
# related
project_ens_avgs = []
for prop in hist_foci_props:
    prop_ens_avgs = []
    for ens_avg_space_db in ens_avg_space_dbs:
        space = ens_avg_space_db.split('/')[-2].split('-')[0]
        prop_ens_avgs.append(
            organizer.space_hists(
                ens_avg_space_db,
                prop,
                *hists_args,
                bin_center=None,
                normalize=False,
                is_save=False
            )
        )
    prop_ens_avgs = pd.concat(prop_ens_avgs, axis=0)
    # drop duplicated columns:
    prop_ens_avgs = prop_ens_avgs.loc[:, ~prop_ens_avgs.columns.duplicated()]
    prop_ens_avgs.reset_index(inplace=True, drop=True)
    project_ens_avgs.append(prop_ens_avgs)
project_ens_avgs = pd.concat(project_ens_avgs, axis=1)
# drop duplicated columns:
project_ens_avgs = \
    project_ens_avgs.loc[:, ~project_ens_avgs.columns.duplicated()]
project_ens_avgs.reset_index(inplace=True, drop=True)
output = analysis_db + '-'.join(
    ['allInOne', project, details['group'], 'pairDistStats.parquet.brotli']
)
project_ens_avgs.to_parquet(output, index=False, compression='brotli')


## Spatial Distributions and the sum rule: **all** group

- Finding the spatial histogram, number density, and local volume fraction in different geometries.

### Cylindrical geometry: 

- This works for projects such as **SumRuleCyl** and **TransFociCyl**, in which the spherical beads are confined in a cylinder.

#### NOT needed: allInOne Local Distributions: 

- ensAvg of Hists, Rhos, Phis with var and sem per project: there is no need to run this, as the information already exists in the "allInOne Sum-Rule" section.

In [None]:
# Locate the ensAvg spaces of the 'all' group and list their unique
# properties.
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'
phase = 'ensAvg'
group = 'all'
db_suffix = group + '-' + phase
space_dbs = glob(analysis_db + project_details[project]['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)
# list of unique properties and property_measures:
# Local distributions do not have any property_measures:
filepath = (
    ens_avg_space_dbs[0] + '*' + project_details[project]['hierarchy'] + '.csv'
)
uniq_props, _ = organizer.unique_property(
    filepath, 2, ["-" + phase], drop_properties=["stamps"])
print(uniq_props)

In [None]:
# For each direction, stack every matching property over all spaces row-wise,
# join the properties column-wise, and save one parquet file per direction.
directions = ['theta', 'z', 'r']
details = project_details[project]
# Positional arguments shared by every `space_hists` call; note that the
# `group` set in the previous cell is used, not the project's own group.
hists_args = (
    details['parser'], details['hierarchy'], details['attributes'],
    group, details['geometry'], details['topology'],
)
for direction in directions:
    props_by_dir = [prop for prop in uniq_props if prop.startswith(direction)]
    dir_ens_avgs = []
    for prop in props_by_dir:
        prop_ens_avgs = []
        for ens_avg_space_db in ens_avg_space_dbs:
            space = ens_avg_space_db.split('/')[-2].split('-')[0]
            prop_ens_avgs.append(
                organizer.space_hists(
                    ens_avg_space_db,
                    prop,
                    *hists_args,
                    normalize=True,
                    is_save=False
                )
            )
        prop_ens_avgs = pd.concat(prop_ens_avgs, axis=0)
        # drop duplicated columns:
        prop_ens_avgs = \
            prop_ens_avgs.loc[:, ~prop_ens_avgs.columns.duplicated()]
        prop_ens_avgs.reset_index(inplace=True, drop=True)
        dir_ens_avgs.append(prop_ens_avgs)
    dir_ens_avgs = pd.concat(dir_ens_avgs, axis=1)
    # drop duplicated columns:
    dir_ens_avgs = dir_ens_avgs.loc[:, ~dir_ens_avgs.columns.duplicated()]
    dir_ens_avgs.reset_index(inplace=True, drop=True)
    # NOTE(review): the file name carries the project's group although the
    # data are read with `group`; the sum-rule cell uses `group` in its
    # output name -- confirm which one is intended here.
    output = analysis_db + "-".join(
        ['allInOne', project, details['group'],
         direction + "LocalDist.parquet.brotli"]
    )
    dir_ens_avgs.to_parquet(output, index=False, compression='brotli')

#### allInOne Sum-Rule:

In [None]:
# Set up the sum-rule run for cylindrical geometry: the ensAvg spaces of the
# 'all' group, the (species, size attribute) pairs, and every
# (property, direction) combination.
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'
phase = 'ensAvg'
group = 'all'
db_suffix = group + '-' + phase
space_dbs = glob(analysis_db + project_details[project]['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)
species_dict = project_details[project]['rhosPhisNormalizedScaled']
print('species_dict: ', species_dict)
directions = ['r', 'z']
props = ['Rho', 'Phi']
# Each pair is ordered (property, direction).
dir_prop_pairs = [(prop, direction) for prop, direction
                  in product(props, directions)]
print('dir_prop_pairs: ', dir_prop_pairs)

In [None]:
# For each (property, direction) pair: stack all spaces row-wise per species,
# join the species column-wise, and save one parquet file per pair.
details = project_details[project]
for prop, direction in dir_prop_pairs:
    all_in_one = []
    for species, size_attr in species_dict:
        per_species = []
        for ens_avg_space_db in ens_avg_space_dbs:
            space = ens_avg_space_db.split('/')[-2].split('-')[0]
            per_species.append(
                organizer.space_sum_rule(
                    ens_avg_space_db,
                    prop,
                    details['parser'],
                    details['hierarchy'],
                    details['attributes'],
                    species,
                    size_attr,
                    group,
                    details['geometry'],
                    details['topology'],
                    direction,
                    is_save=False
                )
            )
        per_species = pd.concat(per_species, axis=0)
        # drop duplicated columns:
        per_species = per_species.loc[:, ~per_species.columns.duplicated()]
        per_species.reset_index(inplace=True, drop=True)
        all_in_one.append(per_species)
    all_in_one = pd.concat(all_in_one, axis=1)
    # drop duplicated columns:
    all_in_one = all_in_one.loc[:, ~all_in_one.columns.duplicated()]
    all_in_one.reset_index(inplace=True, drop=True)
    output = analysis_db + '-'.join(
        ['allInOne', project, group, direction + prop]
    ) + '-NormalizedScaled.parquet.brotli'
    all_in_one.to_parquet(output, index=False, compression='brotli')


### Cubic geometry:

- This works for projects such as **TransFociCub** and **HnsCub**, in which the spherical beads are in free space or in a cubic box with periodic boundary conditions in all directions.

In [None]:
# Set up the sum-rule run for cubic geometry: the ensAvg spaces of the 'all'
# group, the (species, size attribute) pairs, and every (property, direction)
# combination; only the 'r' direction is used here.
analysis_db = '/Users/amirhsi_mini/research_data/analysis/'
phase = 'ensAvg'
group = 'all'
db_suffix = group + '-' + phase
space_dbs = glob(analysis_db + project_details[project]['space_pat'])
ens_avg_space_dbs = [db + "/" for db in space_dbs if db.endswith(db_suffix)]
print(ens_avg_space_dbs)
species_dict = project_details[project]['rhosPhisNormalizedScaled']
print('species_dict: ', species_dict)
directions = ['r']
props = ['Rho', 'Phi']
# Each pair is ordered (property, direction).
dir_prop_pairs = [(prop, direction) for prop, direction
                  in product(props, directions)]
print('dir_prop_pairs: ', dir_prop_pairs)

In [None]:
from typing import (
    List,
    Dict,
    Tuple,
    Optional,
    Union,
    Callable,
)
import pathlib
from glob import glob
import re
import numpy as np
import pandas as pd
import warnings

from polyphys.manage.typer import WholeT, EnsembleT
from polyphys.analyze.clusters import whole_dist_mat_foci
from polyphys.manage.utilizer import round_up_nearest, invalid_keyword

from polyphys.manage.organizer import normalize_r, normalize_z, sort_filenames

def space_sum_rule(
    input_database: str,
    property_: str,
    parser: Callable,
    hierarchy: str,
    physical_attrs: List[str],
    species: str,
    size_attr: str,
    group: str,
    geometry: str,
    topology: str,
    direction: str,
    divisor: Optional[float] = 0.025,
    round_to: Optional[int] = 3,
    is_save: Optional[bool] = False
) -> pd.DataFrame:
    """Scale and normalize the 'ensAvg' local distributions of a property
    in one space and stack them into a single 'space' dataframe.

    Every csv file matched by the glob pattern

    input_database + hierarchy + '-[group]-[prop]-ensAvg.csv'

    where `prop` = '[direction][property_][species]', is read, scaled by
    the particle size found in the `size_attr` attribute of the parsed file
    information, normalized along `direction`, and annotated with the
    `physical_attrs` of interest. The resulting dataframes are concatenated
    along the 0 (or 'row' or 'index') axis in pandas lingo.

    Each csv file must provide the columns '[prop]-mean' and 'bin_center'.
    The following columns are added to each dataframe:

    '[prop]-scaler', '[prop]-scale':
        For 'Phi', the scaler is the size and the scaled values are the
        mean divided by the size; for 'Rho', the scaler is the size squared
        and the scaled values are the mean multiplied by the size squared.
    '[prop]-scale-normalized_curve':
        '[prop]-scale' divided by its sum over the file.
    '[prop]-sumrule_constant':
        '[prop]-normalizer' divided by '[prop]-scaler'. The
        '[prop]-normalizer' column is presumably created by `normalize_r`
        or `normalize_z` -- confirm against those helpers.
    'bin_center-norm':
        'bin_center' divided by its maximum.
    'bin_center-dcrowd':
        Twice 'bin_center' divided by 'dcrowd'.
    'phi_c_bulk_round':
        'phi_c_bulk' passed through `round_up_nearest` with `divisor` and
        `round_to`.
    'bin_center-dcrowd-recentered', 'bin_center-recentered-norm':
        Only in 'cylindrical' geometry; built from the remainder of 'dcyl'
        divided by 'dcrowd'.

    Notes
    -----
    The columns 'dcrowd' and 'phi_c_bulk' (and 'dcyl' in 'cylindrical'
    geometry) are read after `physical_attrs` are added, so they must either
    be listed in `physical_attrs` or already exist in the csv files.

    The original notes of this function state that `direction` is only
    defined for 'cylindrical' geometry -- TODO confirm, since nothing in
    this function restricts the direction by geometry.

    Parameters
    ----------
    input_database: str
        Path to the directory holding the 'ensAvg' csv files. It is
        concatenated as-is with `hierarchy`, so it must end with a path
        separator.
    property_: {'Rho', 'Phi'}
        Name of the physical property of interest.
    parser: Callable
        A class from 'PolyPhys.manage.parser' module that parses filenames
        or filepaths to infer information about a file.
    hierarchy: str
        The pattern by which the filenames are started with; for instance,
        "N*" means files start with "N"
    physical_attrs: list of str
        The attributes of the parsed file information that are added as new
        columns to each dataframe.
    species: {'Mon', 'Crd', 'Foci', 'Dna'}
        The species of the particles in a group.
            'Mon': Monomers or small monomers
            'Crd': Crowders
            'Foci': Large monomers
            'Dna': Small and large monomers
    size_attr: str
        The attribute of the `parser` object that is the size (diameter) of
        species.
    group: {'bug', 'nucleoid', 'all'}
        The type of the particle group.
    geometry : {'cylindrical', 'slit', 'cubic'}
        The shape of the simulation box.
    topology: str
        Topology of the polymer.
    direction: {'r', 'z'}
        The direction along which operation is done.
    divisor: float, default 0.025
        The step by which the values of "phi_c_bulk" attribute are rounded.
    round_to: int, default 3
        The number of significant decimal digits in the values of "phi_c_bulk"
        attribute.
    is_save : bool, default False
        whether to save output to file or not.

    Return
    ------
    property_db: pandas.DataFrame
        a dataframe in which all the scaled and normalized 'ensAvg'
        distributions are concatenated along the row axis, with the
        attributes of interest added as new columns.

    Raises
    ------
    NotImplementedError
        If `property_` is neither 'Rho' nor 'Phi'.
    KeyError
        If `direction` is neither 'r' nor 'z'.
    ValueError
        If no 'ensAvg' csv file matches the pattern in `input_database`.

    Requirements
    ------------
    PolyPhys, Pandas
    """
    # Reject unsupported properties before any file is globbed, read or
    # parsed, so a wrong argument fails fast and independently of the data.
    if property_ not in ('Phi', 'Rho'):
        raise NotImplementedError(
            "Sum rule's scaler is only defined for "
            "'Rho' (density) or 'Phi' (volume fraction) properties; "
            f"got '{property_}'."
        )
    normalizer = {
        'r': normalize_r,
        'z': normalize_z
    }
    # Looked up once; an unknown direction raises KeyError here.
    normalize = normalizer[direction]
    prop = direction + property_ + species  # full name of physical property
    property_ext = "-" + group + "-" + prop + "-ensAvg.csv"
    pattern = input_database + hierarchy + property_ext
    ens_avg_csvs = sort_filenames(glob(pattern), fmts=[property_ext])
    property_db = []
    # ens_avg_csvs is a list of tuples, each has one member.
    for ens_avg_csv in ens_avg_csvs:
        csv_path = ens_avg_csv[0]
        ens_avg = pd.read_csv(csv_path, header=0)
        property_info = parser(
            csv_path,
            'ensemble_long',
            geometry,
            group,
            topology
        )
        size = getattr(property_info, size_attr)
        if property_ == 'Phi':
            ens_avg[prop + '-scaler'] = size
            ens_avg[prop + '-scale'] = ens_avg[prop + '-mean'] / size
        else:  # 'Rho', guaranteed by the check at the top
            ens_avg[prop + '-scaler'] = size ** 2
            ens_avg[prop + '-scale'] = ens_avg[prop + '-mean'] * size ** 2
        ens_avg[prop + '-scale-normalized_curve'] = \
            ens_avg[prop + '-scale'] / ens_avg[prop + '-scale'].sum()
        ens_avg = normalize(prop, ens_avg)
        ens_avg[prop + '-sumrule_constant'] = \
            ens_avg[prop + '-normalizer'] / ens_avg[prop + '-scaler']
        ens_avg['bin_center-norm'] = \
            ens_avg['bin_center'] / ens_avg['bin_center'].max()
        for attr_name in physical_attrs:
            ens_avg[attr_name] = getattr(property_info, attr_name)
        ens_avg['bin_center-dcrowd'] = (
            2 * ens_avg['bin_center'] / ens_avg['dcrowd']
        )
        ens_avg['phi_c_bulk_round'] = ens_avg['phi_c_bulk'].apply(
            round_up_nearest, args=[divisor, round_to])
        if geometry == 'cylindrical':
            # Kept as a local Series instead of a scratch column, so that
            # no existing column of the csv file can be clobbered.
            remainder = ens_avg['dcyl'] % ens_avg['dcrowd']
            ens_avg['bin_center-dcrowd-recentered'] = (
                ens_avg['bin_center-dcrowd'] - remainder / ens_avg['dcrowd']
            )
            recentered = ens_avg['bin_center'] - remainder
            ens_avg['bin_center-recentered-norm'] = (
                recentered / recentered.max()
            )
        property_db.append(ens_avg)
    if not property_db:
        # pd.concat on an empty list raises an uninformative ValueError;
        # raise the same exception type with the offending pattern instead.
        raise ValueError(
            f"No 'ensAvg' csv file matches the pattern '{pattern}'."
        )
    property_db = pd.concat(property_db, axis=0)
    property_db.reset_index(inplace=True, drop=True)
    if is_save:
        # NOTE(review): `database_path` is not among the names imported next
        # to `normalize_r`/`sort_filenames` in this cell; confirm it is in
        # scope before calling with is_save=True.
        save_to_space = database_path(
            input_database,
            'analysis',
            stage='space',
            group=group
        )
        space = save_to_space.split("/")[-2].split("-")[0]
        output = "-".join([space, group, property_, species])
        output += "-normalizedRescaled-space.csv"
        property_db.to_csv(save_to_space + output, index=False)
        print("done")
    return property_db

In [None]:
# For every (property, direction) pair, build one dataframe that holds the
# normalized and scaled distributions of all species in all spaces, and
# write it to a brotli-compressed parquet file in `analysis_db`.
for prop, direction in dir_prop_pairs:
    all_in_one = []
    for species, size_attr in species_dict:
        per_species = []
        for ens_avg_space_db in ens_avg_space_dbs:
            # Leading token of the space directory name; not used further
            # in this cell.
            space = ens_avg_space_db.split('/')[-2].split('-')[0]
            per_space = space_sum_rule(
                input_database=ens_avg_space_db,
                property_=prop,
                parser=project_details[project]['parser'],
                hierarchy=project_details[project]['hierarchy'],
                physical_attrs=project_details[project]['attributes'],
                species=species,
                size_attr=size_attr,
                group=group,
                geometry=project_details[project]['geometry'],
                topology=project_details[project]['topology'],
                direction=direction,
                is_save=False,
            )
            per_species.append(per_space)
        # Stack the spaces of one species row-wise, keep the first of any
        # duplicated column names, and renumber the rows.
        per_species = pd.concat(per_species, axis=0)
        per_species = per_species.loc[
            :, ~per_species.columns.duplicated()
        ].reset_index(drop=True)
        all_in_one.append(per_species)
    # Put the species side by side; columns shared between species (same
    # name) are kept only once.
    all_in_one = pd.concat(all_in_one, axis=1)
    all_in_one = all_in_one.loc[
        :, ~all_in_one.columns.duplicated()
    ].reset_index(drop=True)
    output = (
        analysis_db
        + '-'.join(['allInOne', project, group, direction + prop])
        + '-NormalizedScaled.parquet.brotli'
    )
    all_in_one.to_parquet(output, index=False, compression='brotli')
