In [1]:
from glob import glob
import pathlib
import pandas as pd
import numpy as np
import math
import re
from polyphys.manage import organizer
from polyphys.manage.parser import SumRule
from polyphys.probe import prober
from polyphys.analyze import analyzer
from polyphys.analyze import distributions
import MDAnalysis as mda
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from dask.distributed import Client
from dask import delayed
from dask import compute
# Create a Dask distributed client with 4 workers; the bare `client`
# expression on the last line makes the notebook display its summary.
client = Client(n_workers=4)
client

# The probe phase:
There are several ways of analyzing the topology and trajectory pairs, depending on the number of trajectory files per topology file, the continuity of the trajectory files, the organization of files in a directory, and the parallel or sequential arrangement of the computing resources.

## Separated *whole* simulation directories on a cluster: gnuparallel
On the cluster, *whole* simulations are organized into *whole* directories, where each *whole* directory contains all the files for a given *whole* simulation. **gnuparallel** is used to parallelize the **probe** phase at the **shell** level. For this purpose, all the Python modules and scripts are separately installed and run on each core. For instance, if 32 cores are available, then the files in 32 *whole* directories are processed simultaneously. However, each *whole* directory may contain multiple topology and trajectory pairs. Thus, there is parallelization at the level of *whole* directories, not at the level of the *segment* or *whole* trajectories inside a *whole* directory. Inside each *whole* directory, a Python **main_probe.py** script analyzes the trajectories in a sequential way.

### trj and all segments on a cluster
For each *whole* directory, the following script is executed by means of *gnuparallel*. See these scripts: *probe-1.7-all_trj_segments.py* and *probe-1.7-bug_trj_segments*

In [None]:
from glob import glob
from polyphys.manage import organizer
from polyphys.manage.parser import SumRule
from polyphys.probe import prober

geometry = 'biaxial'
trj_lineage = 'segment'
save_to="./"

# ---- 'bug' group: one topology file, several trajectory segments ----
bug_trjs = organizer.sort_filenames(
    glob("../test_data/trjs-continuous/N500D10.0ac0.8-trjs/N500epsilon5.0r5.5lz205.5sig0.8nc12012dt0.002bdump1000adump5000ens2/N*.bug.lammpstrj"),
    fmts=['.bug.lammpstrj']
)
# keep only the first member of every sorted tuple
bug_trjs = [bug_trj[0] for bug_trj in bug_trjs]
bug_topo = organizer.sort_filenames(
    glob("../test_data/trjs-continuous/N500D10.0ac0.8-trjs/N500epsilon5.0r5.5lz205.5sig0.8nc12012dt0.002bdump1000adump5000ens2/N*.bug.data"),
    fmts=['.bug.data']
)[0][0]
print(bug_topo)
for bug_trj in bug_trjs:
    print(bug_trj)
    trj_info = SumRule(bug_trj, geometry=geometry, group='bug',lineage=trj_lineage)
    if trj_info.segment_id != len(bug_trjs):
        # not the last segment: its last frame is ignored
        prober.probe_bug(bug_topo, bug_trj, geometry, trj_lineage, save_to, continuous=True)
    else:
        # last segment: all of its frames are probed
        prober.probe_bug(bug_topo, bug_trj, geometry, trj_lineage, save_to)

# ---- 'all' group: same procedure with the 'all' files ----
trj_lineage = 'segment'
all_trjs = organizer.sort_filenames(
    glob("../test_data/trjs-continuous/N500D10.0ac0.8-trjs/N500epsilon5.0r5.5lz205.5sig0.8nc12012dt0.002bdump1000adump5000ens2/N*.all.lammpstrj"),
    fmts=['.all.lammpstrj']
)
all_trjs = [all_trj[0] for all_trj in all_trjs]
all_topo = organizer.sort_filenames(
    glob("../test_data/trjs-continuous/N500D10.0ac0.8-trjs/N500epsilon5.0r5.5lz205.5sig0.8nc12012dt0.002bdump1000adump5000ens2/N*.all.data"),
    fmts=['.all.data']
)[0][0]
print(all_topo)
for all_trj in all_trjs:
    print(all_trj)
    trj_info = SumRule(all_trj, geometry=geometry, group='all',lineage=trj_lineage)
    if trj_info.segment_id != len(all_trjs):
        # not the last segment: its last frame is ignored
        prober.probe_all(all_topo, all_trj, geometry, trj_lineage, save_to,continuous=True)
    else:
        # last segment: all of its frames are probed
        prober.probe_all(all_topo, all_trj, geometry, trj_lineage, save_to)

## Separated *whole* directories on a PC: Dask
On a PC, the *whole* directories are located in a master *space-trjs* directory; however, one main Python script probes all the *whole* directories in a parallel scheme via Dask. This is different from the *gnuparallel*-based approach, in which each *whole* directory has its own copy of the required scripts and a main Python script is run to probe that directory individually.

In [None]:
# Probe the 'bug' trajectory segments of every 'whole' simulation found under
# one space database directory.
#
# The database directory is resolved with pathlib first; the wildcard part of
# the search is kept separately in `hierarchy` and expanded by glob, because a
# '*' inside the string given to Path is not expanded.
path = pathlib.Path('../test_data/trjs-continuous/N500D10.0ac0.8-trjs')
path = path.resolve()  # convert the relative path to an absolute one
input_database = str(path)
if not path.exists():
    raise OSError(f"'{input_database}' path does not exist.")
geometry = 'biaxial'
group = 'bug'
hierarchy = '/N*/N*'
observations = glob(input_database + hierarchy)
if not observations:
    raise OSError(
        "File not found in "
        f"'{input_database + hierarchy}'"
        )
topologies = organizer.sort_filenames(observations, fmts=['.bug.data'])
trajectories = organizer.sort_filenames(observations, fmts=['.bug.lammpstrj'])
# 'bug' time series and histograms
save_to = analyzer.database_path(input_database, phase='probe', stage='segment', group=None)
# NOTE(review): the segment with this id is probed without `continuous=True`;
# presumably it is the last segment of each simulation — confirm for the data set.
last_segment_id = 10
for topology in topologies:
    print(topology[0])
    # `topology` only exists inside this loop, so the parsed info is built here.
    topo_info = SumRule(topology[0],geometry=geometry, group=group, lineage='whole')
    save_to_whole = pathlib.Path(save_to + '/' + topo_info.whole)
    try:
        save_to_whole.mkdir(parents=True, exist_ok=False)
    except FileExistsError as error:
        # An existing output directory is reused; it is reported, not fatal.
        print(error)
        print(
            f"Directory '{save_to_whole}'"
            " exist. Files are saved/overwritten to an existing directoy.")
    save_to_whole = str(save_to_whole) + '/'
    for trajectory in trajectories:
        trj_info = SumRule(trajectory[0],geometry=geometry, group=group, lineage='segment')
        # only trajectories that belong to this topology's simulation
        if trj_info.whole != topo_info.whole:
            continue
        if trj_info.segment_id == last_segment_id:
            prober.probe_bug(topology[0], trajectory[0], geometry, 'segment', save_to_whole)
        else:
            prober.probe_bug(topology[0], trajectory[0], geometry, 'segment', save_to_whole, continuous=True)

### bug whole trjs 

### trjs whole

In [None]:
# Probe the 'bug' *whole* trajectories of every simulation found under one
# space database directory, sequentially.
#
# The database directory is resolved with pathlib first; the wildcard part of
# the search is kept separately in `hierarchy` and expanded by glob, because a
# '*' inside the string given to Path is not expanded.
path = pathlib.Path('/Users/amirhsi_mini/trjs/N500D10.0ac0.8-trjs')
path = path.resolve()  # convert a relative path to an absolute one
input_database = str(path)
if not path.exists():
    raise OSError(f"'{input_database}' path does not exist.")
geometry = 'biaxial'
group = 'bug'
hierarchy = '/N*/N*'
observations = glob(input_database + hierarchy)
if not observations:
    raise OSError(
        "File not found in "
        f"'{input_database + hierarchy}'"
        )
topologies = organizer.sort_filenames(observations, fmts=['.bug.data'])
trajectories = organizer.sort_filenames(observations, fmts=['.bug.lammpstrj'])
# 'bug' time series and histograms
save_to = analyzer.database_path(input_database, phase='probe', stage='segment', group=None)
for topology in topologies:
    print(topology[0])
    topo_info = SumRule(topology[0],geometry=geometry, group=group, lineage='whole')
    save_to_whole = pathlib.Path(save_to + '/' + topo_info.whole)
    try:
        save_to_whole.mkdir(parents=True, exist_ok=False)
    except FileExistsError as error:
        # An existing output directory is reused; it is reported, not fatal.
        print(error)
        print(
            f"Directory '{save_to_whole}'"
            " exist. Files are saved/overwritten to an existing directoy.")
    save_to_whole = str(save_to_whole) + '/'
    for trajectory in trajectories:
        trj_info = SumRule(trajectory[0],geometry=geometry, group=group, lineage='whole')
        # only the trajectory that belongs to this topology's simulation
        if trj_info.whole == topo_info.whole:
            prober.probe_bug(topology[0], trajectory[0], geometry, 'whole', save_to_whole, continuous=False)

### bug whole trjs dask

In [None]:
# Build (but do not yet run) one Dask delayed task per 'bug' *whole*
# trajectory; the tasks are collected in `trjs_computed`.
#
# The database directory is resolved with pathlib first; the wildcard part of
# the search is kept separately in `hierarchy` and expanded by glob, because a
# '*' inside the string given to Path is not expanded.
path = pathlib.Path('/Users/amirhsi_mini/trjs/N500D10.0ac0.8-trjs')
path = path.resolve()  # convert a relative path to an absolute one
input_database = str(path)
if not path.exists():
    raise OSError(f"'{input_database}' path does not exist.")
geometry = 'biaxial'
group = 'bug'
hierarchy = '/N*/N*'
observations = glob(input_database + hierarchy)
if not observations:
    raise OSError(
        "File not found in "
        f"'{input_database + hierarchy}'"
        )
topologies = organizer.sort_filenames(observations, fmts=['.bug.data'])
trajectories = organizer.sort_filenames(observations, fmts=['.bug.lammpstrj'])
# 'bug' time series and histograms
save_to = analyzer.database_path(input_database, phase='probe', stage='segment', group=None)
trjs_computed = []
for topology in topologies:
    print(topology[0])
    topo_info = SumRule(topology[0],geometry=geometry, group=group, lineage='whole')
    save_to_whole = pathlib.Path(save_to + '/' + topo_info.whole)
    try:
        save_to_whole.mkdir(parents=True, exist_ok=False)
    except FileExistsError as error:
        # An existing output directory is reused; it is reported, not fatal.
        print(error)
        print(
            f"Directory '{save_to_whole}'"
            " exist. Files are saved/overwritten to an existing directoy.")
    save_to_whole = str(save_to_whole) + '/'
    for trajectory in trajectories:
        trj_info = SumRule(trajectory[0],geometry=geometry, group=group, lineage='whole')
        # only the trajectory that belongs to this topology's simulation
        if trj_info.whole == topo_info.whole:
            trj_delayed = delayed(prober.probe_bug)(topology[0], trajectory[0], geometry, 'whole', save_to_whole, continuous=False)
            trjs_computed.append(trj_delayed)

In [None]:
%%time
# it takes 9min and 34s.
# Run all the delayed probe tasks collected in `trjs_computed`.
results = compute(trjs_computed)

## Separated *segment* directories on a PC: Dask

In [None]:
# Probe the 'all' trajectory segments of every 'whole' simulation found under
# one space database directory, sequentially.
path = pathlib.Path('../test_data/trjs-continuous/N500D10.0ac0.8-trjs')
path = path.resolve()  # convert the relative path to an absolute one
input_database = str(path)
geometry = 'biaxial'
group = 'all'
hierarchy = '/N*/N*'
if not path.exists():
    raise OSError(f"'{input_database}' path does not exist.")
observations = glob(input_database + hierarchy)
if not observations:
    raise OSError(
        "File not found in "
        f"'{input_database + hierarchy}'"
        )
topologies = organizer.sort_filenames(observations, fmts=['.all.data'])
trajectories = organizer.sort_filenames(observations, fmts=['.all.lammpstrj'])
# 'all' time series and histograms
save_to = analyzer.database_path(input_database, phase='probe', stage='segment', group=None)
# NOTE(review): the segment with this id is probed without `continuous=True`;
# presumably it is the last segment of each simulation — confirm for the data set.
last_segment_id = 10
for topology in topologies:
    topo_info = SumRule(topology[0],geometry=geometry, group=group, lineage='whole')
    save_to_whole = pathlib.Path(save_to + '/' + topo_info.whole)
    try:
        save_to_whole.mkdir(parents=True, exist_ok=False)
    except FileExistsError as error:
        # An existing output directory is reused; it is reported, not fatal.
        print(error)
        print(
            f"Directory '{save_to_whole}'"
            " exist. Files are saved/overwritten to an existing directoy.")
    save_to_whole = str(save_to_whole) + '/'
    for trajectory in trajectories:
        trj_info = SumRule(trajectory[0],geometry=geometry, group=group, lineage='segment')
        # Pair a trajectory only with the topology of its own simulation;
        # without this check every trajectory is probed against every topology.
        if trj_info.whole != topo_info.whole:
            continue
        if trj_info.segment_id == last_segment_id:
            prober.probe_all(topology[0], trajectory[0], geometry, 'segment', save_to_whole)
        else:
            prober.probe_all(topology[0], trajectory[0], geometry, 'segment', save_to_whole, continuous=True)

In [None]:
# Collect the 'all' trajectory files and the first 'all' topology file found
# under the database directory, then list the trajectories.
path = pathlib.Path('../test_data/trjs-continuous/N500D10.0ac0.8-trjs').resolve()
input_database = str(path)
geometry = 'biaxial'
hierarchy = '/N*/N*'
observations = glob(input_database + hierarchy)
all_tuples = organizer.sort_filenames(observations, fmts=['all.lammpstrj'])
all_trjs = [all_tuple[0] for all_tuple in all_tuples]
all_data = organizer.sort_filenames(observations, fmts=['all.data'])[0][0]

for all_trj in all_trjs:
    print(all_trj)
    #PipeLine.extract_trj_all(all_data, all_trj, geom, save_to)

### trjs all segments dask:

In [None]:
# Build (but do not yet run) one Dask delayed task per 'all' trajectory
# segment; the tasks are collected in `trjs_computed`.
path = pathlib.Path('/Users/amirhsi_mini/trjs/N500D10.0ac0.8-trjs')
path = path.resolve()  # convert a relative path to an absolute one
input_database = str(path)
geometry = 'biaxial'
group = 'all'
hierarchy = '/N*/N*'
if not path.exists():
    raise OSError(f"'{input_database}' path does not exist.")
observations = glob(input_database + hierarchy)
if not observations:
    raise OSError(
        "File not found in "
        f"'{input_database + hierarchy}'"
        )
topologies = organizer.sort_filenames(observations, fmts=['.all.data'])
trajectories = organizer.sort_filenames(observations, fmts=['.all.lammpstrj'])
# 'all' time series and histograms
save_to = analyzer.database_path(input_database, phase='probe', stage='segment', group=None)
# NOTE(review): the segment with this id is probed with `continuous=False`;
# presumably it is the last segment of each simulation — confirm for the data set.
last_segment_id = 14
trjs_computed = []
for topology in topologies:
    topo_info = SumRule(topology[0],geometry=geometry, group=group, lineage='whole')
    save_to_whole = pathlib.Path(save_to + '/' + topo_info.whole)
    try:
        save_to_whole.mkdir(parents=True, exist_ok=False)
    except FileExistsError as error:
        # An existing output directory is reused; it is reported, not fatal.
        print(error)
        print(
            f"Directory '{save_to_whole}'"
            " exist. Files are saved/overwritten to an existing directoy.")
    save_to_whole = str(save_to_whole) + '/'
    for trajectory in trajectories:
        trj_info = SumRule(trajectory[0],geometry=geometry, group=group, lineage='segment')
        # only trajectories that belong to this topology's simulation
        if trj_info.whole != topo_info.whole:
            continue
        # the two original branches differed only in the `continuous` flag
        trj_delayed = delayed(prober.probe_all_new)(
            topology[0], trajectory[0], geometry, 'segment', save_to_whole,
            continuous=(trj_info.segment_id != last_segment_id)
        )
        trjs_computed.append(trj_delayed)

In [None]:
# Run all the delayed probe tasks collected in `trjs_computed`.
results = compute(trjs_computed)

# The analyze phase:
In this phase, the segment files produced in the probe phase are merged into whole files. The ensemble, ensemble-averaged, and space files are then created from the whole files.

## bugs:

In [4]:
%%time
# takes 25 min with nlags=100000
input_database = '/Users/amirhsi_mini/probe/N500D10.0ac1.0-segment/'
#input_database = '../test_data/probe/N2000D30.0ac4.0-segment/'
non_scalar_properties_bug = [
    # property_, species, group
    ('principalT', 'Mon', 'bug'),
]

acf_tseries_properties_bug = [
    # property_, species, group
    ('fsdT', 'Mon', 'bug'),
    ('gyrT', 'Mon', 'bug'),
    ('rfloryT', 'Mon', 'bug'),
    ('shapeT', 'Mon', 'bug'),
    ('asphericityT', 'Mon', 'bug')
]

hist_properties_bug = [
    # direction, species, group
    ('rflory', 'Mon', 'bug')
]
geometry = 'biaxial'
analyzer.analyze_segments_bug(
    input_database,
    #non_scalar_properties=non_scalar_properties_bug,
    acf_tseries_properties=acf_tseries_properties_bug,
    #hist_properties=hist_properties_bug,
    geometry=geometry,
    hierarchy='N*/N*',
    nlags=100000
)

[Errno 17] File exists: '//Users/amirhsi_mini/analysis/N500D10.0ac1.0-bug-wholeSim'
Directory '//Users/amirhsi_mini/analysis/N500D10.0ac1.0-bug-wholeSim' exist. Files are saved/overwritten to an existing directory.
[Errno 17] File exists: '//Users/amirhsi_mini/analysis/N500D10.0ac1.0-bug-ens'
Directory '//Users/amirhsi_mini/analysis/N500D10.0ac1.0-bug-ens' exist. Files are saved/overwritten to an existing directory.
[Errno 17] File exists: '//Users/amirhsi_mini/analysis/N500D10.0ac1.0-bug-ensAvg'
Directory '//Users/amirhsi_mini/analysis/N500D10.0ac1.0-bug-ensAvg' exist. Files are saved/overwritten to an existing directory.


ValueError: All arrays must be of the same length

# AllInOne Files

## Faking nc=0 for some spaces:
Faking is done at the **ens** and **ensAvg** levels (and directories), not at the **segment** and **whole** levels, so the **segment stamps** are not modified, but the **whole** and **ens** ones are modified.

#### Naming convention:
This is the pattern of file or directory names:

1. **whole** files: whole-group-property_[-measure][-stage][.ext]
2. **ensemble** files: ensemble-group-property_[-measure][-stage][.ext]
3. **ensemble_long** files: ensemble_long-group-property_[-measure][-stage][.ext]
4. **space** files: space-group-property_[-measure][-stage][.ext]
5. **all in one** files: **allInOne**-group-property_[-measure][-stage][.ext]

[keyword] means that the keyword in the file name is optional. [-measure] is a physical measurement, such as the autocorrelation function (ACF), done on the physical 'property_'.

### settings:

In [8]:
# list of unique property_measures:
database = '/Users/amirhsi_mini/analysis/'
bug_property_measures = glob(database+"/N*-ensAvg"+"/N*.csv")
# Take the file name without '.csv', drop its first two '-'-separated fields
# and keep the rest as the property-measure key. A set removes duplicates.
bug_property_measures = {
    "-".join(csv_path.split("/")[-1].split(".csv")[0].split("-")[2:])
    for csv_path in bug_property_measures
}
# The stamps entry is excluded from the property measures. discard() is used
# instead of list.remove() so an empty glob result or a missing stamps file
# does not raise ValueError.
bug_property_measures.discard("stamps-ensAvg")
bug_property_measures = sorted(bug_property_measures)

#### allInOne ensAvg stamps:

In [22]:
# Read every ensAvg stamps file and stack them row-wise into one dataframe.
database = '/Users/amirhsi_mini/analysis/'
spaces_stamps = glob(database+"/N*-ensAvg"+"/N*-stamps-ensAvg.csv")
allInOne_stamps = pd.concat(
    [pd.read_csv(stamps_csv) for stamps_csv in spaces_stamps],
    axis=0
)
allInOne_stamps

Unnamed: 0,ensemble_long,ensemble,space,n_segments,nmon,epsilon,dcyl,lcyl,dcrowd,ncrowd,dt,bdump,adump,n_ensembles,dmon,phi_m_bulk,rho_m_bulk,phi_c_bulk,rho_c_bulk,n_frames
0,N2000epsilon5.0r15.5lz379.5sig4.0nc0dt0.005bdu...,N2000D30.0ac4.0nc0,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,0,0.005,1000,5000,8,1.0,0.001952,0.003728,0.0,0.0,300001
1,N2000epsilon5.0r15.5lz379.5sig4.0nc1602dt0.005...,N2000D30.0ac4.0nc1602,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,1602,0.005,1000,5000,8,1.0,0.001952,0.003728,0.100061,0.002986,300001
2,N2000epsilon5.0r15.5lz379.5sig4.0nc2402dt0.005...,N2000D30.0ac4.0nc2402,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,2402,0.005,1000,5000,8,1.0,0.001952,0.003728,0.15003,0.004477,300001
3,N2000epsilon5.0r15.5lz379.5sig4.0nc3203dt0.005...,N2000D30.0ac4.0nc3203,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,3203,0.005,1000,5000,8,1.0,0.001952,0.003728,0.200061,0.00597,300001
4,N2000epsilon5.0r15.5lz379.5sig4.0nc3603dt0.005...,N2000D30.0ac4.0nc3603,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,3603,0.005,1000,5000,8,1.0,0.001952,0.003728,0.225045,0.006716,300001
5,N2000epsilon5.0r15.5lz379.5sig4.0nc4003dt0.005...,N2000D30.0ac4.0nc4003,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,4003,0.005,1000,5000,8,1.0,0.001952,0.003728,0.250029,0.007461,300001
6,N2000epsilon5.0r15.5lz379.5sig4.0nc4403dt0.005...,N2000D30.0ac4.0nc4403,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,4403,0.005,1000,5000,8,1.0,0.001952,0.003728,0.275013,0.008207,300001
7,N2000epsilon5.0r15.5lz379.5sig4.0nc4804dt0.005...,N2000D30.0ac4.0nc4804,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,4804,0.005,1000,5000,8,1.0,0.001952,0.003728,0.30006,0.008954,300001
8,N2000epsilon5.0r15.5lz379.5sig4.0nc5204dt0.005...,N2000D30.0ac4.0nc5204,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,5204,0.005,1000,5000,8,1.0,0.001952,0.003728,0.325044,0.0097,300001
9,N2000epsilon5.0r15.5lz379.5sig4.0nc5604dt0.005...,N2000D30.0ac4.0nc5604,N2000D30.0ac4.0,2.0,2000,5.0,30.0,759.0,4.0,5604,0.005,1000,5000,8,1.0,0.001952,0.003728,0.350028,0.010445,300001


### chain-size timeseries and their associated measures

In [6]:
# Split the property measures: first every measure whose name contains "-acf"
# (the ACF itself and its confidence-interval measures) ...
bug_property_acfs = sorted(
    measure for measure in bug_property_measures if "-acf" in measure
)
print(bug_property_acfs)
# ... then the chain time series: one '<property>-ensAvg' name for each plain
# "-acf-" measure.
bug_properties = sorted(
    measure.split("-")[0] + '-ensAvg'
    for measure in bug_property_acfs
    if "-acf-" in measure
)
print(bug_properties)

['asphericityTMon-acf-ensAvg', 'asphericityTMon-acfLowerCi-ensAvg', 'asphericityTMon-acfUpperCi-ensAvg', 'fsdTMon-acf-ensAvg', 'fsdTMon-acfLowerCi-ensAvg', 'fsdTMon-acfUpperCi-ensAvg', 'gyrTMon-acf-ensAvg', 'gyrTMon-acfLowerCi-ensAvg', 'gyrTMon-acfUpperCi-ensAvg', 'rfloryTMon-acf-ensAvg', 'rfloryTMon-acfLowerCi-ensAvg', 'rfloryTMon-acfUpperCi-ensAvg', 'shapeTMon-acf-ensAvg', 'shapeTMon-acfLowerCi-ensAvg', 'shapeTMon-acfUpperCi-ensAvg']
['asphericityTMon-ensAvg', 'fsdTMon-ensAvg', 'gyrTMon-ensAvg', 'rfloryTMon-ensAvg', 'shapeTMon-ensAvg']


In [7]:
# allInOne timeseries for chain-size statistics:
# one dataframe per property, joined column-wise and written to a single csv.
group = 'bug'
geometry = 'biaxial'
ensAvg_path = "/Users/amirhsi_mini/analysis/N2000D30.0ac4.0-bug-ensAvg"
ensAvgs = pd.concat(
    [
        organizer.all_in_one_tseries(
            ensAvg_path,
            property_measure,
            group=group,
            geometry=geometry,
            save_to=None
        )
        for property_measure in bug_properties
    ],
    axis=1
)
# keep only the first occurrence of any duplicated column:
is_duplicate = ensAvgs.columns.duplicated()
ensAvgs = ensAvgs.loc[:, ~is_duplicate]
output_name = database + "allInOne-bug-chainSize.csv"
ensAvgs.to_csv(output_name, index=False)

KeyboardInterrupt: 

In [None]:
# all in one timeseries for chain-size acf statistics:
# one dataframe per acf measure, joined column-wise and written to a single csv.
group = 'bug'
geometry = 'biaxial'
ensAvg_path = "/Users/amirhsi_mini/analysis/N2000D30.0ac4.0-bug-ensAvg"
ensAvgs = pd.concat(
    [
        organizer.all_in_one_tseries(
            ensAvg_path,
            property_measure,
            group=group,
            geometry=geometry,
            save_to=None
        )
        for property_measure in bug_property_acfs
    ],
    axis=1
)
# keep only the first occurrence of any duplicated column:
is_duplicate = ensAvgs.columns.duplicated()
ensAvgs = ensAvgs.loc[:, ~is_duplicate]
output_name = database + "allInOne-bug-chainSize-acf.csv"
ensAvgs.to_csv(output_name, index=False)

In [None]:
# parallel version has memory leak issue.
%%time
# This has memory leaking issue
group = 'bug'
geometry = 'biaxial'
ensAvg_path = "/Users/amirhsi_mini/analysis/N2000D30.0ac4.0-bug-ensAvg"
all_in_one_computed = []
for property_measure in bug_property_measures:
    all_in_one_delayed = delayed(organizer.all_in_one_tseries)(
        ensAvg_path,
        property_measure,
        group = group,
        geometry = geometry,
        save_to = database
    )
    all_in_one_computed.append(all_in_one_delayed)
_ = compute(all_in_one_computed)

## Distributions

In [None]:
#hist_paths = glob('/Users/amirhsi_mini/probe/N500D10.0ac0.8-segment/N500epsilon5.0r5.5lz205.5sig0.8nc12012dt0.002bdump1000adump5000ens1/N500epsilon5.0r5.5lz205.5sig0.8nc12012dt0.002bdump1000adump5000ens1*')
hist_paths = glob('/Users/amirhsi_mini/probe/N500D10.0ac0.8-segment/N500epsilon5.0r5.5lz205.5sig0.8nc48047dt0.002bdump1000adump5000ens1/N500epsilon5.0r5.5lz205.5sig0.8nc48047dt0.002bdump1000adump5000ens1*')
species = 'Crd'
direction = 'z'
geometry='biaxial'
group='all'

# segment files holding the histogram counts and the matching bin edges
segments = organizer.sort_filenames(
    hist_paths, fmts=[f'-{direction}Hist{species}.npy']
)
edge_segments = organizer.sort_filenames(
    hist_paths, fmts=[f'-{direction}Edge{species}.npy']
)

# combine the segments into 'whole' histograms and 'whole' bin edges
wholes = organizer.whole(
    f'{direction}Hist{species}',
    segments,
    geometry=geometry,
    group=group,
    relation='histogram',
    save_to=None
)
edge_wholes = organizer.whole(
    f'{direction}Edge{species}',
    edge_segments,
    geometry=geometry,
    group=group,
    relation='bin_edge',
    save_to=None
)

# 'whole' dataframes, each with a 'whole' columns.
rho_wholes, phi_wholes = distributions.distributions_generator(
    wholes,
    edge_wholes,
    group,
    species,
    geometry,
    direction,
    save_to=None,
    normalized=True
)

In [None]:
# Display the entry of `wholes` stored under this simulation's name.
wholes['N500epsilon5.0r5.5lz205.5sig0.8nc48047dt0.002bdump1000adump5000ens1']

In [None]:
# Display all the 'whole' bin edges returned by organizer.whole.
edge_wholes

In [None]:
# Draw the pre-binned histogram: the left bin edges are used as samples and
# the stored counts as their weights.
whole_name = 'N500epsilon5.0r5.5lz205.5sig0.8nc48047dt0.002bdump1000adump5000ens1'
bin_edges = edge_wholes[whole_name]
plt.hist(
    bin_edges[:-1],
    bin_edges,
    weights=wholes[whole_name],
    histtype='step',
    density=True
)
plt.show()

In [None]:
# Same pre-binned histogram drawn with seaborn: the left bin edges are the
# samples and the stored counts are their weights.
whole_name = 'N500epsilon5.0r5.5lz205.5sig0.8nc48047dt0.002bdump1000adump5000ens1'
bin_edges = edge_wholes[whole_name]
sns.histplot(
    bin_edges[:-1],
    bins=bin_edges,
    weights=wholes[whole_name]
)
plt.show()

In [None]:
# Line plot of the histogram counts against the bin centers, saved as a PDF.
whole_name = 'N500epsilon5.0r5.5lz205.5sig0.8nc48047dt0.002bdump1000adump5000ens1'
bin_edges = edge_wholes[whole_name]
centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])  # midpoint of every bin

# One table with the histogram, number density and volume fraction columns.
hist_df = pd.DataFrame(wholes)
rho_df = pd.DataFrame(rho_wholes)
phi_df = pd.DataFrame(phi_wholes)
df = pd.concat([hist_df, rho_df, phi_df], axis=1)
df.columns = ['histogram', 'number_density', 'volume_fraction']
df['center'] = centers
# NOTE(review): 'fake' is not used by this cell; kept because `df` is
# displayed by later cells — confirm whether it is still needed.
df['fake'] = 1

# A bare sns.axes_style(...) call only returns the style parameters and has no
# effect. Used as a context manager it applies the style to the axes created
# inside the block.
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(8, 6))
sns.lineplot(x='center', y='histogram', data=df, ax=axes)
axes.set_xlabel('z (a.u.)')
axes.set_ylabel('Frequency of type-1 particles')
fig.savefig('histogram.pdf', dpi=200)

In [None]:
# Display the combined histogram / density / volume-fraction table.
df

In [None]:
# Display the last two bin centers.
df['center'][-2:]

In [None]:

# Build a single Distribution object from one 'whole' histogram and its edges.
# NOTE(review): the histograms loaded in the Distributions cell come from the
# 'nc48047' simulation, while this name refers to 'nc36036' — confirm that
# `wholes` and `edge_wholes` actually contain this key before running.
name = 'N500epsilon5.0r5.5lz205.5sig0.8nc36036dt0.002bdump1000adump5000ens1'
hist_info = SumRule(name, geometry='biaxial', group='all', lineage='whole')
dist_new = distributions.Distribution(
    wholes[name],
    # `edges` is not defined anywhere in the notebook; the bin edges produced
    # by organizer.whole are stored in `edge_wholes`.
    edge_wholes[name],
    hist_info,
    'dcrowd',
    geometry='biaxial',
    direction='z',
    normalized=False)

### whole_from_Segments

In [None]:
# Check that the segment database contains files, then run the 'whole' analysis.
input_database = '/Users/amirhsi_mini/probe/N500D10.0ac0.8-segment'
geometry = 'biaxial'
hierarchy = '/N*/N*'
lineage = 'segment'
observations = glob(input_database + hierarchy)
if not observations:
    raise OSError(f"File not found in '{input_database + hierarchy}'")
#save_to = analyzer.database_path(input_database, phase='analysis', stage='wholeSim', group=group)
#analyzer.analyze_segments(input_database, geometry, hierarchy)
analyzer.analyze_wholes(input_database, geometry, hierarchy)

In [None]:
from dask.distributed import Client
from dask import delayed
from dask import compute
# Create a Dask distributed client with 4 workers and display its summary.
# NOTE(review): an identical client is already created near the top of the
# notebook; running both presumably starts a second one — confirm it is needed.
client = Client(n_workers=4)
client

### This script is used in GNU-Parallel

In [None]:
from glob import glob
from PipeLine import *

# ---- bug files ----
fname = PipeLine.file_reader(glob("../N*.bug.*")) # This is a list with one member

save_to="./"
geom = 'cylindrical'
print(fname)
# The single member is a tuple holding a trj and data pair.
PipeLine.extract_trj_bug(fname[0], geom, save_to)
#PipeLine.bug_trj_rmsd(fname[0], geom, save_to) 

# ---- all files: several trajectory files share one data file ----
trj_files = glob("./N*all.lammpstrj")
all_tuples = PipeLine.file_reader(trj_files,extensions=['lammpstrj'])
all_trjs = [all_tuple[0] for all_tuple in all_tuples]

data_file = glob("./N*.all.data")
all_data = PipeLine.file_reader(data_file,extensions=['all.data'])[0][0]

for all_trj in all_trjs:
    print(all_trj)
    PipeLine.extract_trj_all(all_data, all_trj, geom, save_to)

## Standard approach: Running on clusters: extraction from organized *trjs_all* and *trjs_bug* directories

### These do not work properly on the Graham cluster but work well on the iMac mini

### 1. Extract from an organized *trjs_bug* directory:

In [None]:
# This script extracts different bug information from pairs (topology and trajectory) of bug simulation files in one or more organized *trjs_bug* directories.
from pathlib import Path
import os
from glob import glob
from PipeLine import *
from dask.distributed import Client
from dask import delayed
from dask import compute

cores = 32
print(f"number of workers set to {cores}; is this the same requested cores on the cluster?")
client = Client(n_workers=cores)
home = str(Path.home())
cwdir = str(Path.cwd())
# information extraction from simulations
geom = 'cylindrical'
fname = glob(home+'/amirhsi_rrg/cylinder_simulations/N*-trjs_bug/N*bug*')
bug_pairs = PipeLine.file_reader(fname) # each bug_pair is a pair of trajectory and topology file.
trjs_computed = []
bug_dir = 'extraction_bug/'
for bug_pair in bug_pairs:
    # simulation name: the part of the file name that precedes 'bug'
    sim_name = bug_pair[0].split("/")[-1].split('bug')[0]
    # str(Path.cwd()) has no trailing separator, so plain string concatenation
    # would fuse the working directory and 'extraction_bug/' into one wrong
    # name; join the parts as path components instead.
    sim_dir = Path(cwdir) / bug_dir / sim_name
    # exist_ok=False: an already existing output directory raises.
    sim_dir.mkdir(parents=True, exist_ok=False)
    sim_save_to = str(sim_dir) + "/"
    trj_delayed = delayed(PipeLine.extract_trj_bug)(bug_pair, geom,sim_save_to)
    trjs_computed.append(trj_delayed)
results = compute(trjs_computed)

### 2. Extract from an organized *trjs_all* directory:

In [None]:
# This script extracts different information from the topology and trajectory files of 'all' simulations in one or more organized *trjs_all* directories.
from pathlib import Path
import os
from glob import glob
from PipeLine import *
from dask.distributed import Client
from dask import delayed
from dask import compute

cores = 32
print(f"number of workers set to {cores}; is this the same requested cores on the cluster?")
client = Client(n_workers=cores)
home = str(Path.home())
cwdir = str(Path.cwd())
sim_all_dirs = glob(home+'/amirhsi_rrg/cylinder_simulations/N*-trjs_all/N*/')
geom = 'cylindrical'

trjs_computed = []
all_extraction_dir = 'extraction_all/'
for sim_all_dir in sim_all_dirs:
    # `sim_all_dir` is a path string, so indexing it with [0] only yields its
    # first character. Path.name gives the directory's own name even though
    # the glob pattern ends with '/'.
    sim_name = Path(sim_all_dir).name
    all_trjs = glob(sim_all_dir+"N*.lammpstrj")
    all_trjs = PipeLine.file_reader(all_trjs,extensions=['lammpstrj'])
    all_trjs = [all_trj[0] for all_trj in all_trjs]

    # one topology file is shared by all trajectory segments of a simulation
    all_topology = glob(sim_all_dir+"N*.all.data")
    all_topology = PipeLine.file_reader(all_topology,extensions=['all.data'])
    all_topology = all_topology[0][0]

    # str(Path.cwd()) has no trailing separator; join the parts as path
    # components instead of concatenating strings.
    sim_extract_dir = Path(cwdir) / all_extraction_dir / sim_name
    # exist_ok=False: an already existing output directory raises.
    sim_extract_dir.mkdir(parents=True, exist_ok=False)
    sim_save_to = str(sim_extract_dir) + "/"

    for all_trj in all_trjs:
        # was `sim_save_To`, an undefined name
        trj_delayed = delayed(PipeLine.extract_trj_all)(all_topology, all_trj, geom, sim_save_to)
        trjs_computed.append(trj_delayed)

results = compute(trjs_computed)

## Extraction from *extraction_bug* directory after a simulation

In [None]:
home = str(Path.home())
# str(Path.home()) has no trailing separator, so the simulation directory is
# joined as a path component rather than appended to the string.
path = str(Path(home) / 'N2000epsilon5.0r10.5lz336sig1.0nc100800dt0.005bdump1000adump5000ens1')
fname = glob(path+"/N*.bug.*")
fname = PipeLine.file_reader(fname) # This is a list with one member
geom = 'cylindrical'
print(fname)
# The single member is a tuple holding a trj and data pair.
PipeLine.extract_trj_bug(fname[0], geom)
PipeLine.rmsd_trj_bug(fname[0], geom)

In [None]:
# All the segments of one trajectory: M dump files + one data file.
path='/Users/amirhsi_mini/N2000epsilon5.0r10.5lz336sig1.0nc100800dt0.005bdump1000adump5000ens*'
geom = 'cylindrical'
trj_files = glob(path+"/N*.lammpstrj")
all_tuples = PipeLine.file_reader(trj_files, extensions=['lammpstrj'])
all_trjs = [all_tuple[0] for all_tuple in all_tuples]
data_file = glob(path+"/N*.all.data")
# the first entry of the first tuple is the data file shared by all segments
all_data = PipeLine.file_reader(data_file, extensions=['all.data'])[0][0]
for all_trj in all_trjs:
    PipeLine.extract_trj_all(all_data, all_trj, geom)

## New approach: tested on iMac Pro:

### A single ensemble with one or more segments with one data file

In [None]:
path='/Users/amirhsi_mini/N2000epsilon5.0r15.5lz379.5sig6.0nc1068dt0.005bdump1000adump5000ens*'
#path='/Users/amirhsi_mini/N1000epsilon5.0r8.0lz308.5sig2.0nc10412dt0.005bdump1000adump5000ens*'
geom = 'cylindrical'
trj_files = glob(path+"/N*all*")
all_pairs = PipeLine.file_reader(trj_files)
# One delayed task per pair; the second member of a pair is passed first.
trjs_computed = [
    delayed(PipeLine.extract_all_trj_polymer_cog_fsd)(all_pair[1], all_pair[0], geom)
    for all_pair in all_pairs
]

In [None]:
%%time
# Run all the delayed extraction tasks collected in `trjs_computed`.
results = compute(trjs_computed)

### N ensemble with N data file, each ensemble with one or more segments 

In [None]:
path='/Users/amirhsi_mini/N2000epsilon5.0r15.5lz379.5sig6.0nc1068dt0.005bdump1000adump5000ens*'
#path='/Users/amirhsi_mini/N1000epsilon5.0r8.0lz308.5sig2.0nc10412dt0.005bdump1000adump5000ens*'
geom = 'cylindrical'

# trajectory files: keep the first member of every tuple from file_reader
trj_pathes = glob(path+"/N*all*")
trjs = [
    trj[0]
    for trj in PipeLine.file_reader(trj_pathes, extensions=['lammpstrj'])
]

# topology files: same treatment
topology_pathes = glob(path+"/N*.all.data")
topologies = [
    topology[0]
    for topology in PipeLine.file_reader(topology_pathes, extensions=['all.data'])
]

In [None]:
def simulation(pair):
    """
    Label the two members of a (topology, trajectories) pair.

    Parameters:
    pair (sequence): an indexable object whose first item is the "all"
    topology of a simulation and whose second item is the "all" trajectories
    of that topology.

    Return:
    a dict with the first item under 'topology' and the second item under
    'trajectories'.
    """
    topology = pair[0]
    trajectories = pair[1]
    return dict(topology=topology, trajectories=trajectories)

In [None]:
# Ensemble names: the topology file names up to '.all', de-duplicated while
# keeping their first-seen order.
ens_names = list(dict.fromkeys(
    topology.split("/")[-1].split('.all')[0] for topology in topologies
))
# For each ensemble, the trajectories whose file name (up to '.all') matches it.
trjs_per_ens = [
    [trj for trj in trjs if trj.split("/")[-1].split(".all")[0] == ens_name]
    for ens_name in ens_names
]
# Map every ensemble name to its {'topology', 'trajectories'} dict.
ensembles = {
    ens_name: simulation(pair)
    for ens_name, pair in zip(ens_names, zip(topologies, trjs_per_ens))
}

In [None]:
geom = 'cylindrical'
# One delayed extraction task per trajectory segment of every ensemble.
trjs_computed = [
    delayed(PipeLine.extract_trj_all)(ensemble['topology'], trj_segment, geom)
    for ensemble in ensembles.values()
    for trj_segment in ensemble['trajectories']
]

In [None]:
%%time
# Run all the delayed extraction tasks collected in `trjs_computed`.
results = compute(trjs_computed)