Skip to content

Commit

Permalink
R2: tabulation and aggregation now play together
Browse files Browse the repository at this point in the history
  • Loading branch information
brunobeltran committed Apr 19, 2019
1 parent 5418b3f commit 79c3b03
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 69 deletions.
71 changes: 13 additions & 58 deletions nuc_chain/fluctuations.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,77 +700,32 @@ def get_kuhns_grouped(df, thresh, groups, rmax_col='rmax', r2_col='r2'):
ks['b'] = ks['slope']
return ks

def aggregate_existing_kuhns(glob='*.csv', thresh=5000):
    """Aggregate all Kuhn lengths computable from the r2-tabulation output.

    Scans ``./csvs/r2`` for CSV files whose names match the format string
    used by ``scripts/r2-tabulation.py``, extracts the simulation parameters
    from each file name, and fits a Kuhn length to each table of r2 values
    via :func:`get_kuhns_grouped`.

    Parameters
    ----------
    glob : str
        Glob pattern (relative to ``./csvs/r2``) selecting which tabulated
        r2 files to include.
    thresh : float
        Minimum ``rmax`` used in the Kuhn-length fit; forwarded to
        :func:`get_kuhns_grouped`.

    Returns
    -------
    pd.DataFrame
        Kuhn lengths indexed by
        ``(variance_type, sim_type, mu, sigma, unwrap)``.
    """
    # File names look like "r2-fluct-box-mu_35-sigma_0-0unwraps.csv",
    # optionally with a "-<desc>" suffix before the ".csv" extension
    # (the `desc` group; captured but currently unused).
    r2_format_re = re.compile(
        r'r2-(fluct|geom)-(box|exp)-mu_([0-9]+)-sigma_([0-9]+)'
        r'-([0-9]+)unwraps(-.*)?\.csv'
    )
    kuhns = []
    for path in Path('./csvs/r2').glob(glob):
        try:
            df = pd.read_csv(path)
        except Exception:
            # Best-effort scan: skip unreadable/empty files (e.g. files that
            # were touch()ed as a claim marker but never fully written).
            continue
        match = r2_format_re.search(path.name)
        if match is None:
            print("File name cannot be parsed: " + str(path))
            continue
        sim_type, variance_type, mu, sigma, unwrap, desc = match.groups()
        # Stamp the parameters onto every row so the fit can group by them.
        df['mu'] = mu
        df['sigma'] = sigma
        df['unwrap'] = unwrap
        ks = get_kuhns_grouped(df, thresh=thresh,
                               groups=['mu', 'sigma', 'unwrap'])
        ks = ks.reset_index()
        ks['sim_type'] = sim_type
        ks['variance_type'] = variance_type
        kuhns.append(ks)
    # NOTE: pd.concat raises ValueError if no files matched; that surfaces a
    # misconfigured glob instead of silently returning an empty frame.
    all_ks = [
        ks.set_index(['variance_type', 'sim_type', 'mu', 'sigma', 'unwrap'])
        for ks in kuhns
    ]
    all_ks = pd.concat(all_ks)
    return all_ks

def calculate_kuhn_length_from_fluctuating_r2(df, mu, chain_length, **kwargs):
Expand Down
29 changes: 18 additions & 11 deletions scripts/r2-tabulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,18 @@
from nuc_chain import fluctuations as wlc
from nuc_chain import geometry as ncg

mus = np.arange(31, 60)
mus = np.arange(31, 150)
sigmas = [0]
unwraps = [0]

r2_format_string = 'csvs/r2/r2-{fluct}-{mode}-mu_{mu}-sigma_{sigma}-{unwrap}unwraps.csv'
# NOTE(review): this span is GitHub commit-diff residue pasted as text -- it
# interleaves the PRE-commit and POST-commit versions of save_r2s with no
# +/- markers and with the original indentation stripped, so it is not
# runnable Python as-is.  The "Expand All @@ ... @@" lines are GitHub hunk
# separators hiding unseen source lines; reconstruct from the repository,
# not from this paste.  Comments below tag which version each line is from.
# PRE-commit signature: mode defaulted to 'box', no `desc` parameter.
def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
force_override=False, **kwargs):
# POST-commit signature: default mode is now 'exp'; new `desc` kwarg (default
# None, so backward compatible) appends extra info to the output filename.
def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='exp',
force_override=False, desc=None, **kwargs):
# NOTE(review): `f"""` makes this an f-string EXPRESSION, not a docstring
# (only a plain string literal becomes __doc__), and it contains no
# {placeholders} anyway -- it should be an ordinary """...""" literal.
f"""Tabulates r2 for a bunch of chains with fixed parameters.
extra kwargs are passed to the relevant r2 generation function from wlc or
ncg for each chain.

Parameters
----------
params : Tuple[int, int, int]
Expand All @@ -36,6 +39,8 @@ def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
force_override : bool
whether or not to recompute file if it already exists. this is mostly
useful if you're increasing num_chains or num_nucleosomes.
desc : string
extra information to append to filename, like values of kwargs used
Notes
-----
Expand All @@ -47,16 +52,18 @@ def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
where fluct is 'fluct' or 'geom'.
"""
mu, sigma, unwrap = param
# PRE-commit: rebinding `fluct` to a non-empty (always truthy) string
# clobbered the boolean flag that `if not fluct:` below still tests, making
# the geometrical branches unreachable.
fluct = 'fluct' if fluct else 'geom'
file_name = r2_format_string.format(fluct=fluct, mode=mode, mu=mu, sigma=sigma, unwrap=unwrap)
# POST-commit: `fluct_str` rename fixes that shadowing bug -- the boolean
# stays intact and only the filename uses the string form.
fluct_str = 'fluct' if fluct else 'geom'
file_name = r2_format_string.format(fluct=fluct_str, mode=mode, mu=mu, sigma=sigma, unwrap=unwrap)
# POST-commit: splice the optional "-<desc>" suffix in before ".csv".
if desc is not None:
file_name = file_name[:-4] + '-' + desc + '.csv'

# touch() claims the file so concurrent workers skip it; NOTE(review): the
# exists()/touch() pair is not atomic -- two processes can race between the
# check and the touch.
if Path(file_name).exists() and not force_override:
return
Path(file_name).touch()

if mode == 'box':
if not fluct:
# PRE-commit default chain count:
num_chains = num_chains if num_chains else 400
# POST-commit: default chain count raised to 1000:
num_chains = num_chains if num_chains else 1000
num_linkers = num_linkers if num_linkers else 7500
df = ncg.tabulate_r2_heterogenous_chains_by_variance(num_chains, num_linkers, [sigma], mu=mu, unwraps=unwrap, **kwargs)
elif fluct:
Expand All @@ -65,16 +72,16 @@ def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
df = wlc.tabulate_r2_heterogenous_fluctuating_chains_by_variance(num_chains, num_linkers, [sigma], mu=mu, unwraps=unwrap, **kwargs)
elif mode == 'exp':
sigma = mu # by definition
# PRE-commit: the whole 'exp' mode unconditionally raised (the two `pass`
# lines below are the old placeholder branch bodies):
raise NotImplementedError("this is implemented, just need to go dig up what function to call.")
if not fluct:
pass
# POST-commit: only the geometrical (not fluct) sub-case still raises.
raise NotImplementedError("this is implemented, just need to go dig up what function to call.")
elif fluct:
pass
# POST-commit: exponential fluctuating chains are now implemented.
num_chains = num_chains if num_chains else 100
num_linkers = num_linkers if num_linkers else 7500
df = wlc.tabulate_r2_heterogenous_fluctuating_chains_exponential(num_chains, num_linkers, mu=mu, unwraps=unwrap, **kwargs)
else:
raise NotImplementedError("Invalid mode (linker variance type)")

df['variance'] = sigma
# PRE-commit: wrote the DataFrame index as an extra unnamed CSV column.
df.to_csv(file_name)
# POST-commit: index=False keeps the synthetic RangeIndex out of the CSV so
# downstream aggregation does not pick up a spurious column.
df.to_csv(file_name, index=False)

pool_size = multiprocessing.cpu_count() - 1
with Pool(processes=pool_size) as p:
Expand Down

0 comments on commit 79c3b03

Please sign in to comment.