Skip to content

Commit

Permalink
R2: tabulation and aggregation now play together
Browse files Browse the repository at this point in the history
  • Loading branch information
brunobeltran committed Apr 19, 2019
1 parent 5418b3f commit 79c3b03
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 69 deletions.
71 changes: 13 additions & 58 deletions nuc_chain/fluctuations.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,77 +700,32 @@ def get_kuhns_grouped(df, thresh, groups, rmax_col='rmax', r2_col='r2'):
ks['b'] = ks['slope']
return ks

def aggregate_existing_kuhns(glob='*.csv', thresh=5000):
    """Aggregate all Kuhn lengths computable from the r2-tabulation output.

    Scans ``./csvs/r2`` for CSV files whose names match the format string
    used by ``scripts/r2-tabulation.py``, extracts the simulation parameters
    from each file name, and fits a Kuhn length to each table of r2 values
    via :func:`get_kuhns_grouped`.

    Parameters
    ----------
    glob : str
        Glob pattern (relative to ``./csvs/r2``) selecting which tabulated
        r2 files to include.
    thresh : float
        Minimum ``rmax`` used in the Kuhn-length fit; forwarded to
        :func:`get_kuhns_grouped`.

    Returns
    -------
    pd.DataFrame
        Kuhn lengths indexed by
        ``(variance_type, sim_type, mu, sigma, unwrap)``.
    """
    # File names look like "r2-fluct-box-mu_35-sigma_0-0unwraps.csv",
    # optionally with a "-<desc>" suffix before the ".csv" extension
    # (the `desc` group; captured but currently unused).
    r2_format_re = re.compile(
        r'r2-(fluct|geom)-(box|exp)-mu_([0-9]+)-sigma_([0-9]+)'
        r'-([0-9]+)unwraps(-.*)?\.csv'
    )
    kuhns = []
    for path in Path('./csvs/r2').glob(glob):
        try:
            df = pd.read_csv(path)
        except Exception:
            # Best-effort scan: skip unreadable/empty files (e.g. files that
            # were touch()ed as a claim marker but never fully written).
            continue
        match = r2_format_re.search(path.name)
        if match is None:
            print("File name cannot be parsed: " + str(path))
            continue
        sim_type, variance_type, mu, sigma, unwrap, desc = match.groups()
        # Stamp the parameters onto every row so the fit can group by them.
        df['mu'] = mu
        df['sigma'] = sigma
        df['unwrap'] = unwrap
        ks = get_kuhns_grouped(df, thresh=thresh,
                               groups=['mu', 'sigma', 'unwrap'])
        ks = ks.reset_index()
        ks['sim_type'] = sim_type
        ks['variance_type'] = variance_type
        kuhns.append(ks)
    # NOTE: pd.concat raises ValueError if no files matched; that surfaces a
    # misconfigured glob instead of silently returning an empty frame.
    all_ks = [
        ks.set_index(['variance_type', 'sim_type', 'mu', 'sigma', 'unwrap'])
        for ks in kuhns
    ]
    all_ks = pd.concat(all_ks)
    return all_ks

def calculate_kuhn_length_from_fluctuating_r2(df, mu, chain_length, **kwargs):
Expand Down
29 changes: 18 additions & 11 deletions scripts/r2-tabulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,18 @@
from nuc_chain import fluctuations as wlc
from nuc_chain import geometry as ncg

mus = np.arange(31, 60)
mus = np.arange(31, 150)
sigmas = [0]
unwraps = [0]

r2_format_string = 'csvs/r2/r2-{fluct}-{mode}-mu_{mu}-sigma_{sigma}-{unwrap}unwraps.csv'
# NOTE(review): this span is GitHub commit-diff residue pasted as text -- it
# interleaves the PRE-commit and POST-commit versions of save_r2s with no
# +/- markers and with the original indentation stripped, so it is not
# runnable Python as-is.  The "Expand All @@ ... @@" lines are GitHub hunk
# separators hiding unseen source lines; reconstruct from the repository,
# not from this paste.  Comments below tag which version each line is from.
# PRE-commit signature: mode defaulted to 'box', no `desc` parameter.
def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
force_override=False, **kwargs):
# POST-commit signature: default mode is now 'exp'; new `desc` kwarg (default
# None, so backward compatible) appends extra info to the output filename.
def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='exp',
force_override=False, desc=None, **kwargs):
# NOTE(review): `f"""` makes this an f-string EXPRESSION, not a docstring
# (only a plain string literal becomes __doc__), and it contains no
# {placeholders} anyway -- it should be an ordinary """...""" literal.
f"""Tabulates r2 for a bunch of chains with fixed parameters.
extra kwargs are passed to the relevant r2 generation function from wlc or
ncg for each chain.

Parameters
----------
params : Tuple[int, int, int]
Expand All @@ -36,6 +39,8 @@ def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
force_override : bool
whether or not to recompute file if it already exists. this is mostly
useful if you're increasing num_chains or num_nucleosomes.
desc : string
extra information to append to filename, like values of kwargs used
Notes
-----
Expand All @@ -47,16 +52,18 @@ def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
where fluct is 'fluct' or 'geom'.
"""
mu, sigma, unwrap = param
# PRE-commit: rebinding `fluct` to a non-empty (always truthy) string
# clobbered the boolean flag that `if not fluct:` below still tests, making
# the geometrical branches unreachable.
fluct = 'fluct' if fluct else 'geom'
file_name = r2_format_string.format(fluct=fluct, mode=mode, mu=mu, sigma=sigma, unwrap=unwrap)
# POST-commit: `fluct_str` rename fixes that shadowing bug -- the boolean
# stays intact and only the filename uses the string form.
fluct_str = 'fluct' if fluct else 'geom'
file_name = r2_format_string.format(fluct=fluct_str, mode=mode, mu=mu, sigma=sigma, unwrap=unwrap)
# POST-commit: splice the optional "-<desc>" suffix in before ".csv".
if desc is not None:
file_name = file_name[:-4] + '-' + desc + '.csv'

# touch() claims the file so concurrent workers skip it; NOTE(review): the
# exists()/touch() pair is not atomic -- two processes can race between the
# check and the touch.
if Path(file_name).exists() and not force_override:
return
Path(file_name).touch()

if mode == 'box':
if not fluct:
# PRE-commit default chain count:
num_chains = num_chains if num_chains else 400
# POST-commit: default chain count raised to 1000:
num_chains = num_chains if num_chains else 1000
num_linkers = num_linkers if num_linkers else 7500
df = ncg.tabulate_r2_heterogenous_chains_by_variance(num_chains, num_linkers, [sigma], mu=mu, unwraps=unwrap, **kwargs)
elif fluct:
Expand All @@ -65,16 +72,16 @@ def save_r2s(param, num_chains=None, num_linkers=None, fluct=True, mode='box',
df = wlc.tabulate_r2_heterogenous_fluctuating_chains_by_variance(num_chains, num_linkers, [sigma], mu=mu, unwraps=unwrap, **kwargs)
elif mode == 'exp':
sigma = mu # by definition
# PRE-commit: the whole 'exp' mode unconditionally raised (the two `pass`
# lines below are the old placeholder branch bodies):
raise NotImplementedError("this is implemented, just need to go dig up what function to call.")
if not fluct:
pass
# POST-commit: only the geometrical (not fluct) sub-case still raises.
raise NotImplementedError("this is implemented, just need to go dig up what function to call.")
elif fluct:
pass
# POST-commit: exponential fluctuating chains are now implemented.
num_chains = num_chains if num_chains else 100
num_linkers = num_linkers if num_linkers else 7500
df = wlc.tabulate_r2_heterogenous_fluctuating_chains_exponential(num_chains, num_linkers, mu=mu, unwraps=unwrap, **kwargs)
else:
raise NotImplementedError("Invalid mode (linker variance type)")

df['variance'] = sigma
# PRE-commit: wrote the DataFrame index as an extra unnamed CSV column.
df.to_csv(file_name)
# POST-commit: index=False keeps the synthetic RangeIndex out of the CSV so
# downstream aggregation does not pick up a spurious column.
df.to_csv(file_name, index=False)

pool_size = multiprocessing.cpu_count() - 1
with Pool(processes=pool_size) as p:
Expand Down

0 comments on commit 79c3b03

Please sign in to comment.