In [1]:
import random
import os
import pandas as pd
import progressbar
import time

In [31]:
scramble_output_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/scrambles'
normalized_rna_seq_input_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/experiments/normalized'
bridged_output_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/experiments/bridged'
exhaustion_mapped_output_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/experiments/exhaustion_mapped'
label_filt_output_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/experiments/label_filtered'

labels_df = pd.read_csv('/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/exhaustion_labels.csv').rename(columns={'survived': 'label_name'})


In [28]:
technical_replicates = [
    'AN6',
    'AN7'
]
mice = [
    'M12',
    'M16',
    'M7',
    'M6',
    'M3',
    'M5',
]
n_scrambles = 5

## Bridge Cell Barcodes to Tracking Barcodes in RNA Seq Data and Save bridged CSV

In [6]:
for replicate in techincal_replicates:
    print('Working on', replicate)
    print('\t...Importing Normalized Matrix')
    matrix_file_name = os.path.join(
        normalized_rna_seq_input_dir,
        replicate + '_normalized-matrix.csv'
    )
    rna_matrix = pd.read_csv(matrix_file_name)
    rna_matrix['CBC'] = rna_matrix['Unnamed: 0'].str.split('-').str[0]

    print('\t...Importing Bridge')

    bridge_file = os.path.join(
        normalized_rna_seq_input_dir,
        replicate + '_bridge.txt'
    )
    bridge = pd.read_csv(bridge_file, sep='\t', header=None, names=['CBC', 'TBC', 'n'])
    
    print('\t...Merging Bridge')
    rna_matrix = rna_matrix.merge(
        bridge,
        how='inner',
        validate='m:1'
    )
    first_cols = ['CBC', 'TBC', 'n']
    gene_cols = [c for c in rna_matrix.columns if c not in first_cols + ['Unnamed: 0']]
    col_order = first_cols + gene_cols
    rna_matrix = rna_matrix[col_order]
    bridged_matrix_filename = os.path.join(
        bridged_output_dir,
        replicate + '_bridged.csv'
    )
    
    print('\t...Saving Bridged Data')
    rna_matrix.to_csv(bridged_matrix_filename, index=False)
print('Done!')

Working on AN6
	...Importing Normalized Matrix
	...Importing Bridge
	...Merging Bridge
	...Saving Bridged Data
Working on AN7
	...Importing Normalized Matrix
	...Importing Bridge
	...Merging Bridge
	...Saving Bridged Data


## Mapping Bridged Data to Exhuastion Labels

In [29]:
start = time.process_time()
for replicate in technical_replicates:
    print('Mapping Replicate:', replicate)
    
    print('\t...Importing Bridged Data')
    bridged_matrix = pd.read_csv(
        os.path.join(
            bridged_output_dir,
            replicate + '_bridged.csv'    
        )
    )
    
    print('\t...Merging with labels')
    mapped_matrix = bridged_matrix.merge(
        labels_df.rename(columns={'code': 'TBC'}),
        how='inner',
        validate='m:1'
    )
    print('\t...Saving Mapped Data')
    
    mapped_file = os.path.join(
        exhaustion_mapped_output_dir,
        replicate + '_exhaustion_mapped.csv'
    )
    mapped_matrix.to_csv(mapped_file, index=False)
print('Done!')
print('Time Elapsed:', round(time.process_time() - start), 'seconds')

Mapping Replicate: AN6
	...Importing Bridged Data
	...Merging with labels
	...Saving Mapped Data Per Mouse
Mapping Replicate: AN7
	...Importing Bridged Data
	...Merging with labels
	...Saving Mapped Data Per Mouse
Done!
Time Elapsed: 98 seconds


In [35]:
for mouse_id in mice:
    print('Filtering Mouse:', mouse_id)

    mouse_data = pd.DataFrame()
    for replicate in technical_replicates:
        print('\t...Importing Replicate:', replicate)

        mouse_rep_data = pd.read_csv(
            os.path.join(
                exhaustion_mapped_output_dir,
                mouse_id + '-' + replicate + '_exhaustion_mapped.csv'
            )
        )
        mouse_data = mouse_data.append(mouse_rep_data, ignore_index=True)
    
    
    count_df = pd.DataFrame(mouse_data.groupby('label_name').CBC.nunique()).reset_index()
    if count_df.label_name.nunique() < 2:
        print('\t\t1 label only, Skipping', mouse_id)
        continue           
    if not count_df[count_df.CBC < 10].empty:
        print('\t\tnot enough cells, Skipping', mouse_id)
        print(count_df)
        continue
    filt_file = os.path.join(
        label_filt_output_dir,
        mouse_id + '_label_filtered.csv'
    )
    mouse_data.to_csv(filt_file, index=False)
    

Filtering Mouse: M12
	...Importing Replicate: AN6
	...Importing Replicate: AN7
		1 label only, Skipping M12
Filtering Mouse: M16
	...Importing Replicate: AN6
	...Importing Replicate: AN7
		1 label only, Skipping M16
Filtering Mouse: M7
	...Importing Replicate: AN6
	...Importing Replicate: AN7
Filtering Mouse: M6
	...Importing Replicate: AN6
	...Importing Replicate: AN7
Filtering Mouse: M3
	...Importing Replicate: AN6
	...Importing Replicate: AN7
		1 label only, Skipping M3
Filtering Mouse: M5
	...Importing Replicate: AN6
	...Importing Replicate: AN7
		not enough cells, Skipping M5
  label_name  CBC
0  Exhausted    2
1   Survived   15


## Generate Scramble Using Bridged Data

In [38]:
start = time.process_time()
for fname in os.listdir(bridged_output_dir):
    if fname.split('.')[-1] != 'csv':
        continue
    replicate = fname.split('_')[0]
    print('Scrambling Run:', replicate)

    mapped_matrix = pd.read_csv(
        os.path.join(
            bridged_output_dir,
            fname    
        )
    )

    cells = mapped_matrix['CBC'].unique()
    clones = mapped_matrix['TBC'].unique().tolist()

    for i in range(n_scrambles):
        scramble_clones = []
        print('\t...Scramble', i)
        for cell in cells:
            rand_clone = random.sample(clones, 1)[0]
            scramble_clones.append(rand_clone)

        scramble_matrix = mapped_matrix.copy()
        scramble_matrix['TBC'] = scramble_clones

        scramble_file = os.path.join(
            scramble_output_dir,
            replicate + '_scramble' + '-' + str(i) + '.csv'
        )
        scramble_matrix.to_csv(scramble_file, index=False)
print('Done!')
print('Time Elapsed:', round(time.process_time() - start), 'seconds')

Scrambling Mouse: AN7
	...Scramble 0
	...Scramble 1
	...Scramble 2
	...Scramble 3
	...Scramble 4
Scrambling Mouse: AN6
	...Scramble 0
	...Scramble 1
	...Scramble 2
	...Scramble 3
	...Scramble 4
Done!
Time Elapsed: 363 seconds
