In [3]:
import random
import os
import pandas as pd
import progressbar
import time

## Aging Data

In [2]:
# --- Configuration for the aging dataset --------------------------------------
# Directory the scramble cell writes `<replicate>_scramble-<i>.csv` files into.
scramble_output_dir = '/Users/akre96/Data/HSC_aging_project/aging_and_10x/rna_seq/scrambles'
# Directory the bridging cell reads `<replicate>_normalized-matrix.csv` and
# `<replicate>_bridge.txt` from.
normalized_rna_seq_input_dir = '/Users/akre96/Data/HSC_aging_project/aging_and_10x/rna_seq/experiments/normalized'
# Directory the bridging cell writes `<replicate>_bridged.csv` into; the scramble
# cell later lists this directory and processes every CSV it finds.
bridged_output_dir = '/Users/akre96/Data/HSC_aging_project/aging_and_10x/rna_seq/experiments/bridged'

# Replicate identifiers; each one is used as the file-name prefix of its inputs
# and outputs.
technical_replicates = [
    'AN8',
    'AN9'
]
# Number of scrambled copies generated per bridged file.
n_scrambles = 5

### Bridging Data by matching cell barcodes (cell marker via 10x) to tracking barcodes (clone marker)

In [5]:
# For every replicate: load its normalized expression matrix, derive the cell
# barcode (CBC) of each row, attach the tracking barcode (TBC) and count column
# (n) from the bridge file, and save the bridged table as CSV.
# Reads the module-level names `technical_replicates`,
# `normalized_rna_seq_input_dir` and `bridged_output_dir`.

# Make sure the destination exists so `to_csv` does not fail after the
# (slow) load and merge have already been done.
os.makedirs(bridged_output_dir, exist_ok=True)

for replicate in technical_replicates:
    print('Working on', replicate)
    print('\t...Importing Normalized Matrix')
    matrix_file_name = os.path.join(
        normalized_rna_seq_input_dir,
        replicate + '_normalized-matrix.csv'
    )
    rna_matrix = pd.read_csv(matrix_file_name)
    print('\t...Shape Before Bridge:', rna_matrix.shape)

    # The first (unnamed) CSV column holds the cell identifier; the CBC is the
    # part before the first '-'.
    rna_matrix['CBC'] = rna_matrix['Unnamed: 0'].str.split('-').str[0]

    print('\t...Importing Bridge')

    bridge_file = os.path.join(
        normalized_rna_seq_input_dir,
        replicate + '_bridge.txt'
    )
    bridge = pd.read_csv(bridge_file, sep='\t', header=None, names=['CBC', 'TBC', 'n'])

    print('\t...Merging Bridge, shape:', bridge.shape)

    # Without an explicit key, merge() joins on *every* shared column name, so a
    # gene column called 'TBC' or 'n' would silently become part of the join
    # key. Fail loudly instead and join on the cell barcode only.
    clashing_cols = (set(bridge.columns) & set(rna_matrix.columns)) - {'CBC'}
    if clashing_cols:
        raise ValueError(
            'Bridge columns collide with matrix columns in '
            + replicate + ': ' + ', '.join(sorted(clashing_cols))
        )
    rna_matrix = rna_matrix.merge(
        bridge,
        on='CBC',
        how='inner',
        validate='m:1'  # each CBC may appear at most once in the bridge file
    )

    # Put the barcode columns first and drop the raw identifier column.
    first_cols = ['CBC', 'TBC', 'n']
    non_gene_cols = set(first_cols + ['Unnamed: 0'])
    gene_cols = [c for c in rna_matrix.columns if c not in non_gene_cols]
    col_order = first_cols + gene_cols
    rna_matrix = rna_matrix[col_order]
    bridged_matrix_filename = os.path.join(
        bridged_output_dir,
        replicate + '_bridged.csv'
    )

    print('\t...Saving Bridged Data')
    print('\t...Shape After Bridge:', rna_matrix.shape)
    rna_matrix.to_csv(bridged_matrix_filename, index=False)
print('Done!')

Working on AN8
	...Importing Normalized Matrix
	...Shape Before Bridge: (4303, 10891)
	...Importing Bridge
	...Merging Bridge, shape: (51, 3)
	...Saving Bridged Data
	...Shape After Bridge: (51, 10893)
Working on AN9
	...Importing Normalized Matrix
	...Shape Before Bridge: (5609, 11707)
	...Importing Bridge
	...Merging Bridge, shape: (261, 3)
	...Saving Bridged Data
	...Shape After Bridge: (258, 11709)
Done!


### Generating Scrambles on bridged data

In [9]:
# For every bridged CSV, write `n_scrambles` copies in which each cell barcode
# (CBC) is assigned a tracking barcode (TBC) drawn uniformly at random, with
# replacement, from the unique TBCs present in that file.
# Reads the module-level names `bridged_output_dir`, `scramble_output_dir` and
# `n_scrambles`.
# NOTE(review): no random seed is set, so the scrambles differ between runs;
# consider calling random.seed(...) in the configuration cell if they need to be
# reproducible.
start = time.process_time()

# Make sure the destination exists before doing any work.
os.makedirs(scramble_output_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for a stable run order.
for fname in sorted(os.listdir(bridged_output_dir)):
    if os.path.splitext(fname)[1] != '.csv':
        continue
    # File names look like `<replicate>_bridged.csv`.
    replicate = fname.split('_')[0]
    print('Scrambling Run:', replicate)

    mapped_matrix = pd.read_csv(
        os.path.join(
            bridged_output_dir,
            fname
        )
    )

    cells = mapped_matrix['CBC'].unique()
    clones = mapped_matrix['TBC'].unique().tolist()

    for i in range(n_scrambles):
        print('\t...Scramble', i)
        # One random clone per *unique* cell barcode.
        cell_to_clone = {cell: random.choice(clones) for cell in cells}

        scramble_matrix = mapped_matrix.copy()
        # Map the per-cell draw back onto the rows. Assigning the list of draws
        # directly raises a length-mismatch error when a CBC occurs on more than
        # one row; when every CBC is unique the result is the same as before.
        scramble_matrix['TBC'] = mapped_matrix['CBC'].map(cell_to_clone)

        scramble_file = os.path.join(
            scramble_output_dir,
            replicate + '_scramble' + '-' + str(i) + '.csv'
        )
        scramble_matrix.to_csv(scramble_file, index=False)
print('Done!')
# process_time() counts CPU time of this process, not wall-clock time.
print('Time Elapsed:', round(time.process_time() - start), 'seconds')

Scrambling Run: AN8
	...Scramble 0
	...Scramble 1
	...Scramble 2
	...Scramble 3
	...Scramble 4
Scrambling Run: AN9
	...Scramble 0
	...Scramble 1
	...Scramble 2
	...Scramble 3
	...Scramble 4
Done!
Time Elapsed: 24 seconds


## Serial Transplant Data

In [6]:
# --- Configuration for the serial-transplant dataset --------------------------
# These assignments rebind the same three names used by the aging section, so
# every cell below this point reads and writes the serial-transplant folders.
# Directory the scramble cell writes `<replicate>_scramble-<i>.csv` files into.
scramble_output_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/scrambles'
# Directory the bridging cell reads `<replicate>_normalized-matrix.csv` and
# `<replicate>_bridge.txt` from.
normalized_rna_seq_input_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/experiments/normalized'
# Directory the bridging cell writes `<replicate>_bridged.csv` into and the
# scramble cell reads from.
bridged_output_dir = '/Users/akre96/Data/HSC_aging_project/serial_transplant/rna_seq/experiments/bridged'


In [7]:
# Replicate identifiers for the serial-transplant data; each one is used as the
# file-name prefix of its inputs and outputs.
technical_replicates = [
    'AN6',
    'AN7'
]
# Number of scrambled copies generated per bridged file.
n_scrambles = 5

### Bridge Cell Barcodes to Tracking Barcodes in RNA Seq Data and Save bridged CSV

In [8]:
# For every replicate: load its normalized expression matrix, derive the cell
# barcode (CBC) of each row, attach the tracking barcode (TBC) and count column
# (n) from the bridge file, and save the bridged table as CSV.
# Reads the module-level names `technical_replicates`,
# `normalized_rna_seq_input_dir` and `bridged_output_dir`.

# Make sure the destination exists so `to_csv` does not fail after the
# (slow) load and merge have already been done.
os.makedirs(bridged_output_dir, exist_ok=True)

for replicate in technical_replicates:
    print('Working on', replicate)
    print('\t...Importing Normalized Matrix')
    matrix_file_name = os.path.join(
        normalized_rna_seq_input_dir,
        replicate + '_normalized-matrix.csv'
    )
    rna_matrix = pd.read_csv(matrix_file_name)
    print('\t...Shape Before Bridge:', rna_matrix.shape)

    # The first (unnamed) CSV column holds the cell identifier; the CBC is the
    # part before the first '-'.
    rna_matrix['CBC'] = rna_matrix['Unnamed: 0'].str.split('-').str[0]

    print('\t...Importing Bridge')

    bridge_file = os.path.join(
        normalized_rna_seq_input_dir,
        replicate + '_bridge.txt'
    )
    bridge = pd.read_csv(bridge_file, sep='\t', header=None, names=['CBC', 'TBC', 'n'])

    print('\t...Merging Bridge, shape:', bridge.shape)

    # Without an explicit key, merge() joins on *every* shared column name, so a
    # gene column called 'TBC' or 'n' would silently become part of the join
    # key. Fail loudly instead and join on the cell barcode only.
    clashing_cols = (set(bridge.columns) & set(rna_matrix.columns)) - {'CBC'}
    if clashing_cols:
        raise ValueError(
            'Bridge columns collide with matrix columns in '
            + replicate + ': ' + ', '.join(sorted(clashing_cols))
        )
    rna_matrix = rna_matrix.merge(
        bridge,
        on='CBC',
        how='inner',
        validate='m:1'  # each CBC may appear at most once in the bridge file
    )

    # Put the barcode columns first and drop the raw identifier column.
    first_cols = ['CBC', 'TBC', 'n']
    non_gene_cols = set(first_cols + ['Unnamed: 0'])
    gene_cols = [c for c in rna_matrix.columns if c not in non_gene_cols]
    col_order = first_cols + gene_cols
    rna_matrix = rna_matrix[col_order]
    bridged_matrix_filename = os.path.join(
        bridged_output_dir,
        replicate + '_bridged.csv'
    )

    print('\t...Saving Bridged Data')
    print('\t...Shape After Bridge:', rna_matrix.shape)
    rna_matrix.to_csv(bridged_matrix_filename, index=False)
print('Done!')

Working on AN6
	...Importing Normalized Matrix
	...Shape Before Bridge: (9430, 12166)
	...Importing Bridge
	...Merging Bridge, shape: (3466, 3)
	...Saving Bridged Data
	...Shape After Bridge: (3446, 12168)
Working on AN7
	...Importing Normalized Matrix
	...Shape Before Bridge: (2865, 12201)
	...Importing Bridge
	...Merging Bridge, shape: (1071, 3)
	...Saving Bridged Data
	...Shape After Bridge: (1060, 12203)
Done!


### Generate Scramble Using Bridged Data

In [38]:
# For every bridged CSV, write `n_scrambles` copies in which each cell barcode
# (CBC) is assigned a tracking barcode (TBC) drawn uniformly at random, with
# replacement, from the unique TBCs present in that file.
# Reads the module-level names `bridged_output_dir`, `scramble_output_dir` and
# `n_scrambles`.
# NOTE(review): no random seed is set, so the scrambles differ between runs;
# consider calling random.seed(...) in the configuration cell if they need to be
# reproducible.
start = time.process_time()

# Make sure the destination exists before doing any work.
os.makedirs(scramble_output_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for a stable run order.
for fname in sorted(os.listdir(bridged_output_dir)):
    if os.path.splitext(fname)[1] != '.csv':
        continue
    # File names look like `<replicate>_bridged.csv`.
    replicate = fname.split('_')[0]
    print('Scrambling Run:', replicate)

    mapped_matrix = pd.read_csv(
        os.path.join(
            bridged_output_dir,
            fname
        )
    )

    cells = mapped_matrix['CBC'].unique()
    clones = mapped_matrix['TBC'].unique().tolist()

    for i in range(n_scrambles):
        print('\t...Scramble', i)
        # One random clone per *unique* cell barcode.
        cell_to_clone = {cell: random.choice(clones) for cell in cells}

        scramble_matrix = mapped_matrix.copy()
        # Map the per-cell draw back onto the rows. Assigning the list of draws
        # directly raises a length-mismatch error when a CBC occurs on more than
        # one row; when every CBC is unique the result is the same as before.
        scramble_matrix['TBC'] = mapped_matrix['CBC'].map(cell_to_clone)

        scramble_file = os.path.join(
            scramble_output_dir,
            replicate + '_scramble' + '-' + str(i) + '.csv'
        )
        scramble_matrix.to_csv(scramble_file, index=False)
print('Done!')
# process_time() counts CPU time of this process, not wall-clock time.
print('Time Elapsed:', round(time.process_time() - start), 'seconds')

Scrambling Mouse: AN7
	...Scramble 0
	...Scramble 1
	...Scramble 2
	...Scramble 3
	...Scramble 4
Scrambling Mouse: AN6
	...Scramble 0
	...Scramble 1
	...Scramble 2
	...Scramble 3
	...Scramble 4
Done!
Time Elapsed: 363 seconds
