In [None]:
# load label file
import numpy as np
import pandas as pd
import random
import pysam
import os

import preprocessing_utils as pp_utils

In [None]:

# Input locations -- edit these two paths to preprocess data stored elsewhere.

# Excel workbook that is read (sheet 'master_table') to build the label table.
covid_path = (
    '/home/coguztuzun/cerag_deconv/covid_data/test_sets/'
    'ScienceDirect_files_17Jun2024_19-51-01.836/'
    '1-s2.0-S2666634021000313-mmc2.xlsx'
)

# Directory that is scanned for '*.bam' files (the trailing slash is kept as-is).
bam_dir = (
    '/res-workspace/res-sandbox/compbio/coguztuzun/'
    'covid_bam_file/s/0702_deconv_test/'
)


In [None]:
#### creating the label df

# Columns kept from the master table besides `sample_id`; their order fixes the
# order of the values inside each `Contributions` list built below.
contribution_cols = ['erythroblast', 'hsc', 'progenitor', 'spleen', 'kidney', 'heart',
                     'skin', 'lung', 'liver', 'pancreas', 'colon', 'megakaryocyte']

label_df = pd.read_excel(covid_path, index_col=0, sheet_name='master_table')
label_df = label_df[['sample_id'] + contribution_cols]
label_df = label_df[label_df['sample_id'].notna()]
# Keep only the MCGILL samples; `na=False` treats non-string ids as "no match".
label_df = label_df[label_df['sample_id'].str.startswith('MCGILL', na=False)]

# sorting df wrt sample_id no
label_df = label_df.reset_index(drop=True)
# Raw string: '\d' inside a plain string literal is an invalid escape sequence.
label_df_sort_indices = label_df.sample_id.str.extract(r'(\d+)').astype(int)
label_df_sort_indices = label_df_sort_indices.sort_values(by=[0])
# Reset the index after reordering so that index label i and position i refer to
# the same row; otherwise `label_df[col][i]` and `label_df.iloc[i]` disagree.
label_df_sorted = label_df.reindex(label_df_sort_indices.index).reset_index(drop=True)

# Collapse the per-tissue columns of each row into a single list-valued column.
label_df_sorted['Contributions'] = label_df_sorted.drop('sample_id', axis = 1).apply(lambda row: row.tolist(), axis = 1)
# Explicit copy so that adding columns to `label_df` later does not raise
# SettingWithCopyWarning or depend on `label_df_sorted`.
label_df = label_df_sorted[['sample_id', 'Contributions']].copy()


# Names of the BAM files present in the input directory.
bam_files = [f for f in os.listdir(bam_dir) if f.endswith('.bam')]

In [None]:

# Hand-curated lookup: BAM file name (SRR accession + '.bam') -> MCGILL sample id.
# Entries are listed in ascending SRR accession order.
sample_mapping = {
    'SRR13308049.bam': 'MCGILL38',
    'SRR13308050.bam': 'MCGILL25',
    'SRR13308051.bam': 'MCGILL3',
    'SRR13308052.bam': 'MCGILL26',
    'SRR13308053.bam': 'MCGILL23',
    'SRR13308054.bam': 'MCGILL18',
    'SRR13308055.bam': 'MCGILL16',
    'SRR13308056.bam': 'MCGILL11',
    'SRR13308057.bam': 'MCGILL8',
    'SRR13308058.bam': 'MCGILL47',
    'SRR13308059.bam': 'MCGILL42',
    'SRR13308060.bam': 'MCGILL39',
    'SRR13308061.bam': 'MCGILL22',
    'SRR13308062.bam': 'MCGILL27',
    'SRR13308063.bam': 'MCGILL24',
    'SRR13308064.bam': 'MCGILL54',
    'SRR13308065.bam': 'MCGILL19',
    'SRR13308066.bam': 'MCGILL14',
    'SRR13308067.bam': 'MCGILL12',
    'SRR13308068.bam': 'MCGILL9',
    'SRR13308069.bam': 'MCGILL6',
    'SRR13308070.bam': 'MCGILL53',
    'SRR13308071.bam': 'MCGILL52',
    'SRR13308072.bam': 'MCGILL21',
    'SRR13308073.bam': 'MCGILL4',
    'SRR13308074.bam': 'MCGILL50',
    'SRR13308075.bam': 'MCGILL48',
    'SRR13308076.bam': 'MCGILL45',
    'SRR13308077.bam': 'MCGILL43',
    'SRR13308078.bam': 'MCGILL40',
    'SRR13308079.bam': 'MCGILL37',
    'SRR13308080.bam': 'MCGILL34',
    'SRR13308081.bam': 'MCGILL2',
    'SRR13308082.bam': 'MCGILL33',
    'SRR13308083.bam': 'MCGILL20',
    'SRR13308084.bam': 'MCGILL1',
    'SRR13308085.bam': 'MCGILL32',
    'SRR13308086.bam': 'MCGILL31',
    'SRR13308087.bam': 'MCGILL30',
    'SRR13308088.bam': 'MCGILL29',
    'SRR13308089.bam': 'MCGILL17',
    'SRR13308090.bam': 'MCGILL15',
    'SRR13308091.bam': 'MCGILL28',
    'SRR13308092.bam': 'MCGILL13',
    'SRR13308093.bam': 'MCGILL10',
    'SRR13308094.bam': 'MCGILL7',
    'SRR13308095.bam': 'MCGILL5',
    'SRR13308096.bam': 'MCGILL51',
    'SRR13308097.bam': 'MCGILL49',
    'SRR13308098.bam': 'MCGILL46',
    'SRR13308099.bam': 'MCGILL44',
    'SRR13308100.bam': 'MCGILL41',
}


In [None]:

# Invert the BAM-file -> sample-id table so each labelled sample can be matched
# to its BAM file name.
key_to_name = {v: k for k, v in sample_mapping.items()}
# `assign` returns a new frame, which avoids SettingWithCopyWarning when
# `label_df` is a column selection of another DataFrame.
label_df = label_df.assign(SRR=label_df['sample_id'].map(key_to_name))

### preprocessing the reads, saving the patient reads as csvs ###

for i in range(label_df.shape[0]):
    # Read every field from the same positional row. Mixing label-based lookups
    # (`label_df['SRR'][i]`) with `label_df.iloc[i]` pairs reads with the wrong
    # labels whenever the index is not 0..n-1 in row order (e.g. after sorting).
    label_row = label_df.iloc[i]
    srr_file = label_row['SRR']
    sample_id = label_row['sample_id']

    # `.map` leaves NaN for sample ids that have no entry in `sample_mapping`.
    if pd.isna(srr_file):
        print(f'{i} skipped: no BAM file mapped to {sample_id}')
        continue

    sequences = pp_utils.get_reads_for_SRR(bam_dir, srr_file)

    # One output row per read: repeat the label fields column-wise (much cheaper
    # than building the frame from len(sequences) copies of a Series).
    new_df = pd.DataFrame({col: [label_row[col]] * len(sequences) for col in label_df.columns})
    new_df['Seq'] = sequences

    new_df.to_csv(f'expanded_sequences_{sample_id}.csv', index = False)
    print(f'{i} encoded!')


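## Memory calculation

Rough estimate of the memory needed for training: model weights (BERT plus the classifier head), activations, optimizer state, and the loaded data.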

In [None]:

# Back-of-the-envelope memory estimate; every *_memory quantity is in bytes.
num_params_bert = 110000000
hidden_size = 768
num_classes = 12
param_size = 4 # 32 bit floats
batch_size = 128
activation_memory_per_sample = 10 * 1e6  # 10 MB per sample
data_memory = 200 * 1e9  # 200 GB

# Bytes for the weights = number of parameters * bytes per parameter.
model_memory_bert = num_params_bert * param_size
# Linear classifier head: a hidden_size x num_classes weight matrix plus one
# bias per class.
classifier_params = hidden_size * num_classes
classifier_bias = num_classes
# Multiply (not add) by the bytes per parameter to turn a parameter count into bytes.
classifier_memory = (classifier_params + classifier_bias) * param_size
total_model_memory = model_memory_bert + classifier_memory

activation_memory = batch_size * activation_memory_per_sample

# NOTE(review): 3x the model size -- presumably gradients plus two optimizer
# moment buffers; confirm against the optimizer actually used.
optimizer_memory = total_model_memory * 3

total_memory = total_model_memory + activation_memory + optimizer_memory + data_memory

print(total_memory / 1e6, 'MB')
