## Purpose: incorporate Prof. Clare Smith's Collaborative Cross (CC) panel TnSeq datasets into the MtbTnDB dataset

In [1]:
import pandas as pd
import os
import re

### Load q-value and log2-fold change matrices that Michael DeJesus generated with Transit: 

In [2]:
# Read the log2 fold-change and q-value matrices that M. DeJesus generated
# from C. Smith's raw datasets.
# NOTE: the matrices have so many columns because Michael ran all possible
# comparisons.
dir_data = '../../data/standardized_data/old_std_data/'
file_logfc = os.path.join(dir_data, 'result_logfc_matrix_2021_05_18.csv')
file_qvals = os.path.join(dir_data, 'result_qval_matrix_2021_05_18.csv')

# The first column of each matrix is relabelled 'Rv_ID' so that both frames
# use the same name for it.
df_logfc = pd.read_csv(file_logfc)
df_logfc = df_logfc.rename(columns={df_logfc.columns[0]: 'Rv_ID'})

df_qvals = pd.read_csv(file_qvals)
df_qvals = df_qvals.rename(columns={df_qvals.columns[0]: 'Rv_ID'})

# Show both shapes side by side.
df_logfc.shape, df_qvals.shape

((4055, 1970), (4055, 1970))

In [6]:
# Columns whose name mentions "in_vitro", leaving out any column whose name
# mentions "zhang".
col_in_vitro = [
    name for name in df_logfc.columns
    if "in_vitro" in name and "zhang" not in name
]
print(len(col_in_vitro))
col_in_vitro[:3]

61


['in_vitro_vs_129s1.SvImJ', 'in_vitro_vs_A.J', 'in_vitro_vs_C57BL.6J']

## Flip the order of in_vitro_vs_CCPanel log2 fold-changes (i.e. make the in_vitro condition the reference condition):

In [8]:
# Negate the selected log2 fold-change columns and swap the two sides of each
# column name, so a column named 'A_vs_B' yields a sign-flipped 'B_vs_A'.
df_logfc_invitro = df_logfc[col_in_vitro].copy()
df_logfc_invitro_neg = -df_logfc_invitro
dict_col = {col: col.split('_vs_')[-1] + '_vs_' + col.split('_vs_')[0] for col in df_logfc_invitro_neg.columns}
df_logfc_invitro_neg = df_logfc_invitro_neg.rename(columns=dict_col)

# Append only the flipped columns that df_logfc does not already contain.
# Without this guard, re-running the cell (or a flipped name that already
# exists) would add duplicate column labels, and a later df_logfc[col] would
# then return a DataFrame instead of a Series.
is_new_col = ~df_logfc_invitro_neg.columns.isin(df_logfc.columns)
df_logfc = pd.concat([df_logfc, df_logfc_invitro_neg.loc[:, is_new_col]], axis=1)
# check on dataframe sizes:
df_logfc.shape, df_logfc_invitro_neg.shape

((4055, 2031), (4055, 61))

#### Same for q-value dataframe:

In [9]:
# q-values carry no sign, so only the two sides of each column name are
# swapped ('A_vs_B' -> 'B_vs_A'); the values themselves are left untouched.
df_qvals_invitro = df_qvals[col_in_vitro].copy()
dict_col = {col: col.split('_vs_')[-1] + '_vs_' + col.split('_vs_')[0] for col in df_qvals_invitro.columns}
df_qvals_invitro = df_qvals_invitro.rename(columns=dict_col)

# Append only the renamed columns that df_qvals does not already contain, so
# that re-running this cell cannot create duplicate column labels.
is_new_col = ~df_qvals_invitro.columns.isin(df_qvals.columns)
df_qvals = pd.concat([df_qvals, df_qvals_invitro.loc[:, is_new_col]], axis=1)
df_qvals.shape, df_qvals_invitro.shape

((4055, 2031), (4055, 61))

### Dropping redundant columns: 

In [10]:
# Redundant columns: every q-value column whose name contains both "carey"
# and "mbio_H37Rv".
cols_to_drop = [
    name for name in df_qvals.columns
    if all(tag in name for tag in ("carey", "mbio_H37Rv"))
]
# Additional candidates that are currently NOT dropped (kept for reference):
# cols_to_drop = cols_to_drop + ['C57BL.6J_vs_C57BL.6J', 'C57BL.6J_vs_A.J', 'C57BL.6J_vs_129s1.SvImJ'] + ['zhang_mhcii_mouse_d10_vs_zhang_wt_mouse_d10']
# ['C57BL.6J_vs_129s1.SvImJ', 'C57BL.6J_vs_mbio_H37Rv']

In [12]:
# Remove the redundant columns from both matrices (same list for each).
df_qvals = df_qvals.drop(columns=cols_to_drop)
df_logfc = df_logfc.drop(columns=cols_to_drop)

## Which columns overlap with the older datasets?

- Open question: what is the difference between the dataset loaded below (`result_logfc_matrix_2020_06_27.csv`) and `result_logfc_matrix_2020_08_27.csv`?

In [13]:
# Load the older log2 fold-change and q-value matrices for comparison with
# the newer ones.
file_logfc_OLD = os.path.join(dir_data, 'result_logfc_matrix_2020_06_27.csv')
file_qvals_OLD = os.path.join(dir_data, 'result_qval_matrix_2020_06_27.csv')

df_logfc_OLD = pd.read_csv(file_logfc_OLD)
df_qvals_OLD = pd.read_csv(file_qvals_OLD)

# Build the drop list from the OLD frame itself. Deriving it from df_qvals
# (as before) always gave an empty list, because those columns have already
# been removed from df_qvals by the time this cell runs.
cols_to_drop_OLD = [col for col in df_qvals_OLD.columns if "carey" in col and "mbio_H37Rv" in col]

df_qvals_OLD = df_qvals_OLD.drop(columns=cols_to_drop_OLD)
# errors='ignore': the list comes from the q-value frame, so tolerate a
# log2FC frame that happens to lack one of these labels.
df_logfc_OLD = df_logfc_OLD.drop(columns=cols_to_drop_OLD, errors='ignore')

In [14]:
# Split the current log2FC column names by whether the older matrix also has
# a column with the same name; the order of df_logfc.columns is preserved.
_old_names = set(df_logfc_OLD.columns)
cols_OLD = [name for name in df_logfc.columns if name in _old_names]
cols_NEW = [name for name in df_logfc.columns if name not in _old_names]

In [15]:
# Number of columns shared with the older matrix, then number of new columns.
print(*map(len, (cols_OLD, cols_NEW)))

65 1958


# Generate the binarized matrix by setting a threshold on the q-values and log2-fold changes: 

In [16]:
# Thresholds: a cell is called a hit (1) when its q-value is at or below
# qval_thresh AND its absolute log2 fold-change is at or above log2fc_tresh.
qval_thresh = 0.05
log2fc_tresh = 1

# Start from a copy of the q-value matrix so the first column and the column
# layout carry over unchanged; every column after the first is overwritten.
df_bin = df_qvals.copy()

cols_data = df_qvals.columns[1:]
for col in cols_data:
    col_qvals = df_qvals[col]
    col_abs_logfc = df_logfc[col].abs()

    is_hit = (col_qvals <= qval_thresh) & (col_abs_logfc >= log2fc_tresh)
    is_non_hit = (col_qvals > qval_thresh) | (col_abs_logfc < log2fc_tresh)

    # Cells that are neither a hit nor a non-hit (a missing value makes the
    # call undecidable) become NaN. Previously a cell with a passing q-value
    # but a missing log2FC kept its raw q-value in the binary matrix.
    df_bin[col] = is_hit.astype(float).where(is_hit | is_non_hit)

In [17]:
# Display the (rows, columns) shape of the binarized matrix.
df_bin.shape

(4055, 2023)

Write to file: 

In [10]:
# Destination paths for the full q-value, log2FC and binarized matrices.
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2021_10_22.csv')
file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2021_10_22.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2021_10_22.csv')

# Write each matrix without the pandas row index.
for frame, destination in (
    (df_bin, file_bin_out),
    (df_qvals, file_qvals_out),
    (df_logfc, file_log2fc_out),
):
    frame.to_csv(destination, index=False)

# Detect and define (or choose which to keep) the "basis set" columns for Clare's dataset: 

### Screen comparisons using Black 6 as the reference condition:   

In [32]:
# New columns whose name contains 'vs_C57BL', placed after the columns shared
# with the older matrix.
cols_ref_mouse = [name for name in cols_NEW if 'vs_C57BL' in name]
_cols_basis_B6 = cols_OLD + cols_ref_mouse

df_logfc_BASIS_B6 = df_logfc[_cols_basis_B6]
df_qvals_BASIS_B6 = df_qvals[_cols_basis_B6]
df_bin_BASIS_B6 = df_bin[_cols_basis_B6]

file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2023_02_21_CC_B6ref.csv')
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2023_02_21_CC_B6ref.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2023_02_21_CC_B6ref.csv')

# Write the three column subsets without the pandas row index.
for frame, destination in (
    (df_logfc_BASIS_B6, file_log2fc_out),
    (df_qvals_BASIS_B6, file_qvals_out),
    (df_bin_BASIS_B6, file_bin_out),
):
    frame.to_csv(destination, index=False)

In [33]:
# Number of selected reference columns, then the shapes of the three subsets.
print(len(cols_ref_mouse))
tuple(frame.shape for frame in (df_logfc_BASIS_B6, df_qvals_BASIS_B6, df_bin_BASIS_B6))

61


((4055, 126), (4055, 126), (4055, 126))

### Screen comparisons using mBio as the reference condition: 

In [34]:
# New columns whose name contains '_vs_mbio_H37Rv', placed after the columns
# shared with the older matrix.
str_ref = '_vs_mbio_H37Rv'
cols_ref_mBio = [name for name in cols_NEW if str_ref in name]
_cols_basis_mbio = cols_OLD + cols_ref_mBio

df_logfc_BASIS_mbio = df_logfc[_cols_basis_mbio]
df_qvals_BASIS_mbio = df_qvals[_cols_basis_mbio]
df_bin_BASIS_mbio = df_bin[_cols_basis_mbio]

file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2023_02_21_CC_mBioref.csv')
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2023_02_21_CC_mBioref.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2023_02_21_CC_mBioref.csv')

# Write the three column subsets without the pandas row index.
for frame, destination in (
    (df_logfc_BASIS_mbio, file_log2fc_out),
    (df_qvals_BASIS_mbio, file_qvals_out),
    (df_bin_BASIS_mbio, file_bin_out),
):
    frame.to_csv(destination, index=False)

In [35]:
# Number of selected reference columns, then the shapes of the three subsets.
print(len(cols_ref_mBio))
tuple(frame.shape for frame in (df_logfc_BASIS_mbio, df_qvals_BASIS_mbio, df_bin_BASIS_mbio))

61


((4055, 126), (4055, 126), (4055, 126))

### Screen comparisons using Smith et al.'s internal in vitro TnSeq screen as the reference condition:

In [36]:
# New columns whose name contains '_vs_in_vitro', placed after the columns
# shared with the older matrix.
str_ref = '_vs_in_vitro'
cols_ref_in_vitro = [name for name in cols_NEW if str_ref in name]
_cols_basis_invitro = cols_OLD + cols_ref_in_vitro

df_logfc_BASIS_invitro = df_logfc[_cols_basis_invitro]
df_qvals_BASIS_invitro = df_qvals[_cols_basis_invitro]
df_bin_BASIS_invitro = df_bin[_cols_basis_invitro]

file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2023_02_21_CC_invitroref.csv')
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2023_02_21_CC_invitroref.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2023_02_21_CC_invitroref.csv')

# Write the three column subsets without the pandas row index.
for frame, destination in (
    (df_logfc_BASIS_invitro, file_log2fc_out),
    (df_qvals_BASIS_invitro, file_qvals_out),
    (df_bin_BASIS_invitro, file_bin_out),
):
    frame.to_csv(destination, index=False)

In [37]:
# Number of selected reference columns, then the shapes of the three subsets.
print(len(cols_ref_in_vitro))
tuple(frame.shape for frame in (df_logfc_BASIS_invitro, df_qvals_BASIS_invitro, df_bin_BASIS_invitro))

61


((4055, 126), (4055, 126), (4055, 126))

### Optional: include both the in vitro and mBio reference screens

In [43]:
def _reference_then_experimental(name):
    """Sort key: the piece after the first '_vs_' ('' if absent), then the piece before it."""
    pieces = name.split("_vs_")
    reference = pieces[1] if len(pieces) > 1 else ""
    return (reference, pieces[0])


# Union of the mBio- and in vitro-referenced columns, de-duplicated and
# ordered by reference condition first, experimental condition second.
cols_ref_invitro_mBio = list(set(cols_ref_mBio + cols_ref_in_vitro))
cols_ref_invitro_mBio = sorted(cols_ref_invitro_mBio, key=_reference_then_experimental)

In [44]:
# Columns shared with the older matrix, followed by the combined
# in vitro + mBio reference columns.
_cols_basis_invitro_mbio = cols_OLD + cols_ref_invitro_mBio

df_logfc_BASIS_invitro_mbio = df_logfc[_cols_basis_invitro_mbio]
df_qvals_BASIS_invitro_mbio = df_qvals[_cols_basis_invitro_mbio]
df_bin_BASIS_invitro_mbio = df_bin[_cols_basis_invitro_mbio]

file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2023_02_21_CC_invitro_mbio_ref.csv')
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2023_02_21_CC_invitro_mbio_ref.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2023_02_21_CC_invitro_mbio_ref.csv')

# Write the three column subsets without the pandas row index.
for frame, destination in (
    (df_logfc_BASIS_invitro_mbio, file_log2fc_out),
    (df_qvals_BASIS_invitro_mbio, file_qvals_out),
    (df_bin_BASIS_invitro_mbio, file_bin_out),
):
    frame.to_csv(destination, index=False)

# Preparing additional rows for the descriptors spreadsheet (loaded by the dash app): 

In [95]:
# Preview the first ten '_vs_in_vitro' column IDs that the descriptor rows
# below are built from.
cols_ref_in_vitro[:10]

['129s1.SvImJ_vs_in_vitro',
 'A.J_vs_in_vitro',
 'C57BL.6J_vs_in_vitro',
 'CC001.Unc_vs_in_vitro',
 'CC002.Unc_vs_in_vitro',
 'CC003.Unc_vs_in_vitro',
 'CC004.TauUnc_vs_in_vitro',
 'CC005.TauUnc_vs_in_vitro',
 'CC006.TauUnc_vs_in_vitro',
 'CC007.Unc_vs_in_vitro']

In [104]:
# Directory holding the per-comparison resampling result files. The original
# absolute path stays as the default; set MTBTNDB_RESAMPLING_DIR to run this
# cell on another machine.
path = os.environ.get('MTBTNDB_RESAMPLING_DIR', '/home/ajinich/Dropbox/tuberculosis/tuberculosis/Results/')
list_num_control = []
list_num_exp = []
for cID in cols_ref_in_vitro:
    # The result files are looked up under the flipped name ('B_vs_A' for
    # column 'A_vs_B').
    # NOTE(review): the counts are taken from the flipped file's own
    # '#Control Data' / '#Experimental Data' lines, so they may be swapped
    # relative to the control/experimental roles implied by cID - confirm.
    cID_flip = cID.split('_vs_')[-1] + '_vs_' + cID.split('_vs_')[0]
    fn = 'result_resampling_' + cID_flip + '.txt'

    # Reset per file so a file lacking a header line cannot silently reuse
    # the counts parsed from the previous file.
    num_control = None
    num_exp = None
    with open(os.path.join(path, fn), 'r') as fin:
        for line in fin:
            # The count is the number of 'wig' occurrences on the header line.
            # NOTE(review): a 'wig' substring elsewhere on the line (e.g. in a
            # directory name) would also be counted - confirm against the files.
            if '#Control Data' in line:
                num_control = len(re.findall('wig', line))
            if '#Experimental Data' in line:
                num_exp = len(re.findall('wig', line))
    if num_control is None or num_exp is None:
        raise ValueError(
            "Missing '#Control Data' or '#Experimental Data' line in " + os.path.join(path, fn)
        )
    list_num_control.append(num_control)
    list_num_exp.append(num_exp)

In [124]:
# One descriptor row per '_vs_in_vitro' column. For a column ID 'A_vs_B' the
# experimental condition is the text before the first '_vs_' and the control
# is the text after the last one.
df_desc = pd.DataFrame({
    'column_ID': cols_ref_in_vitro,  # cols_ref_mouse
    'wig_files': True,
    'control': [name.rpartition('_vs_')[2] for name in cols_ref_in_vitro],
    'experimental': [name.partition('_vs_')[0] for name in cols_ref_in_vitro],
    'column_ID_2': cols_ref_in_vitro,
    'column_ID_SI': cols_ref_in_vitro,
    'num_replicates_control': list_num_control,
    'num_replicates_experimental': list_num_exp,
})

In [125]:
# Human-readable description per row: the template is filled with that row's
# 'experimental' value.
str_meaning = 'differential genetic requirements of Mtb (H37Rv) in %s mouse strain relative to H37Rv in vitro'
list_meaning = [str_meaning % strain for strain in df_desc['experimental']]


In [126]:
# Per-row description first, then the metadata that is identical for every
# row. Columns are added in exactly this order.
df_desc['meaning'] = list_meaning

_constant_metadata = {
    'year': 2022,
    'paper_title': 'Host-pathogen genetic interactions underlie tuberculosis susceptibility in genetically diverse mice',
    'paper_URL': 'https://elifesciences.org/articles/74419',
    'journal': 'eLife',
    'first_author': 'Smith',
    'last_author': 'Sassetti',
    'in_vitro_cell_vivo': 'in_vivo',
    'in_vitro_media': '',
    'carbon_source': '',
    'stress_description': '',
    'GI_RvID': '',
    'GI_name': '',
    'MicArr_or_TnSeq': 'TnSeq',
    'stat_analysis': 'Transit',
    'mouse_strain': 'Collaborative Cross (CC) mouse panel',
    'cell_type': '',
    'Mtb_strain': 'H37Rv',
    'plot_SI_graph': '',
}
for _field, _value in _constant_metadata.items():
    df_desc[_field] = _value

In [127]:
# Save the descriptor rows as an Excel sheet, without the pandas row index.
file_out = '../../data/column_descriptors_CC_panel.xlsx'
df_desc.to_excel(excel_writer=file_out, index=False)