## Purpose: incorporate Prof. Clare Smith's TnSeq datasets into the MtbTnDB analysis and manuscript

In [1]:
import pandas as pd
import os
import re

### Load q-value and log2-fold change matrices that Michael DeJesus generated with Transit: 

In [58]:
# Locations of the Transit result matrices (file names dated 2021_05_18).
dir_data = '../../data/standardized_data/'
file_logfc = os.path.join(dir_data, 'result_logfc_matrix_2021_05_18.csv')
# NOTE: significance must be read from the q-value matrix. An earlier version
# of this cell mistakenly pointed at 'result_pval_matrix_2021_05_18.csv'.
file_qvals = os.path.join(dir_data, 'result_qval_matrix_2021_05_18.csv')

# Read each matrix and label its first column 'Rv_ID'.
df_logfc = pd.read_csv(file_logfc)
df_logfc = df_logfc.rename(columns={df_logfc.columns[0]: 'Rv_ID'})

df_qvals = pd.read_csv(file_qvals)
df_qvals = df_qvals.rename(columns={df_qvals.columns[0]: 'Rv_ID'})


### Dropping redundant columns: 

In [59]:
# Columns to discard: every comparison whose name mentions both "carey" and
# "mbio_H37Rv".
# Other candidates that were considered but are not dropped here:
#   'C57BL.6J_vs_C57BL.6J', 'C57BL.6J_vs_A.J', 'C57BL.6J_vs_129s1.SvImJ',
#   'C57BL.6J_vs_mbio_H37Rv', 'zhang_mhcii_mouse_d10_vs_zhang_wt_mouse_d10'
cols_to_drop = list(
    filter(lambda name: "carey" in name and "mbio_H37Rv" in name, df_qvals.columns)
)

In [60]:
# Remove the redundant comparisons from both matrices so that their columns
# stay in sync.
df_qvals = df_qvals.drop(columns=cols_to_drop)
df_logfc = df_logfc.drop(columns=cols_to_drop)

## Which columns overlap with the older datasets?

- Open question: how does the dataset loaded below ('result_logfc_matrix_2020_06_27.csv') differ from 'result_logfc_matrix_2020_08_27.csv'?

In [61]:
# Load the older matrices (file names dated 2020_06_27) so the current columns
# can be compared against them.
file_logfc_OLD = os.path.join(dir_data, 'result_logfc_matrix_2020_06_27.csv')
file_qvals_OLD = os.path.join(dir_data, 'result_qval_matrix_2020_06_27.csv')

df_logfc_OLD = pd.read_csv(file_logfc_OLD)
df_qvals_OLD = pd.read_csv(file_qvals_OLD)

# Select the redundant comparisons from the OLD matrix itself. This list used
# to be built from df_qvals.columns, which no longer contains these columns
# once they have been dropped from the current matrices, so it was always
# empty and nothing was removed from the OLD frames.
cols_to_drop_OLD = [col for col in df_qvals_OLD.columns if "carey" in col and "mbio_H37Rv" in col]

df_qvals_OLD.drop(cols_to_drop_OLD, axis = 1, inplace = True)
df_logfc_OLD.drop(cols_to_drop_OLD, axis = 1, inplace = True)

In [62]:
# Partition the current columns by whether the older matrix also contains
# them, preserving the column order of df_logfc.
in_old_matrix = df_logfc.columns.isin(df_logfc_OLD.columns)
cols_OLD = df_logfc.columns[in_old_matrix].tolist()
cols_NEW = df_logfc.columns[~in_old_matrix].tolist()

In [63]:
# Number of columns shared with the older matrix, then number of new columns.
print(f"{len(cols_OLD)} {len(cols_NEW)}")

65 1897


# Generate the binarized matrix by setting a threshold on the q-values and log2-fold changes: 

In [64]:
# Thresholds for calling a gene a hit in a given comparison.
qval_thresh = 0.05
# (sic) spelling kept in case other cells reference this name.
log2fc_tresh = 1

# Data columns: everything after the leading 'Rv_ID' column.
cols_data = df_qvals.columns[1:]

qvals = df_qvals[cols_data]
abs_logfc = df_logfc[cols_data].abs()

# 1 -> q-value at/below threshold AND |log2FC| at/above threshold.
# 0 -> q-value above threshold OR |log2FC| below threshold.
is_hit = (qvals <= qval_thresh) & (abs_logfc >= log2fc_tresh)
is_non_hit = (qvals > qval_thresh) | (abs_logfc < log2fc_tresh)

# Cells that satisfy neither rule (a missing q-value or log2FC makes the call
# undecidable) are left as NaN. Previously df_bin started as a copy of the
# q-values, so such cells could keep a raw q-value inside the "binary" matrix.
binary = is_hit.astype(float).where(is_hit | is_non_hit)

# Re-attach the identifier column in front of the binarized data columns.
df_bin = pd.concat([df_qvals.iloc[:, :1], binary], axis=1)

In [65]:
# Display the (rows, columns) shape of the binarized matrix.
df_bin.shape

(4055, 1962)

Write to file: 

In [10]:
# Output paths for the full matrices.
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2021_10_22.csv')
file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2021_10_22.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2021_10_22.csv')

# Write binary, q-value and log2 fold-change matrices (in that order),
# without the DataFrame index.
for frame, destination in (
    (df_bin, file_bin_out),
    (df_qvals, file_qvals_out),
    (df_logfc, file_log2fc_out),
):
    frame.to_csv(destination, index=False)

# Detect the "basis set" columns for Clare's dataset: 

### Screen comparisons using Black 6 as the reference condition:   

In [66]:
# Keep the new comparisons whose name contains 'C57BL'.
# NOTE(review): this substring match also selects columns where C57BL appears
# on the left-hand side of '_vs_' -- confirm that is intended.
cols_ref_mouse = list(filter(lambda name: 'C57BL' in name, cols_NEW))
len(cols_ref_mouse)

64

In [67]:
# Basis set: columns shared with the older matrix plus the C57BL comparisons,
# taken from each of the three matrices.
cols_basis_mouse = cols_OLD + cols_ref_mouse
df_logfc_BASIS, df_qvals_BASIS, df_bin_BASIS = (
    frame[cols_basis_mouse] for frame in (df_logfc, df_qvals, df_bin)
)

In [68]:
# Display the (rows, columns) shape of the C57BL basis-set matrix.
df_logfc_BASIS.shape

(4055, 129)

In [47]:
# Output paths for the C57BL basis-set matrices.
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2021_10_22_BASIS.csv')
file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2021_10_22_BASIS.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2021_10_22_BASIS.csv')

# Write binary, q-value and log2 fold-change matrices (in that order),
# without the DataFrame index.
for frame, destination in (
    (df_bin_BASIS, file_bin_out),
    (df_qvals_BASIS, file_qvals_out),
    (df_logfc_BASIS, file_log2fc_out),
):
    frame.to_csv(destination, index=False)

### Screen comparisons using mBio as the reference condition: 

In [69]:
# New comparisons whose name contains the mBio reference label.
str_ref = 'mbio_H37Rv'
cols_ref_mBio = list(filter(lambda name: str_ref in name, cols_NEW))
len(cols_ref_mBio)

61

In [70]:
# New comparisons whose name contains the in-vitro reference label.
str_ref = 'in_vitro'
cols_ref_in_vitro = list(filter(lambda name: str_ref in name, cols_NEW))
len(cols_ref_in_vitro)

61

In [71]:
# Sorted, de-duplicated union of the mBio and in-vitro reference columns.
cols_ref_invitro_mBio = sorted(set(cols_ref_mBio) | set(cols_ref_in_vitro))

In [72]:
# Basis set with mBio as reference: shared older columns plus the mBio
# comparisons, taken from each of the three matrices.
cols_basis_mbio = cols_OLD + cols_ref_mBio
df_logfc_BASIS_mbio, df_qvals_BASIS_mbio, df_bin_BASIS_mbio = (
    frame[cols_basis_mbio] for frame in (df_logfc, df_qvals, df_bin)
)

In [73]:
# Basis set with in-vitro as reference: shared older columns plus the in-vitro
# comparisons, taken from each of the three matrices.
cols_basis_invitro = cols_OLD + cols_ref_in_vitro
df_logfc_BASIS_invitro, df_qvals_BASIS_invitro, df_bin_BASIS_invitro = (
    frame[cols_basis_invitro] for frame in (df_logfc, df_qvals, df_bin)
)

PENDING: Talk to Clare and Michael!

In [None]:

# df_logfc_BASIS_mbio_invitro = df_logfc[cols_OLD + cols_ref_in_vitro]
# df_qvals_BASIS_invitro = df_qvals[cols_OLD + cols_ref_in_vitro]
# df_bin_BASIS_invitro = df_bin[cols_OLD + cols_ref_in_vitro]

In [56]:
# Display the (rows, columns) shape of the in-vitro basis-set binary matrix.
df_bin_BASIS_invitro.shape

(4055, 125)

In [75]:
# Output paths for the mBio basis-set matrices.
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2021_11_15_BASIS_mbio.csv')
file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2021_11_15_BASIS_mbio.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2021_11_15_BASIS_mbio.csv')

# Write binary, q-value and log2 fold-change matrices (in that order),
# without the DataFrame index.
for frame, destination in (
    (df_bin_BASIS_mbio, file_bin_out),
    (df_qvals_BASIS_mbio, file_qvals_out),
    (df_logfc_BASIS_mbio, file_log2fc_out),
):
    frame.to_csv(destination, index=False)

In [76]:
# Output paths for the in-vitro basis-set matrices.
file_qvals_out = os.path.join(dir_data, 'result_qval_matrix_2021_11_15_BASIS_invitro.csv')
file_log2fc_out = os.path.join(dir_data, 'result_logfc_matrix_2021_11_15_BASIS_invitro.csv')
file_bin_out = os.path.join(dir_data, 'result_bin_matrix_2021_11_15_BASIS_invitro.csv')

# Write binary, q-value and log2 fold-change matrices (in that order),
# without the DataFrame index.
for frame, destination in (
    (df_bin_BASIS_invitro, file_bin_out),
    (df_qvals_BASIS_invitro, file_qvals_out),
    (df_logfc_BASIS_invitro, file_log2fc_out),
):
    frame.to_csv(destination, index=False)

# Preparing additional rows for the descriptors spreadsheet (loaded by the dash app): 

In [95]:
# Directory holding the 'result_resampling_<column>.txt' files read below.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
path = '/home/ajinich/Dropbox/tuberculosis/tuberculosis/Results/'


def _count_wig_replicates(file_path):
    """Return (num_control, num_experimental) for one resampling result file.

    Each count is the number of occurrences of 'wig' on the line containing
    '#Control Data' / '#Experimental Data'. If a header appears on several
    lines, the last one wins.

    Raises:
        ValueError: if either header line is missing. Previously the counts of
            the preceding file were silently reused in that case (or a
            NameError was raised for the first file).
    """
    num_control = None
    num_exp = None
    with open(file_path, 'r') as fin:
        for line in fin:
            if '#Control Data' in line:
                num_control = len(re.findall('wig', line))
            if '#Experimental Data' in line:
                num_exp = len(re.findall('wig', line))
    if num_control is None or num_exp is None:
        raise ValueError(
            'Missing "#Control Data" or "#Experimental Data" line in ' + file_path
        )
    return num_control, num_exp


# Replicate counts, in the same order as cols_ref_mouse.
list_num_control = []
list_num_exp = []
for cID in cols_ref_mouse:
    fn = 'result_resampling_'+cID+'.txt'
    num_control, num_exp = _count_wig_replicates(os.path.join(path, fn))
    list_num_control.append(num_control)
    list_num_exp.append(num_exp)

In [150]:
def _split_comparison(column_id):
    """Split an '<experimental>_vs_<control>' column name into its two sides.

    Splitting on the '_vs_' separator (rather than on every '_') keeps
    condition names that themselves contain underscores intact, e.g.
    'C57BL.6J_vs_mbio_H37Rv' -> ('C57BL.6J', 'mbio_H37Rv').
    Names without '_vs_' fall back to the first/last '_'-separated token.
    """
    experimental, separator, control = column_id.partition('_vs_')
    if not separator:
        tokens = column_id.split('_')
        return tokens[0], tokens[-1]
    return experimental, control


# One descriptor row per C57BL comparison column.
comparison_sides = [_split_comparison(col) for col in cols_ref_mouse]

df_desc = pd.DataFrame()
df_desc['column_ID'] = cols_ref_mouse
df_desc['wig_files'] = True
df_desc['control'] = [control for _, control in comparison_sides]
df_desc['experimental'] = [experimental for experimental, _ in comparison_sides]
df_desc['column_ID_2'] = cols_ref_mouse
df_desc['column_ID_SI'] = cols_ref_mouse
df_desc['num_replicates_control'] = list_num_control
df_desc['num_replicates_experimental'] = list_num_exp

In [151]:
# Template for the human-readable description of each comparison; filled with
# the row's experimental and control values, in that order.
str_meaning = 'differential genetic requirements of Mtb (H37Rv) in %s mouse strain relative to %s control mouse strain'
list_meaning = [
    str_meaning % (experimental, control)
    for experimental, control in zip(df_desc['experimental'], df_desc['control'])
]


In [152]:
# Per-row description text.
df_desc['meaning'] = list_meaning

# Metadata that is identical for every row; added in this order so the
# resulting column order is unchanged.
shared_metadata = {
    'year': 2021,
    'paper_title': 'Host-pathogen genetic interactions underlie tuberculosis susceptibility',
    'paper_URL': 'https://doi.org/10.1101/2020.12.01.405514',
    'journal': 'biorXiv',
    'first_author': 'Smith',
    'last_author': 'Sassetti',
    'in_vitro_cell_vivo': 'in_vivo',
    'in_vitro_media': '',
    'carbon_source': '',
    'stress_description': '',
    'GI_RvID': '',
    'GI_name': '',
    'MicArr_or_TnSeq': 'TnSeq',
    'stat_analysis': 'Transit',
    'mouse_strain': 'Collaborative Cross (CC) mouse panel',
    'cell_type': '',
    'Mtb_strain': 'H37Rv',
    'plot_SI_graph': '',
}
for column_name, value in shared_metadata.items():
    df_desc[column_name] = value

In [153]:
# Export the descriptor rows to an Excel workbook, without the DataFrame index.
file_out = '../../data/column_descriptors_CC_panel.xlsx'
df_desc.to_excel(excel_writer=file_out, index=False)