# Adding BxD dataset (Rachel Meade, Clare Smith) to standardized datafiles: 

In [1]:
import pandas as pd
import os

# INTERLUDE: 
### Checking out the number and identity of screens in the dash website database (updated as of 07/10/23)

We want to make sure both are equally up to date. 

In [2]:
path_dash = '../../../data'
fn_path = os.path.join(path_dash, 'standardized_data_dash.tsv')
df_dash = pd.read_csv(fn_path, sep = '\t')
list_exp = df_dash.Expt.unique().tolist()

In [3]:
print('there are this many total screens in the dash site:', len(list_exp))

there are this many total screens in the dash site: 146


In [4]:
list_CC = [exp for exp in list_exp if 'CC' in exp]
list_CC = list_CC + ['129s1.SvImJ_vs_in_vitro',
    'A.J_vs_in_vitro', 
    'Cast.EiJ_vs_in_vitro',
    'IFNG_vs_in_vitro',
    'NOD.ShiLtJ_vs_in_vitro',
    'NOS_vs_in_vitro',
    'NZO.H1LtJ_vs_in_vitro',
    'PWK.PhJ_vs_in_vitro',
    'Phox_vs_in_vitro',
    'Rag_vs_in_vitro',
    'Wsb.Eij_vs_in_vitro',
    'mbio_H37Rv_vs_in_vitro',
    'C57BL.6J_vs_in_vitro']
print('There are this many CC-panel screens in the dash site:', len(list_CC))

There are this many CC-panel screens in the dash site: 61


In [5]:
list_bxd = [exp for exp in list_exp if 'BXD' in exp]
list_bxd = list_bxd + ['C57BL.6J_vs_in_vitro_H37Rv', 'DBA.2J_vs_in_vitro_H37Rv']
print('There are this many BxD screens in the Dash site:', len(list_bxd))

There are this many BxD screens in the Dash site: 21


In [6]:
list_rem = [exp for exp in list_exp if exp not in list_CC + list_bxd]
print('There are this many remainder screens:', len(list_rem))

There are this many remainder screens: 64


In [7]:
list_carey = [exp for exp in list_rem if 'carey' in exp]
list_KO = [exp for exp in list_rem if 'KO' in exp]
list_dejesus = [exp for exp in list_rem if 'dejesus' in exp]
list_xu = [exp for exp in list_rem if 'xu_' in exp]
list_zhang = [exp for exp in list_rem if 'zhang' in exp]
list_temp = list_carey + list_KO + list_dejesus + list_xu + list_zhang

list_rem_rem = [exp for exp in list_rem if exp not in list_temp]
list_rem_rem

['bellerose_MB_HRZE_wk1_vs_bellerose_MB_d21_untreated',
 'bellerose_MB_HRZE_wk1_vs_bellerose_MB_pretreatment',
 'bellerose_MB_d21_untreated_vs_bellerose_MB_pretreatment',
 'griffin_cholesterol_vs_griffin_glycerol',
 'griffin_glycerol_vs_mbio_H37Rv',
 'kieser_dPonA1_vs_mbio_H37Rv',
 'korte_2016_otsa_trehalose_vs_korte_2016_otsa_7h9',
 'minato_minimal_plate_vs_minato_rich_plate',
 'mishra_C3H_vs_mishra_B6',
 'mishra_NOS2_vs_mishra_B6',
 'mishra_NOS2_vs_mishra_C3H',
 'nambi_2015_ctpC_vs_nambi_2015_wt',
 'ritterhaus_hypoxia_H3_vs_ritterhaus_hypoxia_input',
 'ritterhaus_hypoxia_H6_vs_ritterhaus_hypoxia_input']

In [8]:
len(list_KO)

13

### Load raw, full BxD datasets from Rachel Meade:

In [9]:
path_bxd = '../../../data/standardized_data/BXD_TnSeq'
fn_bxd = '2022.02.15_BXD_TRANSIT_BGC_1pseudo_output_RKM_wPvals_Unthresholded.csv'
fn_bxd_path = os.path.join(path_bxd, fn_bxd)

df_bxd = pd.read_csv(fn_bxd_path,  index_col = 0)
df_bxd.head(1)

Unnamed: 0,Orf,Name,Desc,Sites,Sites_Hit,C57BL.6J,C57BL.6J_pval,C57BL.6J_Qval,DBA.2J,DBA.2J_pval,...,BXD79_Qval,BXD90,BXD90_pval,BXD90_Qval,BXD93,BXD93_pval,BXD93_Qval,BXD102,BXD102_pval,BXD102_Qval
1,Rv0001,dnaA,chromosomal replication initiation protein,31,0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0


### Grab BxD L2FC columns: 

In [10]:
cols_lfc = list(set([col.split('_')[0] for col in df_bxd.columns[5:]]))
cols_lfc.sort()
cols = ['Orf'] + cols_lfc

df_bxd_lfc = df_bxd[cols].copy()
df_bxd_lfc.rename(columns={'Orf':'Rv_ID'}, inplace=True)
df_bxd_lfc.head(1)

Unnamed: 0,Rv_ID,BXD102,BXD29,BXD39,BXD40,BXD48a,BXD51,BXD54,BXD56,BXD60,...,BXD69,BXD73,BXD73b,BXD77,BXD79,BXD9,BXD90,BXD93,C57BL.6J,DBA.2J
1,Rv0001,0.0,0.0,0.0,0.0,0.0,0.86,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Grab BxD q-value columns: 

In [11]:
cols_qval = [col for col in df_bxd.columns[5:] if 'Qval' in col]
cols_qval.sort()
cols = ['Orf'] + cols_qval
df_bxd_qval = df_bxd[cols].copy()
df_bxd_qval.rename(columns={'Orf':'Rv_ID'}, inplace=True)

# Rename columns to get rid of the "Qval"
dict_cols = {col:col.split('_')[0] for col in df_bxd_qval.columns[1:]}
df_bxd_qval.rename(columns=dict_cols, inplace=True)
df_bxd_qval.head(1)

Unnamed: 0,Rv_ID,BXD102,BXD29,BXD39,BXD40,BXD48a,BXD51,BXD54,BXD56,BXD60,...,BXD69,BXD73,BXD73b,BXD77,BXD79,BXD90,BXD93,BXD9,C57BL.6J,DBA.2J
1,Rv0001,1.0,1.0,1.0,1.0,1.0,0.7414,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##### Append control condition to column names: 

In [12]:
dict_cols_lfc = {col:col+'_vs_in_vitro_H37Rv' for col in df_bxd_lfc.columns[1:]}
dict_cols_qval = {col:col+'_vs_in_vitro_H37Rv' for col in df_bxd_qval.columns[1:]}
df_bxd_qval.rename(columns=dict_cols_qval, inplace=True)
df_bxd_lfc.rename(columns=dict_cols_lfc, inplace=True)

### Grab standardized dataset (L2FC dataframe, q-values dataframe)

#### PAUSE! 
- Make sure you grab the correct datasets. 
- Use this as an opportunity to clean up the data/standardized_data folder. Send everything else to an old dataset.
- Document where you added the CC-panel datasets. 

In [13]:
path_std = '../../../data/standardized_data/old_std_data'
fn_std_lfc = os.path.join(path_std, 'result_logfc_matrix_2023_02_21_CC_invitroref.csv')
fn_std_qval = os.path.join(path_std, 'result_qval_matrix_2023_02_21_CC_invitroref.csv')

# Load as dataframes: 
df_std_lfc = pd.read_csv(fn_std_lfc)
df_std_qval = pd.read_csv(fn_std_qval)
# shapes: 
print(df_std_lfc.shape, df_std_qval.shape)

#### Merging LFC dataframes: 
print(df_bxd_lfc.shape, df_std_lfc.shape)
df_std_bxd_lfc = df_std_lfc.merge(df_bxd_lfc, how = 'left', on='Rv_ID')
print(df_std_bxd_lfc.shape)

#### Merging q-value dataframes: 
print( df_bxd_qval.shape, df_std_qval.shape )
df_std_bxd_qval = df_std_qval.merge(df_bxd_qval, how = 'left', on='Rv_ID')
print( df_std_bxd_qval.shape )

### Write to files: 
fn_std_bxd_qval = os.path.join(path_std, 'result_qval_matrix_2023_02_20_CC_BxD.csv')
fn_std_bxd_lfc = os.path.join(path_std, 'result_logfc_matrix_2023_02_20_CC_BxD.csv')
df_std_bxd_qval.to_csv(fn_std_bxd_qval, index = False)
df_std_bxd_lfc.to_csv(fn_std_bxd_lfc, index = False)

(4055, 126) (4055, 126)
(3990, 22) (4055, 126)
(4055, 147)
(3990, 22) (4055, 126)
(4055, 147)


#### Repeat but with data that has both in-vitro and mBio controls: 

In [14]:
path_std = '../../../data/standardized_data/old_std_data'
fn_std_lfc = os.path.join(path_std, 'result_logfc_matrix_2023_02_21_CC_invitro_mbio_ref.csv')
fn_std_qval = os.path.join(path_std, 'result_qval_matrix_2023_02_21_CC_invitro_mbio_ref.csv')

# Load as dataframes: 
df_std_lfc = pd.read_csv(fn_std_lfc)
df_std_qval = pd.read_csv(fn_std_qval)
# shapes: 
print(df_std_lfc.shape, df_std_qval.shape)

#### Merging LFC dataframes: 
print(df_bxd_lfc.shape, df_std_lfc.shape)
df_std_bxd_lfc = df_std_lfc.merge(df_bxd_lfc, how = 'left', on='Rv_ID')
print(df_std_bxd_lfc.shape)

#### Merging q-value dataframes: 
print( df_bxd_qval.shape, df_std_qval.shape )
df_std_bxd_qval = df_std_qval.merge(df_bxd_qval, how = 'left', on='Rv_ID')
print( df_std_bxd_qval.shape )

### Write to files: 
fn_std_bxd_lfc = os.path.join(path_std, 'result_logfc_matrix_2023_02_20_CC_w_mbio_BxD.csv')
fn_std_bxd_qval = os.path.join(path_std, 'result_qval_matrix_2023_02_20_CC_w_mbio_wBxD.csv')
df_std_bxd_lfc.to_csv(fn_std_bxd_lfc, index = False)
df_std_bxd_qval.to_csv(fn_std_bxd_qval, index = False)

(4055, 187) (4055, 187)
(3990, 22) (4055, 187)
(4055, 208)
(3990, 22) (4055, 187)
(4055, 208)


## Make sure everything is in order in the SI datasets: 

In [15]:
path_SI = '../../../data/SI_datasets'
fn_SI = os.path.join(path_SI, 'SI_log2FC.csv')
df_lfc_SI = pd.read_csv(fn_SI)
df_lfc_SI.shape

(3990, 42)