# Adding BxD dataset (Rachel Meade, Clare Smith) to standardized datafiles: 

In [33]:
import pandas as pd
import os

### Load raw, full BxD datasets from Rachel Meade:

In [42]:
path_bxd = '../../data/standardized_data/BXD_TnSeq/'
fn_bxd = '2022.02.15_BXD_TRANSIT_BGC_1pseudo_output_RKM_wPvals_Unthresholded.csv'
fn_bxd_path = os.path.join(path_bxd, fn_bxd)

df_bxd = pd.read_csv(fn_bxd_path,  index_col = 0)
df_bxd.head(1)

Unnamed: 0,Orf,Name,Desc,Sites,Sites_Hit,C57BL.6J,C57BL.6J_pval,C57BL.6J_Qval,DBA.2J,DBA.2J_pval,...,BXD79_Qval,BXD90,BXD90_pval,BXD90_Qval,BXD93,BXD93_pval,BXD93_Qval,BXD102,BXD102_pval,BXD102_Qval
1,Rv0001,dnaA,chromosomal replication initiation protein,31,0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0


### Grab BxD L2FC columns: 

In [43]:
cols_lfc = list(set([col.split('_')[0] for col in df_bxd.columns[5:]]))
cols_lfc.sort()
cols = ['Orf'] + cols_lfc
df_bxd_lfc = df_bxd[cols].copy()
df_bxd_lfc.rename(columns={'Orf':'Rv_ID'}, inplace=True)
df_bxd_lfc.head(1)

Unnamed: 0,Rv_ID,BXD102,BXD29,BXD39,BXD40,BXD48a,BXD51,BXD54,BXD56,BXD60,...,BXD69,BXD73,BXD73b,BXD77,BXD79,BXD9,BXD90,BXD93,C57BL.6J,DBA.2J
1,Rv0001,0.0,0.0,0.0,0.0,0.0,0.86,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Grab BxD q-value columns: 

In [44]:
cols_qval = [col for col in df_bxd.columns[5:] if 'Qval' in col]
cols_qval.sort()
cols = ['Orf'] + cols_qval
df_bxd_qval = df_bxd[cols].copy()
df_bxd_qval.rename(columns={'Orf':'Rv_ID'}, inplace=True)

# Rename columns to get rid of the "Qval"
dict_cols = {col:col.split('_')[0] for col in df_bxd_qval.columns[1:]}
df_bxd_qval.rename(columns=dict_cols, inplace=True)
df_bxd_qval.head(1)

Unnamed: 0,Rv_ID,BXD102,BXD29,BXD39,BXD40,BXD48a,BXD51,BXD54,BXD56,BXD60,...,BXD69,BXD73,BXD73b,BXD77,BXD79,BXD90,BXD93,BXD9,C57BL.6J,DBA.2J
1,Rv0001,1.0,1.0,1.0,1.0,1.0,0.7414,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##### Append control condition to column names: 

In [None]:
dict_cols_lfc = {col:col+'_vs_in_vitro_H37Rv' for col in df_bxd_lfc.columns[1:]}
dict_cols_qval = {col:col+'_vs_in_vitro_H37Rv' for col in df_bxd_qval.columns[1:]}
df_bxd_qval.rename(columns=dict_cols_qval, inplace=True)
df_bxd_lfc.rename(columns=dict_cols_lfc, inplace=True)

### Grab standardized dataset (L2FC dataframe, q-values dataframe)

#### PAUSE! 
- Make sure you grab the correct datasets. 
- Use this as an opportunity to clean up the data/standardized_data folder. Send everything else to an old dataset.
- Document where you added the CC-panel datasets. 

In [48]:
path_std = '../../data/standardized_data/'

# PAUSE: make sure you're grabbing the correct files:  
fn_std_lfc = os.path.join(path_std, 'result_logfc_matrix_2021_11_15_BASIS_invitro.csv')
fn_std_qval = os.path.join(path_std, 'result_qval_matrix_2021_11_15_BASIS_invitro.csv')
# Load as dataframes: 
df_std_lfc = pd.read_csv(fn_std_lfc)
df_std_qval = pd.read_csv(fn_std_qval)
# shapes: 
df_std_lfc.shape, df_std_qval.shape

((4055, 126), (4055, 126))

#### Merging LFC dataframes: 

In [49]:
df_bxd_lfc.shape, df_std_lfc.shape

((3990, 22), (4055, 126))

In [56]:
df_std_bxd_lfc = df_std_lfc.merge(df_bxd_lfc, how = 'left', on='Rv_ID')
df_std_bxd_lfc.shape

(4055, 147)

In [58]:
df_std_bxd_lfc.head(1)

Unnamed: 0,Rv_ID,PE35_KO_vs_mbio_H37Rv,PPE68_KO_vs_mbio_H37Rv,Rv0950c_KO_vs_CB_WT,Rv0954_KO_vs_RJ_WT,Rv1096_KO_vs_CB_WT,Rv3005c_KO_day32_vs_dejesus_H37Rv_day32,Rv3594_KO_vs_Rubin_FLUTE_WT,Rv3684_KO_vs_CB_WT,Rv3717_KO_vs_Rubin_FLUTE_WT,...,BXD69_vs_in_vitro_H37Rv,BXD73_vs_in_vitro_H37Rv,BXD73b_vs_in_vitro_H37Rv,BXD77_vs_in_vitro_H37Rv,BXD79_vs_in_vitro_H37Rv,BXD9_vs_in_vitro_H37Rv,BXD90_vs_in_vitro_H37Rv,BXD93_vs_in_vitro_H37Rv,C57BL.6J_vs_in_vitro_H37Rv,DBA.2J_vs_in_vitro_H37Rv
0,Rv0001,-0.41,0.01,0.0,0.0,0.0,3.12,-0.06,0.0,-0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Merging q-value dataframes: 

In [28]:
df_bxd_qval.shape, df_std_qval.shape

((3990, 22), (4055, 126))

In [54]:
df_std_bxd_qval = df_std_qval.merge(df_bxd_qval, how = 'left', on='Rv_ID')
df_std_bxd_qval.shape

(4055, 147)

In [55]:
df_std_bxd_qval.head(1)

Unnamed: 0,Rv_ID,PE35_KO_vs_mbio_H37Rv,PPE68_KO_vs_mbio_H37Rv,Rv0950c_KO_vs_CB_WT,Rv0954_KO_vs_RJ_WT,Rv1096_KO_vs_CB_WT,Rv3005c_KO_day32_vs_dejesus_H37Rv_day32,Rv3594_KO_vs_Rubin_FLUTE_WT,Rv3684_KO_vs_CB_WT,Rv3717_KO_vs_Rubin_FLUTE_WT,...,BXD69_vs_in_vitro_H37Rv,BXD73_vs_in_vitro_H37Rv,BXD73b_vs_in_vitro_H37Rv,BXD77_vs_in_vitro_H37Rv,BXD79_vs_in_vitro_H37Rv,BXD90_vs_in_vitro_H37Rv,BXD93_vs_in_vitro_H37Rv,BXD9_vs_in_vitro_H37Rv,C57BL.6J_vs_in_vitro_H37Rv,DBA.2J_vs_in_vitro_H37Rv
0,Rv0001,1.0,1.0,1.0,1.0,1.0,0.73174,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Write to files: 

In [None]:
fn_std_bxd_lfc = 'result_logfc_matrix_2023_02_20_BASIS_invitro.csv'
fn_std_bxd_qval = 'result_qval_matrix_2023_02_20_BASIS_invitro.csv'

df_std_bxd_qval.