In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

## Load new matrices: 

In [5]:
# Directory holding the standardized result matrices.
path_data = '../../data/standardized_data/'

# Build the three matrix file paths (log2 fold change, q-value, binary call).
file_log2fc, file_qval, file_bin = [
    os.path.join(path_data, file_name)
    for file_name in (
        'result_logfc_matrix_2019_12_10.csv',
        'result_qval_matrix_2019_12_10.csv',
        'result_bin_matrix_2019_12_10.csv',
    )
]

# Load each matrix into its own DataFrame.
df_log2fc, df_qval, df_bin = map(pd.read_csv, (file_log2fc, file_qval, file_bin))

## Let's compare old Carey results vs. new Carey results: 

### Load older Carey datasets: 

In [6]:
# Older database spreadsheet to compare the new matrices against.
file_old = '../../dep/data/Tn_library_DB.xlsx'
df_old = pd.read_excel(file_old)

# Old-database columns whose name contains 'Carey', in spreadsheet order.
cols_old = [
    column_name
    for column_name in df_old.columns
    if 'Carey' in column_name
]

In [7]:
# New binary-matrix columns whose name contains 'carey', in file order.
cols_new = [
    column_name
    for column_name in df_bin.columns
    if 'carey' in column_name
]

In [8]:
# Pair old and new Carey columns by position. zip() silently truncates to the
# shorter list, so fail loudly if the two lists do not have the same length.
# NOTE(review): positional pairing also assumes both lists are in matching
# order -- check the displayed pairs below.
assert len(cols_old) == len(cols_new), (
    'old/new Carey column counts differ: %d vs %d' % (len(cols_old), len(cols_new))
)
col_map = list(zip(cols_old, cols_new))
col_map

[('2018_Carey_1A', 'carey_621_vs_mbio_H37Rv'),
 ('2018_Carey_1B', 'carey_630_vs_mbio_H37Rv'),
 ('2018_Carey_1C', 'carey_631_vs_mbio_H37Rv'),
 ('2018_Carey_1D', 'carey_632_vs_mbio_H37Rv'),
 ('2018_Carey_1E', 'carey_641_vs_mbio_H37Rv'),
 ('2018_Carey_1F', 'carey_662_vs_mbio_H37Rv'),
 ('2018_Carey_1G', 'carey_663_vs_mbio_H37Rv'),
 ('2018_Carey_1H', 'carey_667_vs_mbio_H37Rv')]

In [9]:
# Per-screen tallies of how the old and new binary calls agree.
list_num_both_CES = []
list_num_both_nonCES = []
list_num_CES_old_nonCES_new = []
list_num_nonCES_old_CES_new = []

for old_col, new_col in col_map:
    # Inner join on Rv_ID: only genes present in both tables are compared.
    merged = df_bin[['Rv_ID', new_col]].merge(
        df_old[['Rv_ID', old_col]], how='inner', on='Rv_ID'
    )
    old_is_1, old_is_0 = merged[old_col] == 1, merged[old_col] == 0
    new_is_1, new_is_0 = merged[new_col] == 1, merged[new_col] == 0

    # Values that are neither 0 nor 1 (e.g. missing) land in none of the buckets.
    list_num_both_CES.append(int((old_is_1 & new_is_1).sum()))
    list_num_both_nonCES.append(int((old_is_0 & new_is_0).sum()))
    list_num_CES_old_nonCES_new.append(int((old_is_1 & new_is_0).sum()))
    list_num_nonCES_old_CES_new.append(int((old_is_0 & new_is_1).sum()))

# One row per screen, labelled with the new column ID.
# NOTE(review): 'both_CES' lacks the 'num_' prefix used by the other count
# columns; left unchanged so the resulting column names stay the same.
df_map = pd.DataFrame({
    'screen': [new_id for _, new_id in col_map],
    'both_CES': list_num_both_CES,
    'num_both_nonCES': list_num_both_nonCES,
    'num_CES_old_nonCES_new': list_num_CES_old_nonCES_new,
    'num_nonCES_old_CES_new': list_num_nonCES_old_CES_new,
})

In [37]:
# Write the comparison table (df_map) to CSV, omitting the row index.
file_out = '../../dep/data/sanity_check_Carey.csv'
df_map.to_csv(file_out, index = False)

## Let's compare all screens:

In [42]:
# Load the column-descriptor spreadsheet; its column_ID / column_ID_2 columns
# are zipped into old/new name pairs in a later cell.
file_col_desc = '../../data/column_descriptors_standardized.xlsx'
df_col_desc = pd.read_excel(file_col_desc)

In [125]:
# Pair every column_ID with its column_ID_2 from the descriptor table.
col_map = list(
    zip(df_col_desc['column_ID'].values, df_col_desc['column_ID_2'].values)
)

# Keep only the pairs whose first ID is a column of the old database.
col_map_pairs = [pair for pair in col_map if pair[0] in df_old.columns]

In [126]:
# Show the name pairs whose first ID exists as a column in df_old.
col_map_pairs

[('2011_Griffin_2', 'griffin_cholesterol_vs_griffin_glycerol'),
 ('2013_DeJesus', 'griffin_glycerol_vs_mbio_H37Rv'),
 ('2013_Zhang_3A', 'zhang_Tyloxapol_pH_6.5_vs_zhang_Tyloxapol_pH_4.5'),
 ('2013_Zhang_3B', 'zhang_Tyloxapol_pH_6.5_vs_zhang_pcit_pH_4.5'),
 ('2013_Zhang_3C', 'zhang_DETA-NO_pH_7.0_vs_zhang_pH_7.0_no_NO_control'),
 ('2013_Zhang_3D', 'zhang_Trp_Rescue_vs_zhang_in_vitro_control_Rescue'),
 ('2016_Korte', 'korte_2016_otsa_trehalose_vs_korte_2016_otsa_7h9'),
 ('2017_Xu_1E', 'xu_mero_2.5_vs_xu_mero_0'),
 ('2018_Carey_1A', 'carey_621_vs_mbio_H37Rv'),
 ('2018_Carey_1B', 'carey_630_vs_mbio_H37Rv'),
 ('2018_Carey_1C', 'carey_631_vs_mbio_H37Rv'),
 ('2018_Carey_1D', 'carey_632_vs_mbio_H37Rv'),
 ('2018_Carey_1E', 'carey_641_vs_mbio_H37Rv'),
 ('2018_Carey_1F', 'carey_662_vs_mbio_H37Rv'),
 ('2018_Carey_1G', 'carey_663_vs_mbio_H37Rv'),
 ('2018_Carey_1H', 'carey_667_vs_mbio_H37Rv')]

#### Which screens are we missing to pair up with the new dataset? 

In [127]:
# Pairs whose first ID is NOT a column of the old database.
col_map_missing = [pair for pair in col_map if pair[0] not in df_old.columns]

# Tabulate them: first element -> 'old_name', second element -> 'new_name'.
df_col_map_missing = pd.DataFrame()
for position, label in enumerate(('old_name', 'new_name')):
    df_col_map_missing[label] = [pair[position] for pair in col_map_missing]

df_col_map_missing.head(2)

Unnamed: 0,old_name,new_name
0,2013_Zhang_1A,zhang_wt_mouse_d10_vs_zhang_input_library
1,2013_Zhang_1B,zhang_wt_mouse_d45_vs_zhang_input_library


In [117]:
# NOTE(review): this export is disabled. A later cell reads
# column_map_missing.xlsx back, and its contents differ from the frame built
# above (e.g. '2013_Zhang_1' vs '2013_Zhang_1B'), so the file was presumably
# edited by hand and must not be overwritten on re-run -- confirm.
# file_missing = '../../dep/data/column_map_missing.xlsx'
# df_col_map_missing.to_excel(file_missing, index = False)

##### Also store the column names for your old dataset: 

In [50]:
# Single-column table listing every column name of the old database.
df_col_names_old = pd.DataFrame({'old_DB_names': df_old.columns})

# Export the list to Excel without the row index.
file_col_names_old = '../../dep/data/column_names_old.xlsx'
df_col_names_old.to_excel(file_col_names_old, index=False)

## You can now add a few more screens to the set for which you have a column name mapping (old-to-new datasets)

In [128]:
# Read the old-to-new column-name mapping back from Excel (this replaces the
# df_col_map_missing built earlier) and preview the first three rows.
file_missing = '../../dep/data/column_map_missing.xlsx'
df_col_map_missing = pd.read_excel(file_missing)
df_col_map_missing.head(3)

Unnamed: 0,old_name,new_name
0,2013_Zhang_1A,zhang_wt_mouse_d10_vs_zhang_input_library
1,2013_Zhang_1,zhang_wt_mouse_d45_vs_zhang_input_library
2,2013_Zhang_2A,zhang_mhcii_mouse_d10_vs_zhang_wt_mouse_d10


In [129]:
# Pair each old_name with its new_name from the mapping spreadsheet.
col_map_v2 = list(
    zip(df_col_map_missing['old_name'].values, df_col_map_missing['new_name'].values)
)

# Keep only the pairs whose old name is a column of the old database.
col_map_pairs_v2 = [pair for pair in col_map_v2 if pair[0] in df_old.columns]
col_map_pairs_v2

[('2013_Zhang_1', 'zhang_wt_mouse_d45_vs_zhang_input_library'),
 ('2017B_DeJesus_1A', 'dejesus_Rv1432_day32_vs_dejesus_H37Rv_day32'),
 ('2017B_DeJesus_1B', 'dejesus_Rv1565c_day32_vs_dejesus_H37Rv_day32'),
 ('2017B_DeJesus_1C', 'dejesus_Rv2680_day32_vs_dejesus_H37Rv_day32'),
 ('2017_Xu_1A', 'xu_van_16_vs_xu_van_0'),
 ('2017_Xu_1B', 'xu_rif_4_vs_xu_rif_0'),
 ('2018_Rittershaus_1B', 'ritterhaus_hypoxia_H3_vs_ritterhaus_hypoxia_input'),
 ('2018_Rittershaus_1A', 'ritterhaus_hypoxia_H6_vs_ritterhaus_hypoxia_input'),
 ('Rv0307c', 'dejesus_Rv0307c_day32_vs_dejesus_Rv0307c_day0'),
 ('Rv3916c', 'dejesus_Rv3916c_day32_vs_dejesus_Rv3916c_day0'),
 ('Rv0950', 'Rv0950c_KO_vs_CB_WT'),
 ('Rv0954', 'Rv0954_KO_vs_RJ_WT'),
 ('Rv1096', 'Rv1096_KO_vs_CB_WT'),
 ('Rv3005c', 'Rv3005c_KO_day32_vs_dejesus_H37Rv_day32'),
 ('Rv3594', 'Rv3594_KO_vs_Rubin_FLUTE_WT'),
 ('Rv3684', 'Rv3684_KO_vs_CB_WT'),
 ('Rv3717', 'Rv3717_KO_vs_Rubin_FLUTE_WT'),
 ('Rv3811', 'Rv3811_KO_vs_Rubin_FLUTE_WT'),
 ('marP', 'marP_KO_vs_marP_W

In [130]:
# Extend the paired screens with the additional mappings. The original
# `+=` appended the extra pairs again every time this cell was re-run,
# duplicating rows in the tallies computed from col_map_pairs.
# dict.fromkeys() keeps first-seen order and drops repeats, so the cell is
# safe to re-run.
col_map_pairs = list(dict.fromkeys(col_map_pairs + col_map_pairs_v2))
col_map_pairs

[('2011_Griffin_2', 'griffin_cholesterol_vs_griffin_glycerol'),
 ('2013_DeJesus', 'griffin_glycerol_vs_mbio_H37Rv'),
 ('2013_Zhang_3A', 'zhang_Tyloxapol_pH_6.5_vs_zhang_Tyloxapol_pH_4.5'),
 ('2013_Zhang_3B', 'zhang_Tyloxapol_pH_6.5_vs_zhang_pcit_pH_4.5'),
 ('2013_Zhang_3C', 'zhang_DETA-NO_pH_7.0_vs_zhang_pH_7.0_no_NO_control'),
 ('2013_Zhang_3D', 'zhang_Trp_Rescue_vs_zhang_in_vitro_control_Rescue'),
 ('2016_Korte', 'korte_2016_otsa_trehalose_vs_korte_2016_otsa_7h9'),
 ('2017_Xu_1E', 'xu_mero_2.5_vs_xu_mero_0'),
 ('2018_Carey_1A', 'carey_621_vs_mbio_H37Rv'),
 ('2018_Carey_1B', 'carey_630_vs_mbio_H37Rv'),
 ('2018_Carey_1C', 'carey_631_vs_mbio_H37Rv'),
 ('2018_Carey_1D', 'carey_632_vs_mbio_H37Rv'),
 ('2018_Carey_1E', 'carey_641_vs_mbio_H37Rv'),
 ('2018_Carey_1F', 'carey_662_vs_mbio_H37Rv'),
 ('2018_Carey_1G', 'carey_663_vs_mbio_H37Rv'),
 ('2018_Carey_1H', 'carey_667_vs_mbio_H37Rv'),
 ('2013_Zhang_1', 'zhang_wt_mouse_d45_vs_zhang_input_library'),
 ('2017B_DeJesus_1A', 'dejesus_Rv1432_day32

### Do the sanity check for the screens for which you have a column name mapping (old-to-new datasets):

In [139]:
# Per-screen tallies of how the old and new binary calls agree.
list_num_both_CES = []
list_num_both_nonCES = []
list_num_CES_old_nonCES_new = []
list_num_nonCES_old_CES_new = []

for old_col, new_col in col_map_pairs:
    # Inner join on Rv_ID: only genes present in both tables are compared.
    merged = df_bin[['Rv_ID', new_col]].merge(
        df_old[['Rv_ID', old_col]], how='inner', on='Rv_ID'
    )
    old_is_1, old_is_0 = merged[old_col] == 1, merged[old_col] == 0
    new_is_1, new_is_0 = merged[new_col] == 1, merged[new_col] == 0

    # Values that are neither 0 nor 1 (e.g. missing) land in none of the buckets.
    list_num_both_CES.append(int((old_is_1 & new_is_1).sum()))
    list_num_both_nonCES.append(int((old_is_0 & new_is_0).sum()))
    list_num_CES_old_nonCES_new.append(int((old_is_1 & new_is_0).sum()))
    list_num_nonCES_old_CES_new.append(int((old_is_0 & new_is_1).sum()))

# One row per screen pair, labelled with both the old and the new column ID.
df_map = pd.DataFrame({
    'screen_old_ID': [old_id for old_id, _ in col_map_pairs],
    'screen_new_ID': [new_id for _, new_id in col_map_pairs],
    'both_CES': list_num_both_CES,
    'both_nonCES': list_num_both_nonCES,
    'CES_old_nonCES_new': list_num_CES_old_nonCES_new,
    'nonCES_old_CES_new': list_num_nonCES_old_CES_new,
})

In [64]:
# Write the comparison table (df_map) to CSV, omitting the row index.
# NOTE(review): the file name ends in 'sanity_check_.csv' -- it looks like a
# suffix is missing; confirm the intended name.
file_out = '../../dep/data/sanity_check_.csv'
df_map.to_csv(file_out, index = False)

In [140]:
# Order the comparison table by old screen ID and renumber the rows from 0.
df_map = (
    df_map
    .sort_values(by='screen_old_ID', ascending=True)
    .reset_index(drop=True)
)
df_map

Unnamed: 0,screen_old_ID,screen_new_ID,both_CES,both_nonCES,CES_old_nonCES_new,nonCES_old_CES_new
0,2011_Griffin_2,griffin_cholesterol_vs_griffin_glycerol,59,3877,37,17
1,2013_DeJesus,griffin_glycerol_vs_mbio_H37Rv,20,3214,647,109
2,2013_Zhang_1,zhang_wt_mouse_d45_vs_zhang_input_library,253,3335,327,75
3,2013_Zhang_3A,zhang_Tyloxapol_pH_6.5_vs_zhang_Tyloxapol_pH_4.5,3,3958,1,28
4,2013_Zhang_3B,zhang_Tyloxapol_pH_6.5_vs_zhang_pcit_pH_4.5,7,3909,14,60
5,2013_Zhang_3C,zhang_DETA-NO_pH_7.0_vs_zhang_pH_7.0_no_NO_con...,11,3949,1,29
6,2013_Zhang_3D,zhang_Trp_Rescue_vs_zhang_in_vitro_control_Rescue,2,3968,10,10
7,2016_Korte,korte_2016_otsa_trehalose_vs_korte_2016_otsa_7h9,2,3854,134,0
8,2017B_DeJesus_1A,dejesus_Rv1432_day32_vs_dejesus_H37Rv_day32,19,3841,27,103
9,2017B_DeJesus_1B,dejesus_Rv1565c_day32_vs_dejesus_H37Rv_day32,54,3701,36,199


# Follow-up in more detail on a few:

## FLUTE database TnSeq screens: 

In [97]:
# Pairs whose old ID contains 'Rv' or 'marP'.
col_map_pairs_FLUTE = [
    pair
    for pair in col_map_pairs
    if any(tag in pair[0] for tag in ('Rv', 'marP'))
]

In [100]:
# Folder with the per-screen FLUTE Excel files. Use a repo-relative path like
# the other data paths in this notebook instead of an absolute home-directory
# path, so the notebook runs on other machines.
# NOTE(review): assumes '../../data' resolves to the repo's data/ folder, as it
# does for the other '../../data/SI_datasets/...' path used here -- confirm.
path_FLUTE = '../../data/SI_datasets/FLUTE_KO_TnSeq/'

In [112]:
# Per-screen tallies comparing calls re-derived from the FLUTE files with the
# new binary matrix.
list_num_both_CES = []
list_num_both_nonCES = []
list_num_CES_old_nonCES_new = []
list_num_nonCES_old_CES_new = []

for old_id, new_id in col_map_pairs_FLUTE:
    # Each screen has its own spreadsheet named 'H37Rv_<old ID>.xlsx'.
    df_SI = pd.read_excel(os.path.join(path_FLUTE, 'H37Rv_' + old_id + '.xlsx'))

    # Derive a 0/1 call in a new column named after the old ID:
    # 1 when p-adj <= 0.05 and |log2FC| >= 1, 0 when either threshold fails.
    # Rows matched by none of the three assignments (e.g. missing values) stay NaN.
    significant = df_SI['p-adj'] <= 0.05
    strong_effect = df_SI['log2FC'].abs() >= 1
    df_SI.loc[significant & strong_effect, old_id] = 1
    df_SI.loc[df_SI['p-adj'] > 0.05, old_id] = 0
    df_SI.loc[df_SI['log2FC'].abs() < 1, old_id] = 0
    df_SI = df_SI[['Rv_ID', 'Name', old_id]]

    # Inner join on Rv_ID: only genes present in both tables are compared.
    merged = df_SI.merge(df_bin[['Rv_ID', new_id]], how='inner', on='Rv_ID')
    old_is_1, old_is_0 = merged[old_id] == 1, merged[old_id] == 0
    new_is_1, new_is_0 = merged[new_id] == 1, merged[new_id] == 0

    list_num_both_CES.append(int((old_is_1 & new_is_1).sum()))
    list_num_both_nonCES.append(int((old_is_0 & new_is_0).sum()))
    list_num_CES_old_nonCES_new.append(int((old_is_1 & new_is_0).sum()))
    list_num_nonCES_old_CES_new.append(int((old_is_0 & new_is_1).sum()))

# One row per FLUTE screen pair.
df_map = pd.DataFrame({
    'screen_old_ID': [old_name for old_name, _ in col_map_pairs_FLUTE],
    'screen_new_ID': [new_name for _, new_name in col_map_pairs_FLUTE],
    'both_CES': list_num_both_CES,
    'both_nonCES': list_num_both_nonCES,
    'CES_old_nonCES_new': list_num_CES_old_nonCES_new,
    'nonCES_old_CES_new': list_num_nonCES_old_CES_new,
})

In [113]:
# Show the current comparison table.
df_map

Unnamed: 0,screen_old_ID,screen_new_ID,both_CES,both_nonCES,CES_old_nonCES_new,nonCES_old_CES_new
0,Rv0307c,dejesus_Rv0307c_day32_vs_dejesus_Rv0307c_day0,7,3433,543,7
1,Rv3916c,dejesus_Rv3916c_day32_vs_dejesus_Rv3916c_day0,10,3652,280,48
2,Rv0950,Rv0950c_KO_vs_CB_WT,13,3971,5,1
3,Rv0954,Rv0954_KO_vs_RJ_WT,40,3910,12,28
4,Rv1096,Rv1096_KO_vs_CB_WT,11,3972,7,0
5,Rv3005c,Rv3005c_KO_day32_vs_dejesus_H37Rv_day32,13,3532,445,0
6,Rv3594,Rv3594_KO_vs_Rubin_FLUTE_WT,24,3954,11,1
7,Rv3684,Rv3684_KO_vs_CB_WT,2,3987,1,0
8,Rv3717,Rv3717_KO_vs_Rubin_FLUTE_WT,67,3901,22,0
9,Rv3811,Rv3811_KO_vs_Rubin_FLUTE_WT,26,3949,15,0


* Why are the two datasets so different? 
* Did Michael use a different control screen?
* Where did those FLUTE files come from? 
    * They come from the .dat files I downloaded directly from FLUTE.

### Griffin cholesterol:

In [19]:
# Select the Griffin cholesterol pair by its old column ID rather than by
# position: col_map[0] depends on which version of col_map is live in the
# kernel, while the cells below hardcode the Griffin column names.
cols = [cm for cm in col_map if cm[0] == '2011_Griffin_2'][0]
df_bin_col = df_bin[['Rv_ID', cols[1]]].copy()
df_old_col = df_old[['Rv_ID', cols[0]]].copy()
# Inner join keeps only the Rv_IDs present in both tables.
df_bin_both = df_bin_col.merge(df_old_col, how = 'inner', on = 'Rv_ID')

In [20]:
# Split out the discordant genes: cols[0] is the old column, cols[1] the new one.
old_call = df_bin_both[cols[0]]
new_call = df_bin_both[cols[1]]

# Flagged 1 in the old table but 0 in the new one, and the reverse.
df_CES_old_nonCES_new = df_bin_both[(old_call == 1) & (new_call == 0)]
df_nonCES_old_CES_new = df_bin_both[(old_call == 0) & (new_call == 1)]

Get the gene names:

In [26]:
# Rv_ID -> gene Name lookup, taken from the 2017A DeJesus/Iorger table.
file_mbio = '../../data/SI_datasets/2017A_DeJesus_Iorger/table_1.xlsx'
df_mbio = pd.read_excel(file_mbio)[['Rv_ID', 'Name']]

In [35]:
# Attach gene names to the old=1 / new=0 genes and fix the column order.
df_CES_old_nonCES_new_wNames = (
    df_CES_old_nonCES_new
    .merge(df_mbio, how='inner', on='Rv_ID')
    [['Rv_ID', 'Name', 'griffin_cholesterol_vs_griffin_glycerol', '2011_Griffin_2']]
)

# Export without the row index.
file_out = '../../dep/data/sanity_check_griffin_CES_OLD_nonCES_NEW.csv'
df_CES_old_nonCES_new_wNames.to_csv(file_out, index=False)

In [36]:
# Attach gene names to the old=0 / new=1 genes and fix the column order.
df_nonCES_old_CES_new_wNames = (
    df_nonCES_old_CES_new
    .merge(df_mbio, how='inner', on='Rv_ID')
    [['Rv_ID', 'Name', 'griffin_cholesterol_vs_griffin_glycerol', '2011_Griffin_2']]
)

# Export without the row index.
file_out = '../../dep/data/sanity_check_griffin_nonCES_OLD_CES_NEW.csv'
df_nonCES_old_CES_new_wNames.to_csv(file_out, index=False)

In [38]:
# Show the name-annotated genes flagged 1 in the old Griffin column but 0 in the new one.
df_CES_old_nonCES_new_wNames

Unnamed: 0,Rv_ID,Name,griffin_cholesterol_vs_griffin_glycerol,2011_Griffin_2
0,Rv0009,ppiA,0.0,1
1,Rv0153c,ptbB,0.0,1
2,Rv0202c,mmpL11,0.0,1
3,Rv0655,mkl,0.0,1
4,Rv0695,-,0.0,1
5,Rv0696,-,0.0,1
6,Rv0876c,-,0.0,1
7,Rv1071c,echA9,0.0,1
8,Rv1084,-,0.0,1
9,Rv1096,-,0.0,1


### Korte 2016:

In [59]:
# First (old, new) pair whose old ID mentions 'Korte'.
korte_pairs = [pair for pair in col_map if 'Korte' in pair[0]]
cols = korte_pairs[0]

# Inner join on Rv_ID: new binary column first, then the old column.
df_bin_col = df_bin[['Rv_ID', cols[1]]].copy()
df_old_col = df_old[['Rv_ID', cols[0]]].copy()
df_bin_both = pd.merge(df_bin_col, df_old_col, how='inner', on='Rv_ID')

CES in the old dataset, but not the new one: 

In [63]:
# Rv_IDs flagged 1 in the old column (cols[0]) but 0 in the new one (cols[1]).
df_bin_both.loc[
    (df_bin_both[cols[0]] == 1) & (df_bin_both[cols[1]] == 0), 'Rv_ID'
].values

array(['Rv0012', 'Rv0092', 'Rv0166', 'Rv0169', 'Rv0191', 'Rv0210',
       'Rv0234c', 'Rv0244c', 'Rv0263c', 'Rv0270', 'Rv0279c', 'Rv0280',
       'Rv0281', 'Rv0386', 'Rv0443', 'Rv0449c', 'Rv0483', 'Rv0492c',
       'Rv0501', 'Rv0570', 'Rv0574c', 'Rv0613c', 'Rv0630c', 'Rv0727c',
       'Rv0746', 'Rv0747', 'Rv0754', 'Rv0767c', 'Rv0782', 'Rv0861c',
       'Rv0877', 'Rv0889c', 'Rv0890c', 'Rv0914c', 'Rv0976c', 'Rv1180',
       'Rv1181', 'Rv1206', 'Rv1212c', 'Rv1232c', 'Rv1235', 'Rv1236',
       'Rv1238', 'Rv1244', 'Rv1262c', 'Rv1266c', 'Rv1283c', 'Rv1323',
       'Rv1442', 'Rv1632c', 'Rv1710', 'Rv1727', 'Rv1737c', 'Rv1745c',
       'Rv1768', 'Rv1770', 'Rv1795', 'Rv1796', 'Rv1820', 'Rv1823',
       'Rv1836c', 'Rv1840c', 'Rv1864c', 'Rv1902c', 'Rv1908c', 'Rv1910c',
       'Rv1971', 'Rv1991c', 'Rv2000', 'Rv2052c', 'Rv2065', 'Rv2066',
       'Rv2074', 'Rv2115c', 'Rv2159c', 'Rv2209', 'Rv2214c', 'Rv2224c',
       'Rv2236c', 'Rv2241', 'Rv2328', 'Rv2329c', 'Rv2394', 'Rv2458',
       'Rv2487c', 'Rv255