In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# Load new matrices: 

In [2]:
# Directory holding the standardized result matrices.
path_data = '../../data/standardized_data/'
# Three matrices from the 2023_02_20 processing run; going by the file names they hold
# log fold-changes, q-values and binary calls respectively.
file_log2fc = os.path.join(path_data, 'result_logfc_matrix_2023_02_20_CC_BxD_processed.csv')
file_qval = os.path.join(path_data, 'result_qval_matrix_2023_02_20_CC_BxD_processed.csv')
file_bin = os.path.join(path_data, 'result_bin_matrix_2023_02_20_CC_BxD_processed.csv')

# Binary matrix of the SI-derived dataset, loaded below as df_old (the comparison baseline).
file_SI = '../../data/SI_datasets/SI_bin.csv'

df_log2fc = pd.read_csv(file_log2fc)
df_qval = pd.read_csv(file_qval)
df_bin = pd.read_csv(file_bin)

df_old = pd.read_csv(file_SI)


In [3]:
df_bin.shape

(4055, 147)

In [4]:
cols_new = df_bin.columns.tolist()[1:]

# Comparing all screens:

In [5]:
file_col_desc = '../../data/meta_data/column_descriptors_standardized_w_mBio_101823.xlsx'

# Two publications are left out of this analysis because their datasets were
# processed directly with C. Smith.
excluded_first_authors = ['Smith', 'Meade']

df_col_desc = pd.read_excel(file_col_desc)
is_excluded = df_col_desc['first_author'].isin(excluded_first_authors)
df_col_desc = df_col_desc.loc[~is_excluded].copy()

In [6]:
# (old SI column ID, new standardized column ID) for every described screen.
col_map = list(zip(df_col_desc['column_ID_SI'].values, df_col_desc['column_ID_2'].values))
# Keep the pairs whose old ID is an actual column of the old matrix.
col_map_pairs = [(old_id, new_id) for old_id, new_id in col_map if old_id in df_old.columns]

PENDING: increase this set of mapped screens!

In [7]:
col_map_pairs

[('2003A_Sassetti', nan),
 ('2003B_Sassetti', nan),
 ('2005_Rengarajan', nan),
 ('2006_Joshi_GI_1', nan),
 ('2006_Joshi_GI_2', nan),
 ('2011_Griffin_2', 'griffin_cholesterol_vs_griffin_glycerol'),
 ('2012_Zhang', nan),
 ('2013_Zhang_1A', 'zhang_wt_mouse_d10_vs_zhang_input_library'),
 ('2013_Zhang_1B', 'zhang_wt_mouse_d45_vs_zhang_input_library'),
 ('2013_Zhang_2', 'zhang_mhcii_mouse_d45_vs_zhang_wt_mouse_d45'),
 ('2013_Zhang_3A', 'zhang_Tyloxapol_pH_6.5_vs_zhang_Tyloxapol_pH_4.5'),
 ('2013_Zhang_3B', 'zhang_Tyloxapol_pH_6.5_vs_zhang_pcit_pH_4.5'),
 ('2013_Zhang_3C', 'zhang_DETA-NO_pH_7.0_vs_zhang_pH_7.0_no_NO_control'),
 ('2013_Zhang_3D', 'zhang_Trp_Rescue_vs_zhang_in_vitro_control_Rescue'),
 ('2015_Kieser_GI_1', 'kieser_dPonA1_vs_mbio_H37Rv'),
 ('2015_Kieser_GI_2', nan),
 ('2015_Kieser_GI_3', nan),
 ('2015_Mendum', nan),
 ('2016_Nambi', 'nambi_2015_ctpC_vs_nambi_2015_wt'),
 ('2016_Korte', 'korte_2016_otsa_trehalose_vs_korte_2016_otsa_7h9'),
 ('2017B_DeJesus_1A', nan),
 ('2017B_DeJesus

In [8]:
# A pair is usable only when both the old and the new ID are strings (unmapped IDs are NaN).
col_map_pairs = [cm for cm in col_map if all(type(name) == str for name in cm)]

In [9]:
len(col_map_pairs)

28

#### Which screens are we missing to pair up with the new dataset? 

In [10]:
# Complement of the usable pairs: at least one of the two IDs is not a string.
col_map_missing = [pair for pair in col_map if not (type(pair[0]) == str and type(pair[1]) == str)]
df_col_map_missing = pd.DataFrame({
    'SI_name': [pair[0] for pair in col_map_missing],
    'standardized_name': [pair[1] for pair in col_map_missing],
})
df_col_map_missing.shape

(48, 2)

##### Also store the column names for your old dataset: 

In [11]:
file_col_names_old = '../../dep/data/column_names_old_08022020.xlsx'

# Single-column table listing every column name of the old matrix.
df_col_names_old = pd.DataFrame({'old_DB_names': df_old.columns})
df_col_names_old.to_excel(file_col_names_old, index = False)

### Sanity check for the screens for which you have a column name mapping (old-to-new datasets):

In [12]:
df_old_col = df_old[['Rv_ID', '2013_Zhang_1A']]

In [13]:
# One tally list per agreement category, filled in the order of col_map_pairs.
list_num_both_CES, list_num_both_nonCES = [], []
list_num_CES_old_nonCES_new, list_num_nonCES_old_CES_new = [], []

for cols in col_map_pairs:
    print(cols)
    old_id, new_id = cols

    # Only genes (Rv_ID) present in both matrices are compared.
    df_bin_col = df_bin[['Rv_ID', new_id]].copy()
    df_old_col = df_old[['Rv_ID', old_id]].copy()
    df_bin_both = df_bin_col.merge(df_old_col, how = 'inner', on = 'Rv_ID')

    old_call = df_bin_both[old_id]
    new_call = df_bin_both[new_id]

    # Rows where either call is neither 0 nor 1 (e.g. NaN) fall into no category.
    num_both_CES = int(((old_call == 1) & (new_call == 1)).sum())
    num_both_nonCES = int(((old_call == 0) & (new_call == 0)).sum())
    num_CES_old_nonCES_new = int(((old_call == 1) & (new_call == 0)).sum())
    num_nonCES_old_CES_new = int(((old_call == 0) & (new_call == 1)).sum())

    list_num_both_CES.append(num_both_CES)
    list_num_both_nonCES.append(num_both_nonCES)
    list_num_CES_old_nonCES_new.append(num_CES_old_nonCES_new)
    list_num_nonCES_old_CES_new.append(num_nonCES_old_CES_new)

# One row per screen pair, one column per agreement category.
df_map = pd.DataFrame({
    'screen_old_ID': [pair[0] for pair in col_map_pairs],
    'screen_new_ID': [pair[1] for pair in col_map_pairs],
    'CE_consensus': list_num_both_CES,
    'notCE_consensus': list_num_both_nonCES,
    'CE_SI_notCE_standardized': list_num_CES_old_nonCES_new,
    'notCE_SI_CE_standardized': list_num_nonCES_old_CES_new,
})

('2011_Griffin_2', 'griffin_cholesterol_vs_griffin_glycerol')
('2013_Zhang_1A', 'zhang_wt_mouse_d10_vs_zhang_input_library')
('2013_Zhang_1B', 'zhang_wt_mouse_d45_vs_zhang_input_library')
('2013_Zhang_2', 'zhang_mhcii_mouse_d45_vs_zhang_wt_mouse_d45')
('2013_Zhang_3A', 'zhang_Tyloxapol_pH_6.5_vs_zhang_Tyloxapol_pH_4.5')
('2013_Zhang_3B', 'zhang_Tyloxapol_pH_6.5_vs_zhang_pcit_pH_4.5')
('2013_Zhang_3C', 'zhang_DETA-NO_pH_7.0_vs_zhang_pH_7.0_no_NO_control')
('2013_Zhang_3D', 'zhang_Trp_Rescue_vs_zhang_in_vitro_control_Rescue')
('2015_Kieser_GI_1', 'kieser_dPonA1_vs_mbio_H37Rv')
('2016_Nambi', 'nambi_2015_ctpC_vs_nambi_2015_wt')
('2016_Korte', 'korte_2016_otsa_trehalose_vs_korte_2016_otsa_7h9')
('2017_Xu_1A', 'xu_van_16_vs_xu_van_0')
('2017_Xu_1B', 'xu_rif_4_vs_xu_rif_0')
('2017_Xu_1C', 'xu_inh_02_vs_xu_inh_0')
('2017_Xu_1D', 'xu_emb_2.5_vs_xu_emb_0')
('2017_Xu_1E', 'xu_mero_2.5_vs_xu_mero_0')
('2017_Mishra_1', 'mishra_C3H_vs_mishra_B6')
('2017_Mishra_2', 'mishra_NOS2_vs_mishra_B6')
('2018

In [14]:
df_map.sum()

screen_old_ID               2011_Griffin_22013_Zhang_1A2013_Zhang_1B2013_Z...
screen_new_ID               griffin_cholesterol_vs_griffin_glycerolzhang_w...
CE_consensus                                                             1157
notCE_consensus                                                        107952
CE_SI_notCE_standardized                                                 1205
notCE_SI_CE_standardized                                                 1292
dtype: object

In [15]:
import numpy as np

In [16]:
# Totals are read from df_map instead of being typed in from its printed sums,
# so the fractions stay correct when the input matrices change.
consensus = int(df_map['CE_consensus'].sum() + df_map['notCE_consensus'].sum())
non_consensus = int(df_map['CE_SI_notCE_standardized'].sum() + df_map['notCE_SI_CE_standardized'].sum())
total = consensus + non_consensus

# Share of all compared (gene, screen) calls on which the two datasets agree / disagree.
frac_consensus = consensus / total
frac_non_consensus = non_consensus / total

np.round(frac_non_consensus, 3)

0.022

In [17]:
# Counts are read from df_map instead of being typed in from its printed sums,
# so the fractions stay correct when the input matrices change.
CE_consensus = int(df_map['CE_consensus'].sum())
CE_SI_notCE_standardized = int(df_map['CE_SI_notCE_standardized'].sum())
notCE_SI_CE_standardized = int(df_map['notCE_SI_CE_standardized'].sum())

# Denominator: every call that is CE in at least one of the two datasets.
CE_all = CE_consensus + CE_SI_notCE_standardized + notCE_SI_CE_standardized
frac_CE_concensus = CE_consensus / CE_all
frac_CE_nonconcensus = (CE_SI_notCE_standardized + notCE_SI_CE_standardized) / CE_all

print(np.round(frac_CE_concensus, 2))

print(np.round(frac_CE_nonconcensus, 2))

0.32
0.68


In [19]:
# Persist the per-screen agreement table (df_map) in both CSV and Excel form.
file_out = '../../data/sanity_check_09012020.csv'
file_out_xls = '../../data/sanity_check_09012020.xlsx'

# index = False: the row index is not written to either file.
df_map.to_csv(file_out, index = False)
df_map.to_excel(file_out_xls, index = False)

# Dig into more detail for a few subsets of screens:

## FLUTE database TnSeq screens: 

In [None]:
col_map_pairs_FLUTE = [cm for cm in col_map_pairs if 'Rv' in cm[0] or 'marP' in cm[0]]

In [None]:
# Kept relative to the notebook (like file_SI) so it resolves on any checkout of the
# repository rather than only under one user's home directory.
path_FLUTE = '../../data/SI_datasets/FLUTE_KO_TnSeq/'

In [None]:
# One tally list per agreement category, filled in the order of col_map_pairs_FLUTE.
list_num_both_CES, list_num_both_nonCES = [], []
list_num_CES_old_nonCES_new, list_num_nonCES_old_CES_new = [], []

for cols in col_map_pairs_FLUTE:
    old_id, new_id = cols
    file_SI = os.path.join(path_FLUTE, 'H37Rv_' + old_id + '.xlsx')
    df_SI = pd.read_excel(file_SI)

    # Binary call from the FLUTE table: 1 needs both p-adj <= 0.05 and |log2FC| >= 1,
    # failing either threshold gives 0; rows matching neither rule stay NaN.
    padj = df_SI['p-adj']
    abs_log2fc = df_SI['log2FC'].abs()
    df_SI.loc[(padj <= 0.05) & (abs_log2fc >= 1), old_id] = 1
    df_SI.loc[(padj > 0.05) | (abs_log2fc < 1), old_id] = 0
    df_SI = df_SI[['Rv_ID', 'Name', old_id]]

    # Only genes (Rv_ID) present in both tables are compared.
    df_bin_col = df_bin[['Rv_ID', new_id]].copy()
    df_bin_both = df_SI.merge(df_bin_col, how = 'inner', on = 'Rv_ID')

    old_call = df_bin_both[old_id]
    new_call = df_bin_both[new_id]

    num_both_CES = int(((old_call == 1) & (new_call == 1)).sum())
    num_both_nonCES = int(((old_call == 0) & (new_call == 0)).sum())
    num_CES_old_nonCES_new = int(((old_call == 1) & (new_call == 0)).sum())
    num_nonCES_old_CES_new = int(((old_call == 0) & (new_call == 1)).sum())

    list_num_both_CES.append(num_both_CES)
    list_num_both_nonCES.append(num_both_nonCES)
    list_num_CES_old_nonCES_new.append(num_CES_old_nonCES_new)
    list_num_nonCES_old_CES_new.append(num_nonCES_old_CES_new)

# One row per FLUTE screen pair, one column per agreement category.
df_map = pd.DataFrame({
    'screen_old_ID': [pair[0] for pair in col_map_pairs_FLUTE],
    'screen_new_ID': [pair[1] for pair in col_map_pairs_FLUTE],
    'both_CES': list_num_both_CES,
    'both_nonCES': list_num_both_nonCES,
    'CES_old_nonCES_new': list_num_CES_old_nonCES_new,
    'nonCES_old_CES_new': list_num_nonCES_old_CES_new,
})

In [None]:
df_map

* Why are the two datasets so different? 
* Did Michael use a different control screen?
* Where did those FLUTE files come from? 
    * They come from the .dat files I downloaded directly from FLUTE.

### Griffin cholesterol:

In [None]:
# Select the Griffin cholesterol screen by its old ID from the validated pairs.
# Indexing col_map[0] is unreliable: col_map also contains screens with no mapping,
# whose new ID is NaN and cannot be used as a df_bin column name.
cols = [cm for cm in col_map_pairs if cm[0] == '2011_Griffin_2'][0]

# Only genes (Rv_ID) present in both matrices are kept.
df_bin_col = df_bin[['Rv_ID', cols[1]]].copy()
df_old_col = df_old[['Rv_ID', cols[0]]].copy()
df_bin_both = df_bin_col.merge(df_old_col, how = 'inner', on = 'Rv_ID')

In [None]:
old_call = df_bin_both[cols[0]]
new_call = df_bin_both[cols[1]]

# Genes on which the old and new calls disagree, split by direction.
df_CES_old_nonCES_new = df_bin_both.loc[(old_call == 1) & (new_call == 0)]
df_nonCES_old_CES_new = df_bin_both.loc[(old_call == 0) & (new_call == 1)]

Get the gene names:

In [None]:
file_mbio = '../../data/SI_datasets/2017A_DeJesus_Iorger/table_1.xlsx'
# Only the ID-to-name lookup columns of this table are needed.
df_mbio = pd.read_excel(file_mbio)[['Rv_ID', 'Name']]

In [None]:
# Attach gene names, keep a fixed column order, and export.
cols_report = ['Rv_ID', 'Name', 'griffin_cholesterol_vs_griffin_glycerol', '2011_Griffin_2']
df_CES_old_nonCES_new_wNames = pd.merge(df_CES_old_nonCES_new, df_mbio, how = 'inner', on = 'Rv_ID')[cols_report]

file_out = '../../dep/data/sanity_check_griffin_CES_OLD_nonCES_NEW.csv'
df_CES_old_nonCES_new_wNames.to_csv(file_out, index = False)

In [None]:
# Attach gene names, keep a fixed column order, and export.
cols_report = ['Rv_ID', 'Name', 'griffin_cholesterol_vs_griffin_glycerol', '2011_Griffin_2']
df_nonCES_old_CES_new_wNames = pd.merge(df_nonCES_old_CES_new, df_mbio, how = 'inner', on = 'Rv_ID')[cols_report]

file_out = '../../dep/data/sanity_check_griffin_nonCES_OLD_CES_NEW.csv'
df_nonCES_old_CES_new_wNames.to_csv(file_out, index = False)

In [None]:
df_CES_old_nonCES_new_wNames

### Korte 2016:

In [None]:
# Search the validated pairs only: col_map also holds entries whose IDs are NaN,
# on which the substring test raises TypeError, and unmapped screens with no new column.
cols = [cm for cm in col_map_pairs if 'Korte' in cm[0]][0]

# Only genes (Rv_ID) present in both matrices are kept.
df_bin_col = df_bin[['Rv_ID', cols[1]]].copy()
df_old_col = df_old[['Rv_ID', cols[0]]].copy()
df_bin_both = df_bin_col.merge(df_old_col, how = 'inner', on = 'Rv_ID')

CES in the old dataset, but not the new one: 

In [None]:
df_bin_both[ (df_bin_both[cols[0]]==1) & (df_bin_both[cols[1]]==0) ].Rv_ID.values

## Analysis of Carey datasets:

Load Carey et al. SI file on excluded repetitive regions:

In [None]:
path_temp = '../../dep/data/'
file_rep = os.path.join(path_temp, 'Carey_S1_Table.xlsx')

# Rename the gene-ID header to the 'Rv_ID' key used by the other tables.
df_rep = pd.read_excel(file_rep).rename(columns = {'Rv Number': 'Rv_ID'})
df_rep.shape

Deleted genes: 

In [None]:
file_del = os.path.join(path_temp, 'Carey_S2_Table.xlsx')
dict_df_del = {}

# The context manager closes the workbook handle once every sheet has been parsed.
with pd.ExcelFile(file_del) as xl:
    sheets = xl.sheet_names  # see all sheet names
    for sheet in sheets:
        # One DataFrame per sheet, with the gene-ID header renamed to 'Rv_ID'.
        df_temp = xl.parse(sheet).rename(columns = {'Rv Number': 'Rv_ID'})
        dict_df_del[sheet] = df_temp

Duplicated regions

In [None]:
# Build a lookup from each Rv_ID to a suffix-free locus number string.
df_rv_id_num = df_bin[['Rv_ID']].copy()
# Drop the first two characters of the ID, then strip 'c', 'A' and 'B' characters
# from the ends, in that order.
# NOTE(review): presumably IDs look like 'Rv1234', 'Rv1234c' or 'Rv1234A' — confirm
# that no other suffixes occur in df_bin.
rv_num = [ rv[2:].strip('c').strip('A').strip('B') for rv in df_rv_id_num.Rv_ID.values]
df_rv_id_num['rv_num'] = rv_num
df_rv_id_num.head(2)

In [None]:
dict_list_dup = {}

(PENDING): Explain what this is: 

In [None]:
# (first locus number, strains sharing that range); both ranges end at locus 3428.
for first_locus, strains in ((3219, ('621', '631', '632')), (3188, ('662', '667'))):
    list_dup_temp_num = [str(i) for i in range(first_locus, 3429)]
    list_dup_temp_rv = df_rv_id_num[df_rv_id_num.rv_num.isin(list_dup_temp_num)].Rv_ID.tolist()
    for strain in strains:
        dict_list_dup[strain] = list_dup_temp_rv

# Strains for which no genes are listed.
for strain in ('630', '641', '663'):
    dict_list_dup[strain] = []


## Let's compare old Carey results vs. new Carey results: 

### Load older Carey datasets: 

In [None]:
cols_old = [col for col in df_old.columns if 'Carey' in col]

In [None]:
df_old.shape

In [None]:
# New-dataset Carey columns are picked by the substring 'carey_rv' in their name.
cols_new = [col for col in df_bin.columns if 'carey_rv' in col]
# cols_new = [col for col in df_bin.columns if 'carey' in col and 'mbio' in col]
# Old and new columns are paired purely by position, and zip() silently stops at the
# shorter list.
# NOTE(review): confirm cols_old and cols_new list the strains in the same order.
# This also rebinds col_map, which earlier held the full old-to-new screen mapping.
col_map = list(zip(cols_old, cols_new))
col_map

How do I exclude genes in the repetitive regions list?

In [None]:
# One tally list per agreement category, filled in the order of col_map.
list_num_both_CES = []
list_num_both_nonCES = []
list_num_CES_old_nonCES_new = []
list_num_nonCES_old_CES_new = []

for cols in col_map:
    # The strain ID is the second '_'-separated token of the new column name.
    key_strain = cols[1].split('_')[1]

    df_bin_col = df_bin[['Rv_ID', cols[1]]].copy()
    df_old_col = df_old[['Rv_ID', cols[0]]].copy()

    # Only genes (Rv_ID) present in both matrices are compared.
    df_bin_both = df_bin_col.merge(df_old_col, how = 'inner', on = 'Rv_ID')

    # excluding repetitive genes (same across all clinical strains)
    df_bin_both = df_bin_both[~df_bin_both.Rv_ID.isin(df_rep.Rv_ID)]

    # excluding deleted genes (strain specific)
    df_del = dict_df_del[key_strain]
    df_bin_both = df_bin_both[~df_bin_both.Rv_ID.isin(df_del.Rv_ID)]
    # Reports the size of the deletion list, not the number of rows actually dropped.
    print('Excluded', df_del.shape[0], 'genes deleted in strain:', key_strain )

    # excluding genes in duplicated regions (looked up per strain)
    list_dup = dict_list_dup[key_strain]
    df_bin_both = df_bin_both[~df_bin_both.Rv_ID.isin(list_dup)]
    print('Excluded', len(list_dup), 'genes in duplicated region of strain:', key_strain, '\n' )

    # Call masks, computed once and reused for the export and the tallies.
    is_CES_old = df_bin_both[cols[0]] == 1
    is_nonCES_old = df_bin_both[cols[0]] == 0
    is_CES_new = df_bin_both[cols[1]] == 1
    is_nonCES_new = df_bin_both[cols[1]] == 0

    # Export the disagreeing genes, one sheet per direction. The context manager
    # saves and closes the workbook; ExcelWriter.save() no longer exists in pandas 2.x.
    file_out = os.path.join('../../dep/data/', cols[1]+'.xlsx')
    with pd.ExcelWriter(file_out, engine='xlsxwriter') as writer:
        df_bin_both[is_CES_old & is_nonCES_new].to_excel(writer, sheet_name = 'CES_old_nonCES_new', index = False)
        df_bin_both[is_nonCES_old & is_CES_new].to_excel(writer, sheet_name = 'nonCES_old_CES_new', index = False)

    num_both_CES = int((is_CES_old & is_CES_new).sum())
    num_both_nonCES = int((is_nonCES_old & is_nonCES_new).sum())
    num_CES_old_nonCES_new = int((is_CES_old & is_nonCES_new).sum())
    num_nonCES_old_CES_new = int((is_nonCES_old & is_CES_new).sum())

    list_num_both_CES.append(num_both_CES)
    list_num_both_nonCES.append(num_both_nonCES)
    list_num_CES_old_nonCES_new.append(num_CES_old_nonCES_new)
    list_num_nonCES_old_CES_new.append(num_nonCES_old_CES_new)

# One row per Carey screen, one column per agreement category.
df_map = pd.DataFrame()
df_map['screen'] = [col[1] for col in col_map]
df_map['both_CES'] = list_num_both_CES
df_map['num_both_nonCES'] = list_num_both_nonCES
df_map['num_CES_old_nonCES_new'] = list_num_CES_old_nonCES_new
df_map['num_nonCES_old_CES_new'] = list_num_nonCES_old_CES_new

In [None]:
file_out = '../../dep/data/sanity_check_Carey_no_reps_09_2020.csv'
df_map.to_csv(file_out, index = False)

# 08/01/2020 from Michael: 
#### "comparing one of the conditions, with different flags for the resampling (e.g. LOESS, ignoring N/C terminal sites), which may explain discrepancies." 

In [None]:
path_data_temp = '../../data/standardized_data/'

# Three resampling outputs for strain 663 that differ only in the flags used.
file_663_1 = os.path.join(path_data_temp, 'result_resampling_carey_663_vs_carey_rv_LOESS.txt')
file_663_2 = os.path.join(path_data_temp, 'result_resampling_carey_663_vs_carey_rv_LOESS_term15.txt')
file_663_3 = os.path.join(path_data_temp, 'result_resampling_carey_663_vs_carey_rv_LOESS_term15_2.txt')
list_files = [file_663_1, file_663_2, file_663_3]

# Short label per file: '663_' plus the text after the last 'rv_' and before the first '.'.
list_file_names = []
for path_file in list_files:
    flags = path_file.rpartition('rv_')[2].partition('.')[0]
    list_file_names.append('663_' + flags)

Testing with one dataset:

In [None]:
def file_to_bin(file, col, qval_thresh=0.05, log2fc_thresh=1):
    """Read one resampling output file and derive a binary call per gene.

    The file is read as tab-separated text after skipping its first six lines, and
    must provide the columns '#Orf', 'log2FC' and 'Adj. p-value'.

    Parameters
    ----------
    file : str
        Path of the resampling output file.
    col : str
        Name given to the value column in each of the three returned frames.
    qval_thresh : float, default 0.05
        A gene can only be called 1 when its adjusted p-value is <= this value.
    log2fc_thresh : float, default 1
        A gene can only be called 1 when |log2FC| is >= this value.

    Returns
    -------
    (df_bin, df_log2fc, df_qvals) : tuple of DataFrame
        Each has the columns ['Rv_ID', col]. In df_bin the value is 1 when both
        thresholds are met, 0 when either one is failed, and NaN when a missing
        input leaves the call undetermined.
    """
    df = pd.read_csv(file, sep = '\t', skiprows=[0, 1, 2, 3, 4, 5])
    df = df.rename(columns = {'#Orf': 'Rv_ID'})[['Rv_ID', 'log2FC', 'Adj. p-value']]

    df_qvals = df[['Rv_ID', 'Adj. p-value']].rename(columns = {'Adj. p-value': col})
    df_log2fc = df[['Rv_ID', 'log2FC']].rename(columns = {'log2FC': col})

    qvals = df_qvals[col]
    abs_log2fc = df_log2fc[col].abs()

    # Start from NaN rather than from a copy of the q-values: otherwise a row matched
    # by neither rule below (missing log2FC, significant q-value) would keep its raw
    # q-value as a "binary" call.
    df_bin = df_qvals[['Rv_ID']].copy()
    df_bin[col] = float('nan')
    df_bin.loc[(qvals <= qval_thresh) & (abs_log2fc >= log2fc_thresh), col] = 1
    df_bin.loc[(qvals > qval_thresh) | (abs_log2fc < log2fc_thresh), col] = 0

    return df_bin, df_log2fc, df_qvals


In [None]:
#file = list_files[0]
# One tally list per agreement category, filled in the order of list_files.
list_num_both_CES = []
list_num_both_nonCES = []
list_num_CES_old_nonCES_new = []
list_num_nonCES_old_CES_new = []

# Select the strain-663 (old, new) column pair explicitly instead of reusing whatever
# `cols` an earlier loop left behind; the strain ID is the second '_'-separated token
# of the new column name.
cols = [cm for cm in col_map if cm[1].split('_')[1] == '663'][0]
col = cols[-1]
key_strain = cols[1].split('_')[1]

for i, file in enumerate(list_files):
    print(file)
    # Dedicated names keep the full df_bin / df_log2fc matrices loaded at the top of
    # the notebook intact, so earlier cells can still be re-run afterwards.
    df_bin_663, df_log2fc_663, df_qvals_663 = file_to_bin(file, col)

    # Compare SI data vs. Michael's new data (genes present in both only):
    df_bin_col = df_bin_663[['Rv_ID', cols[1]]].copy()
    df_old_col = df_old[['Rv_ID', cols[0]]].copy()
    df_bin_both = df_bin_col.merge(df_old_col, how = 'inner', on = 'Rv_ID')

    # excluding repetitive genes (same across all clinical strains)
    df_bin_both = df_bin_both[~df_bin_both.Rv_ID.isin(df_rep.Rv_ID)]

    # excluding deleted genes (strain specific)
    df_del = dict_df_del[key_strain]
    df_bin_both = df_bin_both[~df_bin_both.Rv_ID.isin(df_del.Rv_ID)]
    # Reports the size of the deletion list, not the number of rows actually dropped.
    print('Excluded', df_del.shape[0], 'genes deleted in strain:', key_strain )

    # excluding genes in duplicated regions (looked up per strain)
    list_dup = dict_list_dup[key_strain]
    df_bin_both = df_bin_both[~df_bin_both.Rv_ID.isin(list_dup)]
    print('Excluded', len(list_dup), 'genes in duplicated region of strain:', key_strain, '\n' )

    # Call masks, computed once and reused for the export and the tallies.
    is_CES_old = df_bin_both[cols[0]] == 1
    is_nonCES_old = df_bin_both[cols[0]] == 0
    is_CES_new = df_bin_both[cols[1]] == 1
    is_nonCES_new = df_bin_both[cols[1]] == 0

    # Export the disagreeing genes, one sheet per direction. The context manager
    # saves and closes the workbook; ExcelWriter.save() no longer exists in pandas 2.x.
    file_out = os.path.join('../../dep/data/', list_file_names[i]+'.xlsx')
    with pd.ExcelWriter(file_out, engine='xlsxwriter') as writer:
        df_bin_both[is_CES_old & is_nonCES_new].to_excel(writer, sheet_name = 'CES_old_nonCES_new', index = False)
        df_bin_both[is_nonCES_old & is_CES_new].to_excel(writer, sheet_name = 'nonCES_old_CES_new', index = False)

    num_both_CES = int((is_CES_old & is_CES_new).sum())
    num_both_nonCES = int((is_nonCES_old & is_nonCES_new).sum())
    num_CES_old_nonCES_new = int((is_CES_old & is_nonCES_new).sum())
    num_nonCES_old_CES_new = int((is_nonCES_old & is_CES_new).sum())

    list_num_both_CES.append(num_both_CES)
    list_num_both_nonCES.append(num_both_nonCES)
    list_num_CES_old_nonCES_new.append(num_CES_old_nonCES_new)
    list_num_nonCES_old_CES_new.append(num_nonCES_old_CES_new)
    

In [None]:
# One row per resampling variant, one column per agreement category.
df_map = pd.DataFrame({
    'screen': list_file_names,
    'both_CES': list_num_both_CES,
    'num_both_nonCES': list_num_both_nonCES,
    'num_CES_old_nonCES_new': list_num_CES_old_nonCES_new,
    'num_nonCES_old_CES_new': list_num_nonCES_old_CES_new,
})

file_out = '../../dep/data/sanity_check_Carey_663_08012020.csv'
df_map.to_csv(file_out, index = False)