# Data wrangling for *Mtb*-Tn-Mat project

* reads in SI tables / files and compiles them into a pandas DataFrame
* version 2.0: goal is to have 3 matrices:
    1. log2FC matrix
    2. q-value matrix

## Import modules:

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from statsmodels.stats.multitest import multipletests
from Tn_data_wrangling import *

%load_ext autoreload
%autoreload 2

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None  # default='warn'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Read in full set of Rv_IDs and gene_names from tuberculist annotation: 

In [5]:
# Base directory (two levels above the working directory) and its 'data' sub-directory.
root_dir = '../..'
data_path = os.path.join(root_dir, 'data')

In [6]:
# Load the tuberculist functional-category workbook and keep only the
# gene identifier columns (Rv_ID and gene_name).
file_WG = 'annotations/H37Rv_tuberculist_functional_categories.xlsx'
wg_columns = ['Rv_ID', 'gene_name']
df_WG = pd.read_excel(os.path.join(data_path, file_WG))[wg_columns]
df_WG.head(3)

Unnamed: 0,Rv_ID,gene_name
0,Rv0001,dnaA
1,Rv0002,dnaN
2,Rv0003,recF


## The first set of cases:

These are the SI files and tables (19 columns) that share the following format:

They list the set of genes that are called (conditionally) essential in the Tn screen.

In [13]:
# Read the mapping from each 'simple' SI file (path/file_name) to its target
# column name and to the q-value / ratio columns to pull from that file.
col_name_file = os.path.join( data_path, 'files_and_columns.csv') 
df_col_info_set1 = pd.read_csv(col_name_file)
df_col_info_set1.head(2)

Unnamed: 0,file,col_name,q_val_col_name,ratio_col_name,is_ratio_log2FC
0,2003_Sassetti_Boyd_Rubin/table_1.xls,2003A_Sassetti,p_val,ratio,False
1,2003_Sassetti_Rubin/table_3.xls,2003B_Sassetti,Week_8_P_value,Week_8_Ratio,False


#### Building q-value and log2FC matrix for first set of files: 

In [14]:
# Directory against which the SI file paths from the column-info table are resolved.
tn_data_path = os.path.join(root_dir, 'data/Tn_datasets')

In [18]:
# Build the combined q-value / log2FC table, one SI file at a time.
for file_idx, file_in in enumerate(df_col_info_set1['file']):
    print(file_in)

    # Source column names in the SI file plus the target column names
    # used in the Tn-Matrix.
    (qval_col,
     log2FC_col,
     is_ratio_log2FC,
     col_name,
     target_qval_col,
     target_log2FC_col) = get_col_names(file_in, df_col_info_set1)

    # Pull the q-value / ratio data out of the SI file.
    si_file_path = os.path.join(tn_data_path, file_in)
    df_tn_qval_log2FC = build_q_val_ratio_df(
        si_file_path, col_name, qval_col, log2FC_col, is_ratio_log2FC)

    # Merge with the whole genome (Rv-IDs and gene-names for all of H37Rv).
    df_tn_qval_log2FC_WG = merge_with_whole_genome_qval_log2FC(
        df_WG, target_qval_col, target_log2FC_col, df_tn_qval_log2FC)

    # The first file seeds the table; every later file is merged into it.
    if file_idx == 0:
        df_tn_qval_log2FC_ALL = df_tn_qval_log2FC_WG.copy()
    else:
        df_tn_qval_log2FC_ALL = df_tn_qval_log2FC_ALL.merge(
            df_tn_qval_log2FC_WG, how='inner', on=['Rv_ID', 'gene_name'])

2003_Sassetti_Boyd_Rubin/table_1.xls
2003_Sassetti_Rubin/table_3.xls
2005_Rengarajan_Rubin/table_2.xls
2006_Joshi_Sassetti/table_1.xlsx
2006_Joshi_Sassetti/table_2.xlsx
2011_Griffin_Sassetti/table_4.xlsx
2013_Zhang_Rubin/table_2.xlsx
2013_Zhang_Rubin/table_3.xlsx
2013_Zhang_Rubin/table_4A.xlsx
2013_Zhang_Rubin/table_4B.xlsx
2013_Zhang_Rubin/table_4C.xlsx


  df_tn[target_log2FC_col] = np.log2(df_tn[log2FC_col])


2013_Zhang_Rubin/table_4D.xlsx
2016_Korte_Kalscheuer/table_1.xlsx
2017_Mishra_Sassetti/table_1A.xlsx
2017_Mishra_Sassetti/table_1B.xlsx
2017_Mishra_Sassetti/table_1C.xlsx
2017_Mishra_Sassetti/table_1D.xlsx
2018_Rittershaus_Sassetti/table_2A.xlsx
2018_Rittershaus_Sassetti/table_2B.xlsx


In [19]:
df_tn_qval_log2FC_ALL.shape

(3990, 40)

## Second set of cases: 

These are SI datafiles and tables that have, for all genes in H37Rv, a column for q-vals and ratios.

Load spreadsheet with file-name-to-column mappings:

In [21]:
# Read the file-to-column mapping table for the second set of SI files.
col_name_file = os.path.join( data_path, 'files_and_columns_set2.csv') 
df_col_info_set2 = pd.read_csv(col_name_file) 
df_col_info_set2.shape

(39, 5)

Get list of files (the last two are Michael's and in a different format): 

In [26]:
# Keep every listed file except the last two, which are in a different format.
file_list = df_col_info_set2.file.values[:-2]

In [27]:
# Process the set-2 files and rebind the running table to the helper's result.
# NOTE(review): get_qval_log2FC_func is not defined in this notebook; presumably it
# merges each file's q-value / log2FC columns into the table passed in — verify.
df_tn_qval_log2FC_ALL = get_qval_log2FC_func(file_list, df_col_info_set2, df_WG, df_tn_qval_log2FC_ALL)

2012_Zhang_Rubin/table_4_sheet_600bp.xlsx
2015_Kieser_Rubin/table_1A_edited.xlsx
2015_Kieser_Rubin/table_1B_edited.xlsx
2015_Kieser_Rubin/table_1C_edited.xlsx
2015_Mendum_Stewart/table_1A.xlsx
2016_Nambi_Sassetti/table_1.xlsx
2017_Xu_Ehrt/table_3A.xlsx
2017_Xu_Ehrt/table_3B.xlsx
2017_Xu_Ehrt/table_3C.xlsx
2017_Xu_Ehrt/table_3D.xlsx
2017_Xu_Ehrt/table_3E.xlsx
2018_Carey_Fortune/table_1A.xlsx
2018_Carey_Fortune/table_1B.xlsx
2018_Carey_Fortune/table_1C.xlsx
2018_Carey_Fortune/table_1D.xlsx
2018_Carey_Fortune/table_1E.xlsx
2018_Carey_Fortune/table_1F.xlsx
2018_Carey_Fortune/table_1G.xlsx
2018_Carey_Fortune/table_1H.xlsx
2017B_DeJesus_Iorger/table_1A.xlsx
2017B_DeJesus_Iorger/table_1B.xlsx
2017B_DeJesus_Iorger/table_1C.xlsx
FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv1432_day32_TTR.xlsx
FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv2680_day32_s10000_pc0.00.xlsx
FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv1096_day0_s10000_pc0.00.xlsx
FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_WT_marP_s10

In [161]:
df_tn_qval_log2FC_ALL.shape

(3990, 114)

In [28]:
# Write the combined q-value / log2FC table to an Excel workbook (row index omitted).
out_file = os.path.join(data_path, 'Tn_library_DB_qval_log2FC.xlsx')
df_tn_qval_log2FC_ALL.to_excel(out_file, index=False)

In [29]:
df_tn_qval_log2FC_ALL.shape

(3990, 114)

### [PENDING] 

### DeJesus and Iorger (2013)

In [38]:
# Peek at the DeJesus & Iorger (2013) table; not yet merged into the main table.
# NOTE(review): `dir_name` is not assigned in any visible cell of this notebook,
# while the other dataset reads use `tn_data_path` — confirm which directory is intended.
file = '2013_DeJesus_Iorger/table_1.xls'
col_name = '2013_DeJesus'
df_tn = pd.read_excel(os.path.join(dir_name, file))
df_tn.head(2)

Unnamed: 0,Rv_ID,Name,Insertions,# TA Sites,Length of Maximum Run,Span of Nucleotides,Sassetti-03,Post. Prob. Of Ess.,Call
0,Rv0001,dnaA,1,32,31,1365,1,1.0,E
1,Rv0002,dnaN,0,31,31,1167,-1,1.0,E


### DeJesus, Iorger (2017A)

In [138]:
# Rebinds file_list to the single DeJesus & Iorger (2017A) table.
file_list = ['2017A_DeJesus_Iorger/table_1.xlsx']
# TODO: talk about this dataset with Michael before processing it.

____________
____________
____________
# OTHER stuff 

### Kieser Rubin (2015 data) - tables A, B, C

pre-formatting: 

In [109]:
# file_list = ['2015_Kieser_Rubin/table_1A.xlsx', 
#              '2015_Kieser_Rubin/table_1B.xlsx',
#              '2015_Kieser_Rubin/table_1C.xlsx']

# for file in file_list:
#     # read the data. 
#     df_tn = pd.read_excel(os.path.join(dir_name, file))

#     # this fixes the Rv_IDs for this particular dataset. 
#     df_tn = df_tn[~df_tn.Rv_ID.str.contains('IG')]
#     rv_id_list = [rv_id.split('_')[-1].strip('\'') for rv_id in df_tn['Rv_ID'].values]
#     df_tn['Rv_ID'] = rv_id_list

#     # write edited file:
#     outfile = file.split('.xlsx')[0]+'_edited.xlsx'
#     df_tn.to_excel(os.path.join(dir_name, outfile), index=False)

### FLUTE datasets pre-processing:

In [149]:
# Build the path of every directory entry under the FLUTE 'original' folder.
flute_path = 'Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/original'
file_list_flute = [os.path.join(flute_path, file) for file in os.listdir(flute_path)]

In [150]:
# Drop the first six lines of each original FLUTE file and save the remainder
# as a .tsv in the parent FLUTE_log2fc directory.
flute_out_dir = 'Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/'

for file in file_list_flute:
    # os.path.basename honours whichever separator os.path.join produced,
    # unlike splitting on a literal '/'.
    file_out = os.path.basename(file).split('.dat')[0] + '.tsv'
    file_out = os.path.join(flute_out_dir, file_out)

    # Read the whole input first so it is closed before the output is opened.
    with open(file, 'r') as fin:
        lines_new = fin.read().split('\n')[6:]

    with open(file_out, 'w') as fout:
        fout.write('\n'.join(lines_new))

In [151]:
# List only the .tsv resampling files.  The .xlsx workbooks produced from them
# are written into this same directory and also contain 'resampling' in their
# names, so without the suffix check a re-run would pass them to pd.read_csv.
flute_path = 'Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/'
tsv_list_flute = [
    os.path.join(flute_path, file)
    for file in os.listdir(flute_path)
    if 'resampling' in file and file.endswith('.tsv')
]

In [152]:
# Convert each FLUTE .tsv into an .xlsx workbook, renaming the columns the
# rest of the pipeline reads (Rv_ID, gene_name, Adj. p-value, log2FC).
flute_xlsx_dir = 'Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/'
flute_column_map = {
    '#Orf': 'Rv_ID',
    'Name': 'gene_name',
    'p-adj': 'Adj. p-value',
    'log2 FC': 'log2FC',
}

for file in tsv_list_flute:
    print(file)
    # os.path.basename is separator-agnostic, unlike splitting on a literal '/'.
    file_out = os.path.basename(file).split('.tsv')[0] + '.xlsx'
    file_out = os.path.join(flute_xlsx_dir, file_out)

    df_flute = pd.read_csv(file, sep='\t').rename(columns=flute_column_map)
    df_flute.to_excel(file_out, index=False)

Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_WT_marP_s10000_pc0.00_1.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv0950_day0_s10000_pc0.00.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv3916c_day0_s10000_pc0.00.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv0307c_day0_s10000_pc0.00.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_vs_delta_Rv3717_TTR.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_glycerol_ponA1KO_s10000_pc0.00.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv1565c_day32_TTR.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv3684_day0_s10000_pc0.00.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_WT_Rv0954_s10000_pc0.00.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_vs_delta_Rv3811_TTR.tsv
Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv1096_day0_s10000_pc0.00

In [153]:
# Path to one FLUTE .xlsx workbook, used below as a spot check.
f_temp = 'Tn_library_studies/FLUTE_KO_TnSeq/FLUTE_log2fc/resampling_H37Rv_Rv1432_day32_TTR.xlsx'

In [154]:
# Load the spot-check workbook so its columns can be inspected.
df = pd.read_excel(f_temp)

In [155]:
df.head()

Unnamed: 0,Rv_ID,gene_name,Description,N,TAs Hit,Sum Rd 1,Sum Rd 2,Delta Rd,log2FC,p-value,Adj. p-value
0,Rv0001,dnaA,chromosomal replication initiation protein,31,1,0.2,0.0,-0.2,3.83,1.0,1.0
1,Rv0002,dnaN,DNA polymerase III subunit beta,31,0,0.0,0.0,0.0,0.0,1.0,1.0
2,Rv0003,recF,recombination protein F,35,9,19.1,54.0,34.9,1.5,0.3056,0.69974
3,Rv0004,-,hypothetical protein Rv0004,7,0,0.0,0.0,0.0,0.0,1.0,1.0
4,Rv0005,gyrB,DNA gyrase subunit B,42,3,35.2,1.0,-34.2,-5.08,0.1396,0.46795
