# Pre-process TnSeq data for classification learning

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd

### These were generated using: 

code/data_wrangling/Clare_take1.ipynb

Choose data source / version:

In [2]:
# Select which release of the result matrices to process by leaving exactly
# one assignment uncommented. The chosen value decides both the input file
# names and the output file names used in the cells below.
# version = 'old'
# version = 'clare_all'
version = 'clare_basis'

In [3]:
# Input file names (log-fold-change matrix, q-value matrix) per data version.
input_files = {
    'old': ('result_logfc_matrix_2020_08_27.csv',
            'result_qval_matrix_2020_08_27.csv'),
    'clare_all': ('result_logfc_matrix_2021_08_27.csv',
                  'result_qval_matrix_2021_08_27.csv'),
    'clare_basis': ('result_logfc_matrix_2021_08_27_BASIS.csv',
                    'result_qval_matrix_2021_08_27_BASIS.csv'),
}

# Fail early with a clear message on an unrecognised version, rather than
# hitting a NameError on fn_lfc when the files are read.
if version not in input_files:
    raise ValueError(
        f"Unknown version {version!r}; expected one of {sorted(input_files)}")
fn_lfc, fn_qval = input_files[version]

# Both matrices live in the same folder, relative to the notebook directory.
path = '../data/standardized_data/'
lfc = pd.read_csv(os.path.join(path, fn_lfc))
qval = pd.read_csv(os.path.join(path, fn_qval))

### Drop missing data:

In [4]:
# Report the matrix dimensions before and after discarding every row that
# contains at least one missing value.
print(lfc.shape, qval.shape)
lfc, qval = (frame.dropna(axis='index') for frame in (lfc, qval))
print(lfc.shape, qval.shape)

(4055, 126) (4055, 126)
(3971, 126) (3971, 126)


### Load mycobrowser dataset: 

In [5]:
# The annotation spreadsheet sits in <parent of working directory>/data/annotations.
fn = 'Mycobacterium_tuberculosis_H37Rv_txt_v3.xlsx'
annotations_dir = pathlib.Path.cwd().parent / 'data' / 'annotations'
mcbwser = pd.read_excel(annotations_dir / fn)
# Show the first row as a quick look at the available columns.
mcbwser.head(1)

Unnamed: 0,Refseq_ID,Feature,Rv_ID,Name,Function,Product,Comments,Functional_Category
0,NC_000962.3,CDS,Rv3728,Rv3728,"Unknown, but seems involved in efflux system (...",Probable conserved two-domain membrane protein,"Rv3728, (MTV025.076), len: 1065 aa. Probable c...",cell wall and cell processes


Drop duplicate Rv_ID entries (the first occurrence of each Rv_ID is kept):

In [6]:
# Keep only the first annotation row for each Rv_ID.
is_repeated_id = mcbwser.duplicated(subset=['Rv_ID'], keep='first')
mcbwser = mcbwser[~is_repeated_id]

Merge the Functional_Category annotation into both matrices using the Rv_ID:

In [7]:
# Only the gene identifier and its functional category are needed from the
# annotation table; a left join keeps every row of the data matrices.
mb_categories = mcbwser[['Rv_ID', 'Functional_Category']]
lfc_mb = lfc.merge(mb_categories, how='left', on='Rv_ID')
qval_mb = qval.merge(mb_categories, how='left', on='Rv_ID')

### Filter out orphan gene categories: 

In [8]:
# Functional categories treated as orphans and removed from the ML tables.
list_orphans = ['conserved hypotheticals', 'unknown']

# Rows without a category (NaN after the left merge) are not matched by isin
# and therefore stay in the filtered tables.
lfc_is_orphan = lfc_mb['Functional_Category'].isin(list_orphans)
qval_is_orphan = qval_mb['Functional_Category'].isin(list_orphans)
lfc_mb_filt = lfc_mb.loc[~lfc_is_orphan]
qval_mb_filt = qval_mb.loc[~qval_is_orphan]

Write to file: 

In [9]:
# Output file names (filtered lfc table, filtered q-value table) per data version.
output_files = {
    'old': ('lfc_mb_filt_OLD.csv', 'qval_mb_filt_OLD.csv'),
    'clare_all': ('lfc_mb_filt.csv', 'qval_mb_filt.csv'),
    'clare_basis': ('lfc_mb_filt_BASIS.csv', 'qval_mb_filt_BASIS.csv'),
}

# Fail early with a clear message on an unrecognised version, rather than
# leaving fn_out_lfc / fn_out_qval undefined for the cell that writes the files.
if version not in output_files:
    raise ValueError(
        f"Unknown version {version!r}; expected one of {sorted(output_files)}")
fn_out_lfc, fn_out_qval = output_files[version]

In [10]:
path_out = '../data/standardized_data/cleaned_ML/'

# Create the output folder when it is missing (e.g. on a fresh checkout) so
# that to_csv does not fail on a non-existent directory.
os.makedirs(path_out, exist_ok=True)

# Write both filtered tables without the pandas row index.
lfc_mb_filt.to_csv(os.path.join(path_out, fn_out_lfc), index=False)
qval_mb_filt.to_csv(os.path.join(path_out, fn_out_qval), index=False)

# OTHER: 

### Binarized version of the datasets (1 where |lfc| ≥ 1 and q-value ≤ 0.05, otherwise 0):

In [88]:
# Every column except the gene identifier and its annotation holds measurements.
meta_cols = ('Rv_ID', 'Functional_Category')
data_cols = [col for col in qval_mb.columns if col not in meta_cols]

In [89]:
# Boolean masks built with vectorised comparisons instead of the deprecated,
# element-wise DataFrame.applymap. Missing values compare as False, exactly
# as they did with the original lambdas.
lfc_values = lfc_mb[data_cols]
# True where the log-fold-change is at least 1 in either direction.
bin_matrix_lfc = (lfc_values >= 1) | (lfc_values <= -1)
# True where the q-value is at most 0.05.
bin_matrix_qval = qval_mb[data_cols] <= 0.05

In [90]:
# An entry is 1 only when both the lfc mask and the q-value mask are True.
bin_matrix = (bin_matrix_lfc & bin_matrix_qval).astype(int)

In [91]:
# bin_matrix combines flags derived from lfc_mb and qval_mb row by row, but
# the labels below come from qval_mb only. Check that both tables list the
# same genes in the same order so rows are never silently mislabelled.
if not lfc_mb['Rv_ID'].equals(qval_mb['Rv_ID']):
    raise ValueError(
        'lfc_mb and qval_mb are not aligned on Rv_ID; cannot label bin_matrix')

# Attach the gene identifier and its functional category to the binary matrix.
bin_matrix['Rv_ID'] = qval_mb['Rv_ID']
bin_matrix['Functional_Category'] = qval_mb['Functional_Category']

In [92]:
# Move the gene identifier to the first column; all other columns keep their order.
other_cols = [col for col in bin_matrix.columns if col != 'Rv_ID']
bin_matrix = bin_matrix[['Rv_ID', *other_cols]]

In [93]:
# Save the full binary matrix (orphan categories included) without the row index.
bin_out_path = '../data/standardized_data/cleaned_ML/bin_mb.csv'
bin_matrix.to_csv(bin_out_path, index=False)

In [94]:
# Remove genes whose functional category is one of the orphan categories.
orphan_categories = ['conserved hypotheticals', 'unknown']
is_orphan = bin_matrix['Functional_Category'].isin(orphan_categories)
bin_matrix_filt = bin_matrix.loc[~is_orphan]

In [95]:
# Save the orphan-filtered binary matrix without the row index.
bin_filt_out_path = '../data/standardized_data/cleaned_ML/bin_mb_filt.csv'
bin_matrix_filt.to_csv(bin_filt_out_path, index=False)