## Import modules:

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns

## Function definitions: 

In [3]:
def essentiality_calls(df, gene_id):
    """Return the columns in which a given gene is called essential.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'Rv_ID' column. Every column from the third one
        onward is treated as a call column; the first two are skipped.
    gene_id : str
        Value to look up in the 'Rv_ID' column.

    Returns
    -------
    list
        Names of the call columns whose value equals 1 for the gene, in
        the order they appear in ``df``. If ``gene_id`` matches several
        rows only the first is used. An empty list is returned when
        ``gene_id`` is not present in ``df``.
    """
    # Select the gene's row once instead of re-filtering the whole frame
    # for every column.
    gene_rows = df.loc[df['Rv_ID'] == gene_id]
    if gene_rows.empty:
        # Unknown gene: report "no calls" rather than failing with IndexError.
        return []

    # First matching row, restricted to the call columns (position 2 onward).
    calls = gene_rows.iloc[0, 2:]
    return [col for col, value in calls.items() if value == 1]

## Load datasets:

Screen condition descriptions:

In [4]:
# Data directory, given relative to the notebook's working directory.
dir_data = '../../data'
# CSV of column descriptors; its column_ID field is matched against the
# per-gene hit columns in the query loop further down.
column_file = os.path.join(dir_data, 'column_descriptors.csv')
df_column = pd.read_csv(column_file)

TnSeq matrix:

In [5]:
# Excel workbook holding the TnSeq matrix; read_excel loads the first sheet by default.
tn_file = os.path.join(dir_data, 'Tn_library_DB.xlsx')
df_tn = pd.read_excel(tn_file)
# Display (rows, columns) as a quick check that the table loaded.
df_tn.shape

(3990, 60)

## Read in list of genes to query:

File names: 

In [6]:
# Alternative input:
# gene_list_file = '../../data/other_data/orphan_redox_wet_lab.xlsx'
gene_list_file = '../../data/other_data/SDR_mtb.xlsx'

# Name the output workbook after the input one. Only the base name (no
# directory, no extension) is used: gene_list_file already contains the data
# directory, and joining the full path onto dir_data below would repeat it.
xls_name = os.path.splitext(os.path.basename(gene_list_file))[0] + '_TnMat.xlsx'

# Create a Pandas Excel output writer:
excel_output = os.path.join(dir_data, 'other_data', xls_name)
writer = pd.ExcelWriter(excel_output, engine='xlsxwriter')

Option 1 - read the gene list from an Excel file:

In [7]:
# gene_list_file may already carry a directory prefix, so join only its base
# name onto dir_data/other_data; joining the full value would put the data
# directory into the path twice.
gene_list_path = os.path.join(dir_data, 'other_data', os.path.basename(gene_list_file))
df_gene_list = pd.read_excel(gene_list_path)
# Gene identifiers to query, taken from the Rv_ID column as a NumPy array.
gene_list = df_gene_list.Rv_ID.values

Option 2 - use a Python list (alternative to the Excel file above; run only one of the two cells):

In [7]:
# Hard-coded gene list, as an alternative to loading one from a workbook.
# NOTE(review): running this cell overwrites any gene_list, xls_name,
# excel_output and writer defined earlier; a writer created earlier is not
# closed here.
gene_list = ['Rv1139c']

# Create a Pandas Excel output writer:
xls_name = 'Rv1139c.xlsx'
excel_output = os.path.join(dir_data, 'other_data', xls_name)
writer = pd.ExcelWriter(excel_output, engine='xlsxwriter')

## Query genes:

Output results to Excel sheets (one sheet per queried gene):

In [8]:
# When True, the query loop below writes one sheet per gene to `writer` and then saves the workbook.
save_to_xls = True

In [9]:
# Map each queried gene to the list of TnSeq matrix columns in which it is
# called essential.
essential_dict = {}
for gene in gene_list:
    print(gene)
    hits = essentiality_calls(df_tn, gene)
    # Descriptor rows for the columns that flagged this gene.
    df_temp = df_column[df_column.column_ID.isin(hits)]
    essential_dict[gene] = hits
    if save_to_xls:
        # One worksheet per gene, named after the gene.
        df_temp.to_excel(writer, sheet_name=gene, index=False)

if save_to_xls:
    # ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
    # close() writes the workbook and releases the file handle on both old
    # and new pandas versions.
    writer.close()

Rv3791
Rv1483
Rv2002
Rv2214c
Rv3085
Rv0148
Rv0945
Rv0687
Rv0927c
Rv2073c
Rv2750
Rv1350
Rv1144
Rv0112
Rv0242c
Rv3502c
Rv1928c
Rv0765c
Rv1941
Rv1865c
Rv3485c
Rv1245c
Rv1544
Rv2766c
Rv2509
Rv3548c
Rv3549c
Rv1882c
Rv0851c
Rv3057c
Rv1714
Rv0484c
Rv1543
Rv0769
Rv3559c
Rv0303
Rv0068
Rv2263
Rv1050
Rv3391
Rv3174
Rv0439c
Rv3530c
Rv2857c
Rv3224
Rv2129c
Rv0547c


### Summarize the calls as one row per gene (a simple format in which to save the dataframe):

In [10]:
# Per-gene summary: gene ID, number of hit columns, and the hit columns
# themselves, ordered from most to fewest hits.
df_by_genes = pd.DataFrame({
    'Rv_ID': list(essential_dict.keys()),
    'TnSeq_calls': list(essential_dict.values()),
})
df_by_genes['num_TnSeq'] = df_by_genes['TnSeq_calls'].map(len)
df_by_genes = df_by_genes.sort_values(by='num_TnSeq', ascending=False)

# Put the count ahead of the (potentially long) list column.
df_by_genes = df_by_genes[['Rv_ID', 'num_TnSeq', 'TnSeq_calls']]

In [11]:
df_by_genes.sample(5)

Unnamed: 0,Rv_ID,num_TnSeq,TnSeq_calls
37,Rv2263,1,[Rv3916c]
35,Rv0303,1,[Rv3005c]
41,Rv0439c,0,[]
4,Rv3085,0,[]
20,Rv3485c,1,[Rv3005c]


In [12]:
# Write the per-gene table to <dir_data>/other_data; index=False leaves out the row index.
# NOTE(review): the file name is fixed, so it does not change with the gene list that was queried.
xls_out_name = 'SDR_Mtb_TnSeq_by_genes.xlsx'
excel_output = os.path.join(dir_data, 'other_data', xls_out_name)
df_by_genes.to_excel(excel_output, index = False)


## For a list of genes, build the inverse mapping: TnSeq screen --> genes called essential in it

In [13]:
# Unique, sorted names of every column that flagged at least one queried gene.
TnSeq_screens = sorted(
    {screen for hit_list in essential_dict.values() for screen in hit_list}
)

In [14]:
# Invert essential_dict: for each screen, the sorted list of queried genes
# whose hit list contains that screen.
TnSeq_dict = {
    screen: sorted(
        gene for gene, hit_list in essential_dict.items() if screen in hit_list
    )
    for screen in TnSeq_screens
}

# Per-screen summary, ordered from most to fewest genes.
df_TnSeq = pd.DataFrame({
    'TnSeq_screen': list(TnSeq_dict.keys()),
    'genes': list(TnSeq_dict.values()),
})
df_TnSeq['num_genes'] = df_TnSeq['genes'].map(len)
df_TnSeq = df_TnSeq.sort_values(by='num_genes', ascending=False)

# Put the count ahead of the list column.
df_TnSeq = df_TnSeq[['TnSeq_screen', 'num_genes', 'genes']]


In [15]:
# Write the per-screen table to <dir_data>/other_data; index=False leaves out the row index.
# NOTE(review): the file name is fixed, so it does not change with the gene list that was queried.
xls_out_name = 'SDR_Mtb_genes_by_TnSeq.xlsx'
excel_output = os.path.join(dir_data, 'other_data', xls_out_name)

df_TnSeq.to_excel(excel_output, index = False)
