## Import modules:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns

## Function definitions: 

In [2]:
def essentiality_calls(df, gene_id):
    """Return the columns in which a given gene is called essential.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'Rv_ID' column. Every column from the third onward is
        checked; the first two are skipped (presumably gene identifier/name
        columns -- confirm against the input file).
    gene_id : str
        Value to look up in df['Rv_ID'].

    Returns
    -------
    list
        Labels of the checked columns whose value for the gene equals 1.
        If the gene occurs in several rows, only the first row is used.

    Raises
    ------
    IndexError
        If gene_id does not occur in df['Rv_ID'].
    """
    cols = df.columns[2:]

    # Select the gene's rows once instead of re-filtering the frame per column.
    gene_rows = df[df['Rv_ID'] == gene_id]
    if gene_rows.empty:
        # Same exception type the unguarded lookup produced, but naming the gene.
        raise IndexError("gene {!r} not found in df['Rv_ID']".format(gene_id))

    # Positional slice so the values line up one-to-one with `cols`.
    first_row = gene_rows.iloc[0, 2:]
    return [col for col, call in zip(cols, first_row) if call == 1]

## Load datasets:

Screen condition descriptions:

In [3]:
# Root data directory, given relative to the notebook's working directory.
dir_data = '../../data'
# CSV loaded into df_column; the query loop later filters this frame on its
# `column_ID` column to describe the screens in which a gene is a hit.
column_file = os.path.join(dir_data, 'column_descriptors.csv')
df_column = pd.read_csv(column_file)

TnSeq matrix:

In [4]:
# Excel workbook loaded into df_tn. essentiality_calls() looks genes up through
# its `Rv_ID` column and checks every column after the first two.
tn_file = os.path.join(dir_data, 'Tn_library_DB.xlsx')
df_tn = pd.read_excel(tn_file)
# Show (rows, columns) as a quick sanity check of the load.
df_tn.shape

(3990, 60)

## Read in list of genes to query:

File names: 

In [19]:
# File name only: both the input and the output live in <dir_data>/other_data,
# and that directory is added by os.path.join wherever the name is used.
# (Previously the name carried the '../../data/other_data/' prefix itself, so
# the directory was joined twice and resolved only by coincidence of depth.)
gene_list_file = 'orphan_redox_wet_lab.xlsx'
# Output workbook: input stem plus a '_TnMat' suffix.
xls_name = os.path.splitext(gene_list_file)[0] + '_TnMat.xlsx'

# Create a Pandas Excel output writer:
excel_output = os.path.join(dir_data, 'other_data', xls_name)
writer = pd.ExcelWriter(excel_output, engine='xlsxwriter')

Option 1 — read the gene list from the `Rv_ID` column of an Excel file:

In [21]:
# gene_list_file may already carry a directory prefix; keep only its base name
# so that <dir_data>/other_data is not joined onto the path a second time.
gene_list_path = os.path.join(dir_data, 'other_data', os.path.basename(gene_list_file))
df_gene_list = pd.read_excel(gene_list_path)
# Genes to query, taken from the workbook's Rv_ID column.
gene_list = df_gene_list.Rv_ID.values

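Option 2 — use a Python list instead (run this cell in place of the Excel-file cells above; it overwrites `gene_list` and `writer`):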

In [7]:
# Genes to query, given directly as a list (here a single gene).
gene_list = ['Rv1139c']

# Create a Pandas Excel output writer:
# Rebinds xls_name, excel_output and writer, so subsequent cells write to this
# workbook rather than to any writer created earlier.
xls_name = 'Rv1139c.xlsx'
excel_output = os.path.join(dir_data, 'other_data', xls_name)
writer = pd.ExcelWriter(excel_output, engine='xlsxwriter')

## Query genes:

Output the results to the Excel workbook, one sheet per gene:

In [23]:
# When True, the query loop writes one sheet per gene to `writer` and then
# saves the workbook; when False it only collects the hits in memory.
save_to_xls = True

In [24]:
# Maps each queried gene to the list of screen columns in which it is called
# essential (as returned by essentiality_calls).
essential_dict = {}
for gene in gene_list:
    print(gene)
    hits = essentiality_calls(df_tn, gene)
    # Descriptor rows for just the screens in which this gene is a hit.
    df_temp = df_column[df_column.column_ID.isin(hits)]
    essential_dict[gene] = hits
    if save_to_xls:
        # One worksheet per gene, named after the gene.
        df_temp.to_excel(writer, sheet_name=gene, index=False)

if save_to_xls:
    # close() writes the workbook to disk and releases the file handle.
    # ExcelWriter.save() was removed in pandas 2.0; close() works on old and
    # new versions alike.
    writer.close()

Rv0385
Rv0697
Rv1144
Rv1432
Rv1544
Rv2047c
Rv2509
Rv2857c
Rv3057c
Rv3170
Rv3230c
Rv3502c
Rv3520c
Rv3548c
Rv1714
Rv0926c
Rv3559c
Rv3719


### Save a simple per-gene summary table (gene, number of TnSeq screens calling it essential, and the list of those screens):

In [42]:
# One row per queried gene: its ID and the list of screens calling it essential.
df_by_genes = pd.DataFrame({
    'Rv_ID': list(essential_dict.keys()),
    'TnSeq_calls': list(essential_dict.values()),
})
# Number of screens per gene, i.e. the length of each list of calls.
df_by_genes['num_TnSeq'] = [len(calls) for calls in df_by_genes['TnSeq_calls']]

# Genes with the most calls first; the count is placed ahead of the screen list.
df_by_genes = df_by_genes.sort_values(by='num_TnSeq', ascending=False)[
    ['Rv_ID', 'num_TnSeq', 'TnSeq_calls']
]

In [43]:
# Preview up to five random rows. Capping n at the frame length avoids the
# ValueError that sample(5) raises when fewer than five genes were queried
# (e.g. a one-gene list); the fixed random_state keeps the preview stable
# across re-runs.
df_by_genes.sample(min(5, len(df_by_genes)), random_state=0)

Unnamed: 0,Rv_ID,num_TnSeq,TnSeq_calls
302,Rv1360,0,[]
193,Rv3359,2,"[2015_Mendum, Rv0307c]"
81,Rv1512,8,"[2013_DeJesus, 2013_Zhang_1, 2013_Zhang_2, 201..."
17,Rv0468,2,"[2013_Zhang_1, Rv3916c]"
208,Rv3093c,0,[]


In [44]:
# Write the per-gene summary table to <dir_data>/other_data, without the index
# column. Note: this rebinds excel_output, which earlier cells used for the
# per-gene-sheet workbook path.
xls_out_name = 'redox_all_TnSeq_by_genes.xlsx'
excel_output = os.path.join(dir_data, 'other_data', xls_out_name)
df_by_genes.to_excel(excel_output, index = False)


## For the queried genes, build the inverse mapping: TnSeq screen → genes called essential in that screen

In [27]:
# Sorted list of the distinct screens in which at least one queried gene was
# called essential.
TnSeq_screens = sorted({
    screen
    for hits in essential_dict.values()
    for screen in hits
})

In [28]:
# Invert essential_dict: for every screen, the alphabetically sorted genes
# whose list of calls contains that screen.
TnSeq_dict = {
    screen: sorted(gene for gene, hits in essential_dict.items() if screen in hits)
    for screen in TnSeq_screens
}

# One row per screen: its name and the list of genes called essential in it.
df_TnSeq = pd.DataFrame({
    'TnSeq_screen': list(TnSeq_dict.keys()),
    'genes': list(TnSeq_dict.values()),
})
# Number of genes per screen, i.e. the length of each gene list.
df_TnSeq['num_genes'] = [len(gene_group) for gene_group in df_TnSeq['genes']]

# Screens with the most genes first; the count is placed ahead of the gene list.
df_TnSeq = df_TnSeq.sort_values(by='num_genes', ascending=False)[
    ['TnSeq_screen', 'num_genes', 'genes']
]


In [29]:
# Write the per-screen table to <dir_data>/other_data, without the index column.
xls_out_name = 'redox_orphan_genes_by_TnSeq.xlsx'
excel_output = os.path.join(dir_data, 'other_data', xls_out_name)

df_TnSeq.to_excel(excel_output, index = False)
