In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from random import random
import seaborn as sns
import numpy as np

### Function definitions here: 

In [None]:
def unknown_essential_xy(TnSeq_screen, df_data, df_uk, rand_param = 0.85):
    """Build jittered scatter-plot data for a single TnSeq screen.

    The columns of ``df_data`` whose name contains ``TnSeq_screen`` are
    selected, the screen's q-values are discretized row by row with
    ``discretize_q_values`` (not defined in this part of the notebook), and
    the result is inner-merged with ``df_uk`` on ``Rv_ID`` / ``gene_name``.
    Uniform jitter is then added to both coordinates so that points sharing
    the same discrete values do not overlap when plotted.

    Parameters
    ----------
    TnSeq_screen : str
        Substring used to pick the screen's columns; every column of
        ``df_data`` whose name contains it is kept.
    df_data : pandas.DataFrame
        Must contain 'Rv_ID', 'gene_name' and, for the requested screen, at
        least one column whose name contains 'q_val'.
    df_uk : pandas.DataFrame
        Must contain 'Rv_ID', 'gene_name' and 'UK_score_4'.
    rand_param : float, optional
        Total width of the jitter; each coordinate is shifted by a uniform
        draw from [-rand_param/2, rand_param/2).

    Returns
    -------
    uk_rd : numpy.ndarray
        Jittered 'UK_score_4' values (x coordinates).
    q_rd : numpy.ndarray
        Jittered discretized q-values (y coordinates).
    color_list : numpy.ndarray
        Array of shape (n_genes, 3) with one RGB colour per gene.
    rv_ids : numpy.ndarray
        'Rv_ID' values of the merged genes, aligned with the other outputs.

    Raises
    ------
    IndexError
        If none of the selected columns contains 'q_val'.
    """

    # Grab data for a single TnSeq screen
    cols = [col for col in df_data.columns if TnSeq_screen in col]
    df_data_test = df_data[['Rv_ID', 'gene_name'] + cols].copy()

    # Discretize q-values (only the first matching q-value column is used):
    q_val_cols = [col for col in df_data_test.columns if 'q_val' in col]
    if not q_val_cols:
        raise IndexError(
            "no 'q_val' column found for TnSeq screen {!r}".format(TnSeq_screen)
        )
    col_q_val = q_val_cols[0]
    df_data_test['q_val_D'] = df_data_test.apply(discretize_q_values, 1, args=[col_q_val])

    # Merge with unknowns:
    df_vis = df_data_test.merge(df_uk, on = ['Rv_ID', 'gene_name'], how = 'inner')

    # Get x-y datasets:
    rv_ids = df_vis.Rv_ID.values
    uk_list = df_vis.UK_score_4.to_numpy()
    q_list = df_vis.q_val_D.to_numpy()

    def _jitter(values):
        # Shift every value by an independent uniform draw centred on zero.
        return np.array([v + rand_param*random()-rand_param/2 for v in values])

    # randomize (x values are drawn first, then y values):
    uk_rd = _jitter(uk_list)
    q_rd = _jitter(q_list)

    # color the unknown-essentials differently:
    current_palette = sns.color_palette()
    # all genes are gray by default; the explicit (n, 3) shape keeps the
    # array well-formed even when the merge returns no rows.
    color_list = np.full((df_vis.shape[0], 3), 0.85)
    # Unknown essentials in a different color. A boolean mask is positional,
    # so it does not depend on df_vis carrying a default 0..n-1 index.
    # NOTE(review): 3 and 4 are presumably the top q-value bin and the top
    # unknown score -- confirm against discretize_q_values / UK_score_4.
    is_uk_essential = ((df_vis.q_val_D == 3) & (df_vis.UK_score_4 == 4)).to_numpy()
    color_list[is_uk_essential] = current_palette[0]

    return uk_rd, q_rd, color_list, rv_ids

### Load annotation score data: 

In [4]:
# Read the annotation-score table and keep only the gene identifiers
# together with the UK_score_4 column.
file_uk = '../../data/annotations/unknown_essentials/unknown_ALL_levels_essential_scores.csv'
df_uk = pd.read_csv(file_uk)[['Rv_ID', 'gene_name', 'UK_score_4']]
df_uk.head()

Unnamed: 0,Rv_ID,gene_name,UK_score_4
0,Rv0001,dnaA,0.0
1,Rv3582c,ispD,1.0
2,Rv0510,hemC,0.0
3,Rv0509,hemA,0.0
4,Rv0500,proC,0.0


### Load log2FC and q-value datasets: 

In [5]:
# Paths of the two standardized matrices (both file names carry the
# 2019_12_10 date stamp).
file_log2FC = '../../data/standardized_data/result_logfc_matrix_2019_12_10.csv'
file_qval = '../../data/standardized_data/result_qval_matrix_2019_12_10.csv'

# Read each matrix into its own DataFrame.
df_log2FC = pd.read_csv(filepath_or_buffer=file_log2FC)
df_qval = pd.read_csv(filepath_or_buffer=file_qval)

In [6]:
# Preview the first two rows of the log2FC matrix.
df_log2FC.head(n=2)

Unnamed: 0.1,Unnamed: 0,PE35_KO_vs_mbio_H37Rv,PPE68_KO_vs_mbio_H37Rv,Rv0950c_KO_vs_CB_WT,Rv0954_KO_vs_RJ_WT,Rv1096_KO_vs_CB_WT,Rv3005c_KO_day32_vs_dejesus_H37Rv_day32,Rv3594_KO_vs_Rubin_FLUTE_WT,Rv3684_KO_vs_CB_WT,Rv3717_KO_vs_Rubin_FLUTE_WT,...,zhang_AA_Rescue_vs_zhang_in_vitro_control_Rescue,zhang_DETA-NO_pH_7.0_vs_zhang_pH_7.0_no_NO_control,zhang_Fe_1.5mM_vs_zhang_Fe_450uM,zhang_Trp_Rescue_vs_zhang_in_vitro_control_Rescue,zhang_Tyloxapol_pH_6.5_vs_zhang_Tyloxapol_pH_4.5,zhang_Tyloxapol_pH_6.5_vs_zhang_pcit_pH_4.5,zhang_mhcii_mouse_d10_vs_zhang_wt_mouse_d10,zhang_mhcii_mouse_d45_vs_zhang_wt_mouse_d45,zhang_wt_mouse_d10_vs_zhang_input_library,zhang_wt_mouse_d45_vs_zhang_input_library
0,Rv0001,-0.41,0.01,0.0,0.0,0.0,3.12,-0.06,0.0,-0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Rv0002,3.28,2.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Unit test for a single TnSeq screen: 