Testing different ways to interact with the GLS-based dataset and generate visualizations: 

In [114]:
import pandas as pd
import os
import numpy as np
from statsmodels.stats.multitest import fdrcorrection
import re

In [2]:
# Root folder for the GLS TnSeq files; the cells below both read their inputs
# from it and write their Excel outputs into it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_data = '/home/ajinich/Dropbox/KyuRhee/unknown_function/unknown_redox/data/GLS_TnSeq/'

## [Exercise 1]: Generate the equivalent of SI_Data-1 from the Nat. Genet. paper: 

Load Bonferroni-corrected p-values: 

In [9]:
# Build the gene-by-gene matrix `GLS_p` from the saved array, labelled on both
# axes with the gene IDs listed in the companion text file.
# (Presumably the array holds pairwise p-values — confirm against the file's origin.)
fn_p = 'GLS_p_AJ.npy'
# The original joined `fn`, a name no cell defines; the intended file is `fn_p`.
fn_p_path = os.path.join(path_data, fn_p)
fn_genes = 'genes_AJ.txt'
fn_genes_path = os.path.join(path_data, fn_genes)

# `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the single
# column afterwards yields the same Series of gene IDs.
genes = pd.read_csv(fn_genes_path, header=None).squeeze('columns')
# The original loaded `fn_path`, which is also undefined; use `fn_p_path`.
GLS_p = pd.DataFrame(np.load(fn_p_path), columns=genes, index=genes)


In [56]:
# Flatten the gene-by-gene matrix into one value per (row gene, column gene) pair.
all_pairs_p = GLS_p.stack()
row_gene = all_pairs_p.index.get_level_values(0)
col_gene = all_pairs_p.index.get_level_values(1)
# Keep each unordered pair once (row label < column label); this also drops
# the diagonal.
stacked_p = all_pairs_p[row_gene < col_gene]
# FDR-adjusted p-values: `fdrcorrection` returns (rejected, corrected), and only
# the corrected values are kept, indexed by gene pair.
_, fdr_values = fdrcorrection(stacked_p)
fdr = pd.Series(fdr_values, index=stacked_p.index)

Query fdr:

In [76]:
# Significance cut-off applied to the FDR-adjusted values.
th = 0.001
# Independent copy of the pairs at or below the cut-off.
is_significant = fdr <= th
fdr_th = fdr.loc[is_significant].copy()

In [63]:
# List every significant pair that involves the query gene, whichever of the
# two positions it occupies.
rvid = 'Rv2808'
[pair for pair in fdr_th.index if rvid in (pair[0], pair[1])]

[('Rv0554', 'Rv2808'),
 ('Rv2808', 'Rv2930'),
 ('Rv2808', 'Rv2932'),
 ('Rv2808', 'Rv2933'),
 ('Rv2808', 'Rv2934'),
 ('Rv2808', 'Rv2939'),
 ('Rv2808', 'Rv2941'),
 ('Rv2808', 'Rv3386')]

[WARNING]: This is probably not the right way to do it: for a given RvID_x, shows as pairs genes with RvID_y > RvID_x

How do I fix that? 

In [None]:
# Fetch the FDR values of all significant pairs that contain the query gene.
rvid = 'Rv2939'
rvid_pair = [pair for pair in fdr_th.index if rvid in (pair[0], pair[1])]
fdr_th[rvid_pair].tolist()

In [77]:
# Build three parallel lists describing, gene by gene, every significant
# interaction partner and the FDR value of that interaction.
#
# The pairs in `fdr_th` are grouped by gene in ONE pass; the original rescanned
# the whole pair index for every gene. Partners are appended in `fdr_th` order,
# so the resulting rows are in the same order as before.
partners_by_gene = {}
for (gene_a, gene_b), p_val in fdr_th.items():
    partners_by_gene.setdefault(gene_a, []).append((gene_b, float(p_val)))
    if gene_b != gene_a:  # a self-pair must not be listed twice
        partners_by_gene.setdefault(gene_b, []).append((gene_a, float(p_val)))

list_lead_gene = []
list_partner_gene = []
list_p_fdr = []
for rvid in genes.values.tolist():
    for i, (partner, p_val) in enumerate(partners_by_gene.get(rvid, [])):
        # The lead gene is written only on the first row of its group; the
        # following rows of the same group carry an empty string.
        list_lead_gene.append(rvid if i == 0 else '')
        list_partner_gene.append(partner)
        list_p_fdr.append(p_val)

In [78]:
# Assemble the interaction table from the three parallel lists, one row per
# (lead gene, partner gene) entry.
df_interact = pd.DataFrame({
    'lead_gene': list_lead_gene,
    'partner_gene': list_partner_gene,
    'p_value_FDR': list_p_fdr,
})


Adding more columns to this data: 

In [79]:
# Export the interaction table to an Excel file inside `path_data`
# (the DataFrame index is not written).
fn_out = 'test_SI_data_1_fdr.001.xlsx'
fn_out_path = os.path.join(path_data, fn_out)
df_interact.to_excel(fn_out_path, index=False)

## [Exercise 2]: Generate the equivalent of SI_Data-2 from the Nat. Genet. paper: 

In [106]:
# Load the module tables (one CSV per value in `list_d`), keep the four columns
# of interest and stack them into a single table.
cols = ['Cluster', 'Density', 'Size', 'Members']
list_d = [0.2, 0.5, 0.9]
list_df_cone = []
for d in list_d:
    fn = f'modules_d_{d}.csv'
    df_cone = pd.read_csv(os.path.join(path_data, fn))[cols]
    print(d, df_cone.shape)
    list_df_cone.append(df_cone)
# Concatenate once at the end instead of growing a DataFrame inside the loop
# (avoids repeated copying and concatenation with an empty frame).
df_cone_all = pd.concat(list_df_cone, axis=0, ignore_index=True)
# Renumber the clusters 1..N across all files so that IDs are unique.
df_cone_all['Cluster'] = range(1, df_cone_all.shape[0]+1)

0.2 (164, 4)
0.5 (850, 4)
0.9 (328, 4)


In [108]:
# Keep only the first cluster for each distinct `Members` string; cluster
# numbers are NOT reassigned afterwards, so gaps in `Cluster` are expected.
df_cone_all = df_cone_all.drop_duplicates(subset='Members', keep='first')

#### [PENDING]: You need to remove duplicate Clusters (different d-values)

Write to file: 

In [109]:
# Export the combined cluster table to an Excel file inside `path_data`
# (the DataFrame index is not written).
fn_out = 'test_SI_data_2_clustOne.xlsx'
fn_out_path = os.path.join(path_data, fn_out)
df_cone_all.to_excel(fn_out_path, index=False)

## [Exercise 3]: Given a query gene, return dataframe with genes in its cluster(s)

In [124]:
# Load the UniProt proteome table and key it by Rv ID.
fn_up_func = '../data/tests/uniprot-proteome_UP000001584.xlsx'
df_up_func = pd.read_excel(fn_up_func)
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
re_str = r'Rv\d\d\d\dc?'
# First Rv ID found in each 'Gene names' entry (NaN when there is none).
rv_ids = df_up_func['Gene names'].str.extract(f'({re_str})', expand=False)
if rv_ids.isna().any():
    # Fail with a readable message instead of an IndexError/TypeError raised
    # from inside a list comprehension.
    bad_rows = df_up_func.index[rv_ids.isna()].tolist()
    raise ValueError(f"No Rv ID found in 'Gene names' for rows: {bad_rows}")
df_up_func['Rv_ID'] = rv_ids
# The former 'annot_int' helper column is no longer computed: it was discarded
# by the column selection below and nothing read it.
cols = ['Rv_ID', 'Gene names', 'Protein names', 'Annotation', 'Function [CC]']
df_up_func = (
    df_up_func
    .sort_values(by='Rv_ID')[cols]
    .reset_index(drop=True)
)

In [126]:
# Preview the first three rows of the annotation table.
df_up_func.head(3)

Unnamed: 0,Rv_ID,Gene names,Protein names,Annotation,Function [CC]
0,Rv0001,dnaA Rv0001 MTV029.01,Chromosomal replication initiator protein DnaA,4 out of 5,FUNCTION: Plays an important role in the initi...
1,Rv0002,dnaN Rv0002 MTCY10H4.0 MTV029.02,Beta sliding clamp (Beta clamp) (Sliding clamp...,4 out of 5,FUNCTION: Confers DNA tethering and processivi...
2,Rv0003,recF Rv0003 MTCY10H4.01,DNA replication and repair protein RecF,2 out of 5,FUNCTION: The RecF protein is involved in DNA ...


In [130]:
# Example query gene.
query = 'Rv3502c'
# Clusters whose space-separated `Members` list contains the query as an exact
# token. Defined here so that the preview in the next cell works on a fresh
# kernel; no earlier cell creates `df_query`.
df_query = df_cone_all[
    [query in str(members).split() for members in df_cone_all.Members]
].copy()


In [134]:
# Show the first cluster row currently held in `df_query`.
df_query.head(1)

Unnamed: 0,Cluster,Density,Size,Members
6,7,0.1783,141,Rv0069c Rv0096 Rv0097 Rv0098 Rv0099 Rv0100 Rv0...


In [153]:
# For every annotated gene, write one workbook with one sheet per cluster the
# gene belongs to; each sheet lists the cluster members with their UniProt
# annotation.
path_out = os.path.join(path_data, 'Rv_ID_clusters')
# Make sure the output folder exists before any workbook is opened.
os.makedirs(path_out, exist_ok=True)

# Tokenise every cluster's member list once. Exact-token membership avoids the
# substring matches of `str.contains` (e.g. 'Rv1234' also matching 'Rv1234c').
member_sets = [
    set(members.split()) if isinstance(members, str) else set()
    for members in df_cone_all.Members
]

for query in df_up_func.Rv_ID.tolist():

    in_cluster = [query in members for members in member_sets]
    df_query = df_cone_all[in_cluster].copy()
    if df_query.empty:
        # No cluster contains this gene: skip it rather than open a writer that
        # would have no sheet to save.
        continue

    fn_out = query+'_clusters.xlsx'
    fn_out_path = os.path.join(path_out, fn_out)

    with pd.ExcelWriter(fn_out_path) as writer:

        for _, row in df_query.iterrows():
            list_members = row.Members.split()
            df_temp = pd.DataFrame({
                'Cluster': [row.Cluster]*len(list_members),
                'Rv_ID': list_members,
            })
            # Add UniProt annotation info (left join keeps members that have no
            # annotation row), sorted by the 'Annotation' text, descending.
            df_temp_func = (
                df_temp
                .merge(df_up_func, how='left', on='Rv_ID')
                .sort_values(by='Annotation', ascending=False)
                .reset_index(drop=True)
            )
            df_temp_func.to_excel(writer, sheet_name='cluster_'+str(row.Cluster), index=False)
