# Map patients to relapse networks

In [1]:
import os,importlib,numpy,json

import scipy,scipy.stats

import miner_20190520 as miner

import pandas
# Show up to 100 rows when a DataFrame is displayed.
pandas.options.display.max_rows=100
# Never truncate cell text. `None` is the supported "unlimited" value since pandas 1.0;
# the legacy -1 is deprecated there and rejected by newer releases, while pandas < 1.0
# only accepts integers, hence the fallback.
try:
    pandas.set_option('display.max_colwidth',None)
except ValueError:
    pandas.set_option('display.max_colwidth',-1)

import matplotlib,matplotlib.pyplot
# Global figure styling, applied in a single update of the shared rcParams mapping:
# grid/axes drawn below the data, Arial fonts with large labels, and pdf.fonttype 42.
matplotlib.rcParams.update({
    'axes.axisbelow':True,
    'font.size':18,
    'font.family':'Arial',
    'xtick.labelsize':20,
    'ytick.labelsize':20,
    'axes.labelsize':30,
    'pdf.fonttype':42,
})

ModuleNotFoundError: No module named 'miner_20190520'

# 0. User defined variables

In [None]:
# Input files (expression z-scores, regulon table, over-expressed members matrix) under one
# user's local checkout.
# NOTE(review): the following cell assigns the same three variables again, so when the notebook
# is run top to bottom these values are overwritten before they are used.
expression_data_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/data/expression/IA12Zscore.csv'
regulonDf_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/regulonDf.csv'
overExpressedMembersMatrix_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/overExpressedMembers.csv'

In [None]:
# Same three input files, located under a Google Drive File Stream mount.
# These assignments are the last ones made to the three variables before the data are read,
# so they are the paths actually used on a top-to-bottom run.
# NOTE(review): absolute, user-specific paths — adjust before running on another machine.
expression_data_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/data/expression/IA12Zscore.csv'
regulonDf_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/regulonDf.csv'
overExpressedMembersMatrix_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/overExpressedMembers.csv'

# 1. Read data

In [None]:
# Load the expression data. Later cells read the sample labels from expressionData.columns;
# conversionTable is not referenced again in the cells of this notebook.
# NOTE(review): the return contract of miner.preprocess is not visible here — confirm against the miner module.
expressionData, conversionTable = miner.preprocess(expression_data_file)

In [None]:
# Regulon table: first CSV column becomes the index, first row the header.
# It is later passed (as a copy) to miner.differentialActivity as `regulon_matrix`.
regulonDf = pandas.read_csv(regulonDf_file,index_col=0,header=0)

In [None]:
# Over-expressed members matrix: first CSV column becomes the index, first row the header.
# It is later passed (as a copy) to miner.differentialActivity as `reference_matrix`.
overExpressedMembersMatrix = pandas.read_csv(overExpressedMembersMatrix_file,index_col=0,header=0)

# 2. Analysis

## 2.1 Find relapse signatures using all baseline

In [None]:
# Define phenotypes of interest
# Keep only sample labels whose last '_'-separated field is 'BM'.
all_patients=[sample for sample in expressionData.columns if sample.split('_')[-1] == 'BM']

# Classify each sample by its third '_'-separated field:
# '1' -> baseline, anything else -> relapse ('2' is additionally a first relapse).
baseline_patients=[]
first_relapse_patients=[]
relapse_patients=[]
for sample in all_patients:
    visit=sample.split("_")[2]
    if visit == '1':
        baseline_patients.append(sample)
        continue
    relapse_patients.append(sample)
    if visit == '2':
        first_relapse_patients.append(sample)

# Relapse samples that are not first relapses (set difference, so order is arbitrary).
multiple_relapse_patients=list(set(relapse_patients)-set(first_relapse_patients))

phenotype1,phenotype2=baseline_patients,relapse_patients

print('Found {} background events.'.format(len(phenotype1)))
print(phenotype1[:10])
print('')
print('Found {} relapse events.'.format(len(phenotype2)))
print(phenotype2[:10])

In [None]:
# Differential activity between the two phenotypes defined above; copies are passed so the
# loaded tables are not handed to miner directly. The call also receives a PDF path via `savefile`.
volcano_data = miner.differentialActivity(
    regulon_matrix = regulonDf.copy(),
    reference_matrix = overExpressedMembersMatrix.copy(),
    baseline_patients = phenotype1,
    relapse_patients = phenotype2,
    maxRegulons = 3,
    minRegulons = 3,
    useAllRegulons = False,
    savefile = 'figure.pdf',
)

In [None]:
# Preview the first rows of the differential-activity table before plotting it.
volcano_data.head()

In [None]:
# Partition the rows of volcano_data: p < 0.05 (i.e. -log10(p) above the threshold) split by
# the sign of the log2 ratio, and everything at or below the threshold as non-significant.
significance_threshold = -numpy.log10(0.05)
minus_log10_p = volcano_data["-log10(p)"]
log2_ratio = volcano_data['log2(phenotype2/phenotype1)']
is_significant = minus_log10_p > significance_threshold

insigvoldata_patients = volcano_data.index[minus_log10_p <= significance_threshold]
sigvoldata_patients_plus = volcano_data.index[is_significant & (log2_ratio > 0)]
sigvoldata_patients_minus = volcano_data.index[is_significant & (log2_ratio < 0)]

insigvoldata = volcano_data.loc[insigvoldata_patients,:]
sigvoldata_plus = volcano_data.loc[sigvoldata_patients_plus,:]
sigvoldata_minus = volcano_data.loc[sigvoldata_patients_minus,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

# Layers are drawn in this order: significant up (red), significant down (blue),
# then non-significant (black).
scatter_layers = [
    (sigvoldata_plus, {'color':'red','s':75}),
    (sigvoldata_minus, {'color':'blue','s':100}),
    (insigvoldata, {'color':'black','edgecolor':[0.1,0.1,0.1],'s':75}),
]
for layer_data, layer_style in scatter_layers:
    matplotlib.pyplot.scatter(
        layer_data["phenotype2_frequency"],
        numpy.array(layer_data["log2(phenotype2/phenotype1)"]),
        alpha=0.2,
        linewidths=0,
        **layer_style
    )

matplotlib.pyplot.title('Relapse vs background',fontsize=40)
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")
matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Report the size of each significant subset, stack them, relabel the phenotype columns
# for this comparison (phenotype1 = background, phenotype2 = relapse) and save the table.
for significant_subset in (sigvoldata_plus,sigvoldata_minus):
    print(significant_subset.shape)

column_labels={
    'phenotype1_frequency':'background_frequency',
    'phenotype2_frequency':'relapse_frequency',
    'log2(phenotype2/phenotype1)':'log2(relapse/background)',
}
full_df=pandas.concat([sigvoldata_plus,sigvoldata_minus])
print(full_df.shape)
full_df=full_df.rename(columns=column_labels)
full_df.to_csv('relapse_vs_background.v2.csv')
full_df.head()

## 2.2 Find relapse signatures using matched patients

In [None]:
# Matched design: keep a relapse sample only when the same patient also has a baseline
# sample, and keep each such baseline once.
# all_patients already contains only '_BM' samples, so no further tissue check is needed.

# Set of known sample labels so each baseline look-up is O(1) instead of a list scan.
known_samples=set(all_patients)

# Every sample whose third '_'-separated field is not '1' is a relapse event.
relapse_events=sorted(pat for pat in all_patients if pat.split("_")[2] != '1')

matched_baselines=set()
matched_relapses=set()
for event in relapse_events:
    label=event.split('_')[1]
    # Label the baseline sample of this patient would carry.
    baseline_label='MMRF_{}_1_BM'.format(label)
    if baseline_label in known_samples:
        matched_baselines.add(baseline_label)
        matched_relapses.add(event)

# Sets remove the duplicates that arise when a patient has several relapse samples.
baseline_patients=sorted(matched_baselines)
relapse_patients=sorted(matched_relapses)

phenotype1 = baseline_patients
phenotype2 = relapse_patients

print('Found {} baseline events.'.format(len(phenotype1)))
print(phenotype1[:10])
print('')
print('Found {} relapse events.'.format(len(phenotype2)))
print(phenotype2[:10])

In [None]:
# Differential activity for the matched baseline/relapse phenotypes; copies of the loaded
# tables are passed to miner. The first rows of the result are displayed.
volcano_data = miner.differentialActivity(
    regulon_matrix = regulonDf.copy(),
    reference_matrix = overExpressedMembersMatrix.copy(),
    baseline_patients = phenotype1,
    relapse_patients = phenotype2,
    maxRegulons = 3,
    minRegulons = 3,
    useAllRegulons = False,
)
volcano_data.head()

In [None]:
# Partition the rows of volcano_data: p < 0.05 (i.e. -log10(p) above the threshold) split by
# the sign of the log2 ratio, and everything at or below the threshold as non-significant.
significance_threshold = -numpy.log10(0.05)
minus_log10_p = volcano_data["-log10(p)"]
log2_ratio = volcano_data['log2(phenotype2/phenotype1)']
is_significant = minus_log10_p > significance_threshold

insigvoldata_patients = volcano_data.index[minus_log10_p <= significance_threshold]
sigvoldata_patients_plus = volcano_data.index[is_significant & (log2_ratio > 0)]
sigvoldata_patients_minus = volcano_data.index[is_significant & (log2_ratio < 0)]

insigvoldata = volcano_data.loc[insigvoldata_patients,:]
sigvoldata_plus = volcano_data.loc[sigvoldata_patients_plus,:]
sigvoldata_minus = volcano_data.loc[sigvoldata_patients_minus,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

# Layers are drawn in this order: significant up (red), significant down (blue),
# then non-significant (black).
scatter_layers = [
    (sigvoldata_plus, {'color':'red','s':75}),
    (sigvoldata_minus, {'color':'blue','s':100}),
    (insigvoldata, {'color':'black','edgecolor':[0.1,0.1,0.1],'s':75}),
]
for layer_data, layer_style in scatter_layers:
    matplotlib.pyplot.scatter(
        layer_data["phenotype2_frequency"],
        numpy.array(layer_data["log2(phenotype2/phenotype1)"]),
        alpha=0.2,
        linewidths=0,
        **layer_style
    )

matplotlib.pyplot.title('Relapse vs baseline',fontsize=40)
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")

matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Report the size of each significant subset, stack them, relabel the phenotype columns
# and save the table.
print(sigvoldata_plus.shape)
print(sigvoldata_minus.shape)
full_df=pandas.concat([sigvoldata_plus,sigvoldata_minus])
print(full_df.shape)
# In this matched comparison phenotype1 is baseline and phenotype2 is relapse, so the
# fold-change column is relapse over baseline (it was previously mislabelled
# 'log2(baseline/background)').
full_df=full_df.rename(columns={
    'phenotype1_frequency':'baseline_frequency',
    'phenotype2_frequency':'relapse_frequency',
    'log2(phenotype2/phenotype1)':'log2(relapse/baseline)',
})
full_df.to_csv(path_or_buf='relapse_vs_baseline.v2.csv')
full_df.head()

# 3. Functional annotation of relapse-baseline networks

In [None]:
def reactome_formatter(json_file,synonyms_files=None):
    """Flatten an over-representation JSON file into a tab-separated table.

    The JSON is read as ``data['overrepresentation']['group']``; each group's
    ``'result'`` may be a single dict or a list of dicts. Only results whose
    ``input_list.plus_minus`` is ``'+'`` are written. Mapped IDs are translated
    into gene symbols and gene names through the synonyms files.

    Parameters
    ----------
    json_file : str
        Path to the JSON file. It must contain '.json', which is replaced by
        '_formatted.txt' to build the output path.
    synonyms_files : iterable of str, optional
        Tab-separated files whose second column is the ID and whose third
        column is 'gene name;gene symbol[;...]'. Defaults to the two
        project files used originally (up and down synonyms).

    Returns
    -------
    str
        Path of the table that was written.

    Raises
    ------
    ValueError
        If the output path would be identical to ``json_file`` (no '.json'
        in the name), which would overwrite the input.
    KeyError
        If a mapped ID is missing from the synonyms files or an expected
        JSON field is absent.
    """

    if synonyms_files is None:
        synonyms_files=['/Volumes/omics4tb2/alomana/projects/MINER/up_synonyms.txt',
                        '/Volumes/omics4tb2/alomana/projects/MINER/down_synonyms.txt']

    output_file=json_file.replace('.json','_formatted.txt')
    if output_file == json_file:
        # Without this guard the input would be truncated when the output is opened.
        raise ValueError('json_file must contain ".json" so the output does not overwrite it: {}'.format(json_file))

    # read synonyms files
    synonyms={}
    for synonyms_file in synonyms_files:
        with open(synonyms_file,'r') as s:
            for line in s:
                v=line.rstrip('\n').split('\t')
                if len(v) < 3:
                    # blank or truncated line: nothing to map
                    continue
                annotation=v[2].split(';')
                # stored as [gene symbol, gene name]
                synonyms[v[1]]=[annotation[1],annotation[0]]
    # end read synonyms file

    # Parse the input fully before creating the output file.
    with open(json_file,'r') as f:
        data=json.load(f)

    groups=data['overrepresentation']['group']
    if isinstance(groups,dict):
        # a single group may not be wrapped in a list, same as 'result' below
        groups=[groups]

    header=['Level','Term','Background rank','Found rank','Expected rank','Fold enrichment','Sign','P-value','Transcripts','Gene symbols','Gene names']

    with open(output_file,'w') as g:
        g.write('\t'.join(header)+'\n')

        for group in groups:
            if not isinstance(group,dict):
                continue

            result=group['result']
            if isinstance(result,list):
                pathways=result
            elif isinstance(result,dict):
                pathways=[result]
            else:
                print('ERROR: value not considered for variable group[result].')
                # skip this group instead of re-using the previous group's pathways
                continue

            for element in pathways:

                sign=element['input_list']['plus_minus']
                if sign != '+':
                    continue

                working_variable=element['input_list']['mapped_id_list']['mapped_id']
                if isinstance(working_variable,list):
                    transcripts=working_variable
                elif isinstance(working_variable,str):
                    # a single mapped ID comes as a bare string
                    transcripts=[working_variable]
                else:
                    print('ERROR: value not considered for variable working_variable.')
                    # skip this row instead of writing the previous row's genes
                    continue

                row=[
                    element['term']['level'],
                    element['term']['label'],
                    element['number_in_reference'],
                    element['input_list']['number_in_list'],
                    element['input_list']['expected'],
                    element['input_list']['fold_enrichment'],
                    sign,
                    element['input_list']['pValue'],
                    ', '.join(transcripts),
                    ', '.join(synonyms[ID][0] for ID in transcripts),
                    ', '.join(synonyms[ID][1] for ID in transcripts),
                ]
                g.write('\t'.join('{}'.format(field) for field in row)+'\n')

    return output_file

In [None]:
def panther_formatter(json_file,synonyms_files=None):
    """Flatten an over-representation JSON file into a tab-separated table.

    The JSON is read as ``data['overrepresentation']['group']``; each group's
    ``'result'`` may be a single dict or a list of dicts. Only results with
    ``pValue < 0.05`` and ``plus_minus == '+'`` are written. Mapped IDs are
    translated into gene symbols and gene names through the synonyms files.

    Parameters
    ----------
    json_file : str
        Path to the JSON file. It must contain '.json', which is replaced by
        '_formatted.txt' to build the output path.
    synonyms_files : iterable of str, optional
        Tab-separated files whose second column is the ID and whose third
        column is 'gene name;gene symbol[;...]'. Defaults to the two
        project files used originally (up and down synonyms).

    Returns
    -------
    str
        Path of the table that was written.

    Raises
    ------
    ValueError
        If the output path would be identical to ``json_file`` (no '.json'
        in the name), which would overwrite the input.
    KeyError
        If a mapped ID is missing from the synonyms files or an expected
        JSON field is absent.
    """

    if synonyms_files is None:
        synonyms_files=['/Volumes/omics4tb2/alomana/projects/MINER/up_synonyms.txt',
                        '/Volumes/omics4tb2/alomana/projects/MINER/down_synonyms.txt']

    output_file=json_file.replace('.json','_formatted.txt')
    if output_file == json_file:
        # Without this guard the input would be truncated when the output is opened.
        raise ValueError('json_file must contain ".json" so the output does not overwrite it: {}'.format(json_file))

    # read synonyms files
    synonyms={}
    for synonyms_file in synonyms_files:
        with open(synonyms_file,'r') as s:
            for line in s:
                v=line.rstrip('\n').split('\t')
                if len(v) < 3:
                    # blank or truncated line: nothing to map
                    continue
                annotation=v[2].split(';')
                # stored as [gene symbol, gene name]
                synonyms[v[1]]=[annotation[1],annotation[0]]
    # end read synonyms file

    # Parse the input fully before creating the output file.
    with open(json_file,'r') as f:
        data=json.load(f)

    groups=data['overrepresentation']['group']
    if isinstance(groups,dict):
        # a single group may not be wrapped in a list, same as 'result' below
        groups=[groups]

    header=['Term','Background rank','Found rank','Expected rank','Fold enrichment','Sign','P-value','Transcripts','Gene symbols','Gene names']

    with open(output_file,'w') as g:
        g.write('\t'.join(header)+'\n')

        for group in groups:
            if not isinstance(group,dict):
                continue

            result=group['result']
            if isinstance(result,list):
                pathways=result
            elif isinstance(result,dict):
                pathways=[result]
            else:
                print('ERROR: value not considered for variable group[result].')
                # skip this group instead of re-using the previous group's pathways
                continue

            for element in pathways:
                pvalue=element['input_list']['pValue']
                sign=element['input_list']['plus_minus']

                if not (pvalue < 0.05 and sign == '+'):
                    continue

                working_variable=element['input_list']['mapped_id_list']['mapped_id']
                if isinstance(working_variable,list):
                    transcripts=working_variable
                elif isinstance(working_variable,str):
                    # a single mapped ID comes as a bare string; wrapping it avoids
                    # iterating over its characters
                    transcripts=[working_variable]
                else:
                    print('ERROR: value not considered for variable working_variable.')
                    continue

                row=[
                    element['term']['label'],
                    element['number_in_reference'],
                    element['input_list']['number_in_list'],
                    element['input_list']['expected'],
                    element['input_list']['fold_enrichment'],
                    sign,
                    pvalue,
                    ', '.join(transcripts),
                    ', '.join(synonyms[ID][0] for ID in transcripts),
                    ', '.join(synonyms[ID][1] for ID in transcripts),
                ]
                g.write('\t'.join('{}'.format(field) for field in row)+'\n')

    return output_file

## 3.1. Upregulation

In [None]:
# Format the pathway JSON for the up-regulated set and load the resulting table,
# indexed by the 'Term' column.
json_file='/Volumes/omics4tb2/alomana/projects/MINER/up_reactome_pathways.json'
output_file=reactome_formatter(json_file)
df=pandas.read_csv(output_file,sep='\t',index_col='Term')
print(df.shape)
# Ask for as many rows as the table has, i.e. show the whole table
# (what is rendered is still subject to the pandas display options).
df.head(n=df.shape[0])

In [None]:
# Keep only the rows whose 'Level' column equals 0 and show all of them.
specifics=df['Level'].eq(0)
new=df.loc[specifics]
print(new.shape)
new.head(len(new))

## 3.2. Downregulation

In [None]:

# Format the pathway JSON for the down-regulated set and load the resulting table,
# indexed by the 'Term' column. Note that `json_file`, `output_file` and `df` from the
# up-regulation cells are overwritten here.
json_file='/Volumes/omics4tb2/alomana/projects/MINER/down_panther_pathways.json'
output_file=panther_formatter(json_file)
df=pandas.read_csv(output_file,sep='\t',index_col='Term')
print(df.shape)
# Ask for as many rows as the table has, i.e. show the whole table
# (what is rendered is still subject to the pandas display options).
df.head(n=df.shape[0])