In [1]:
import pandas,os,importlib,numpy

import matplotlib,matplotlib.pyplot
import scipy,scipy.stats
import miner_20190520 as miner

# Global matplotlib defaults for every figure in this notebook, applied in one call.
# 'axes.axisbelow' draws grid lines underneath the plotted data; 'pdf.fonttype' 42
# makes saved PDFs embed TrueType fonts.
matplotlib.rcParams.update({
    'axes.axisbelow':True,
    'font.size':18,
    'font.family':'Arial',
    'xtick.labelsize':20,
    'ytick.labelsize':20,
    'axes.labelsize':30,
    'pdf.fonttype':42,
})

ModuleNotFoundError: No module named 'miner_20190520'

In [None]:
# Cytogenetic features of the patient profile; the same three column names are
# required to equal 1 in the similarity filter further down.
# Trisomy 5 would also belong here but is not found in MMRF. Trisomy 5 carries a
# better prognosis in t(4;14) patients.
patient=['amp1q','t414','del13']

# 1. Read cytogenetics

In [None]:
# Absolute, machine-specific locations of the cytogenetics table and the IA12 translocation calls.
# NOTE(review): the next cell assigns these same two names again, so when the notebook
# is run top to bottom only the later assignment takes effect.
cytogenetics_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/data/mutations/cytogenetics.csv'
calls_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/data/mutations/translocationsIA12.csv'

In [None]:
# Same two files under a different home directory (Google Drive File Stream mount).
# These assignments override the ones made in the previous cell.
# NOTE(review): absolute paths tie the notebook to one machine — consider a single configurable data directory.
cytogenetics_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/data/mutations/cytogenetics.csv'
calls_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/data/mutations/translocationsIA12.csv'

In [None]:
# Load the cytogenetics table, using the first CSV column as the row index.
# The bare `df` on the last line displays the frame as the cell output.
df=pandas.read_csv(cytogenetics_file,index_col=0)
df

In [None]:
# Load the translocation calls table, using the first CSV column as the row index.
# The bare `calls` on the last line displays the frame as the cell output.
calls=pandas.read_csv(calls_file,index_col=0)
calls

# 2. Find similar patients

In [None]:
# Work on transposed copies of both tables so that the feature names used below
# ('amp1q', 'RNASeq_WHSC1_Call', ...) are addressable as columns.
dft=df.transpose()
callsT=calls.transpose()

In [None]:
# TODO: also restrict to patients with available RNA-seq calls.
# Required value of each cytogenetic column for a row to count as "similar":
# the three features of the profile must be present (1) and the others absent (0).
required_flags={
    'amp1q':1,
    't414':1,   # whc1
    'del13':1,
    'del17':0,
    't1114':0,  # ccnd1
    't1416':0,  # maf
    'del17p':0,
    'del1p':0,
}

# Start from an all-True mask and AND in one equality test per column.
keep_row=pandas.Series(True,index=dft.index)
for flag_name,flag_value in required_flags.items():
    keep_row &= (dft[flag_name] == flag_value)

similar=dft[keep_row]
print(similar.shape)
similar

In [None]:
# Show the RNA-seq based translocation calls for the cytogenetically similar patients.
# Rows and columns are selected in a single .loc call instead of chained indexing.
# The previous bare `callsT.head()` was dropped: it was not the last expression of
# the cell, so its result was never displayed.
similar_patient_labels=similar.index.tolist()
rnaseq_call_columns=['RNASeq_WHSC1_Call','RNASeq_CCND1_Call','RNASeq_MAF_Call']
callsT.loc[similar_patient_labels,rnaseq_call_columns]

# 3. Map patients to relapse networks

## 3.0. Info paths

In [None]:
# Absolute, machine-specific locations of the expression matrix, the regulon table
# and the over-expressed members matrix.
# NOTE(review): the next cell assigns these same three names again, so when the notebook
# is run top to bottom only the later assignment takes effect.
expression_data_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/data/expression/IA12Zscore.csv'
regulonDf_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/regulonDf.csv'
overExpressedMembersMatrix_file='/Users/adrianlopezgarciadelomana/gd/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/overExpressedMembers.csv'

In [None]:
# Same three files under a different home directory (Google Drive File Stream mount).
# These assignments override the ones made in the previous cell.
# NOTE(review): absolute paths tie the notebook to one machine — consider a single configurable data directory.
expression_data_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/data/expression/IA12Zscore.csv'
regulonDf_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/regulonDf.csv'
overExpressedMembersMatrix_file='/Users/alomana/Google Drive File Stream/My Drive/projects/MINER/shared/MINER/results_minCorrelation_0o2_50_allFiles/overExpressedMembers.csv'

## 3.1. Read data

In [None]:
# Load the expression matrix through the project's miner module.
# NOTE(review): presumably returns the processed expression DataFrame (samples as columns,
# as used below) plus a gene-ID conversion table — confirm against miner.preprocess.
expressionData, conversionTable = miner.preprocess(expression_data_file)

In [None]:
# Regulon table: first CSV column becomes the index, first row the column names.
regulonDf = pandas.read_csv(regulonDf_file,index_col=0,header=0)

In [None]:
# Over-expressed members matrix: first CSV column becomes the index, first row the column names.
overExpressedMembersMatrix = pandas.read_csv(overExpressedMembersMatrix_file,index_col=0,header=0)

# explore relapse MMRF

In [None]:
# Collect every expression sample whose second '_'-separated field matches that
# field in the label of one of the cytogenetically similar patients.
target_labels=similar.index.tolist()
target_ids=[label.split('_')[1] for label in target_labels]

mmrf_labels=expressionData.columns
matches=sorted(label for label in mmrf_labels if label.split('_')[1] in target_ids)
print(matches)

## 3.2 Find relapse signatures using all baseline samples

In [None]:
# Split the expression samples into phenotype groups using the third
# '_'-separated field of each sample label ('1' = baseline, anything else =
# relapse, '2' = first relapse).
all_patients = expressionData.columns

baseline_patients = []
first_relapse_patients = []
relapse_patients = []
for sample_label in all_patients:
    visit_field = sample_label.split("_")[2]
    if visit_field == '1':
        baseline_patients.append(sample_label)
        continue
    relapse_patients.append(sample_label)
    if visit_field == '2':
        first_relapse_patients.append(sample_label)

# Relapse samples beyond the first relapse (set difference, so order is arbitrary).
multiple_relapse_patients = list(set(relapse_patients)-set(first_relapse_patients))

# Phenotypes compared downstream: baseline versus any relapse.
phenotype1 = baseline_patients
phenotype2 = relapse_patients

print(len(phenotype1))
print(len(phenotype2))

print(phenotype2[:10])

In [None]:
# Differential activity between baseline (phenotype1) and relapse (phenotype2) samples.
# Copies of both matrices are passed, so the originals are not handed to the call.
# (Run importlib.reload(miner) first if the miner module was edited in this session.)
volcano_data = miner.differentialActivity(
    regulon_matrix = regulonDf.copy(),
    reference_matrix = overExpressedMembersMatrix.copy(),
    baseline_patients = phenotype1,
    relapse_patients = phenotype2,
    maxRegulons = 3,
    minRegulons = 3,
    useAllRegulons = False,
    savefile = 'figure.pdf',
)

In [None]:
# Preview the first rows of the differential-activity table.
volcano_data.head()

In [None]:
# Index label highlighted in green on the plot (an Ensembl gene ID, stored as ikzf1).
ikzf1 = "ENSG00000185811"

# -log10(p) cut-off corresponding to p = 0.05.
significance_cutoff = -numpy.log10(0.05)
neg_log_p = volcano_data["-log10(p)"]
log2_ratio = volcano_data['log2(phenotype2/phenotype1)']
above_cutoff = neg_log_p > significance_cutoff

# Index labels of the three groups: not significant, significant with a
# positive log2 ratio, significant with a negative log2 ratio.
insigvoldata_patients = volcano_data.index[neg_log_p <= significance_cutoff]
sigvoldata_patients_plus = volcano_data.index[above_cutoff & (log2_ratio > 0)]
sigvoldata_patients_minus = volcano_data.index[above_cutoff & (log2_ratio < 0)]

print(sigvoldata_patients_plus,len(sigvoldata_patients_plus))

insigvoldata = volcano_data.loc[insigvoldata_patients,:]
sigvoldata_plus = volcano_data.loc[sigvoldata_patients_plus,:]
sigvoldata_minus = volcano_data.loc[sigvoldata_patients_minus,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

# One scatter layer per group, drawn in this order: red, blue, black.
scatter_layers = [
    (sigvoldata_plus, dict(color = 'red',alpha=0.2,s=75,linewidths=0)),
    (sigvoldata_minus, dict(color = 'blue',alpha=0.2,s=100,linewidths=0)),
    (insigvoldata, dict(color ='black',edgecolor = [0.1,0.1,0.1],alpha=0.2,s=75,linewidths=0)),
]
for layer_data, layer_style in scatter_layers:
    matplotlib.pyplot.scatter(
        layer_data["phenotype2_frequency"],
        numpy.array(layer_data["log2(phenotype2/phenotype1)"]),
        **layer_style
    )

# Highlight the single ikzf1 row on top of the other layers.
matplotlib.pyplot.scatter(
    volcano_data.loc[ikzf1,"phenotype2_frequency"],
    numpy.array(volcano_data.loc[ikzf1,"log2(phenotype2/phenotype1)"]),
    color = 'green',s=75,linewidths=0
)

matplotlib.pyplot.xlim(-0.05,0.65)
matplotlib.pyplot.ylim(-3.2,3.2)
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Keep rows that pass all three cut-offs: significance (p = 0.05),
# log2 ratio above 0.5 and phenotype2 frequency above 0.5.
passes_significance = volcano_data["-log10(p)"] > -numpy.log10(0.05)
passes_log2_ratio = volcano_data['log2(phenotype2/phenotype1)'] > 0.5
passes_frequency = volcano_data['phenotype2_frequency'] > 0.5
subset_patients = volcano_data.index[passes_significance & passes_log2_ratio & passes_frequency]

subset_data = volcano_data.loc[subset_patients,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

# Same axes as the full plot above, showing only the selected rows in orange.
matplotlib.pyplot.scatter(
    subset_data["phenotype2_frequency"],
    numpy.array(subset_data["log2(phenotype2/phenotype1)"]),
    color = 'orange',alpha=0.2,s=75,linewidths=0
)

matplotlib.pyplot.xlim(-0.05,0.65)
matplotlib.pyplot.ylim(-3.2,3.2)
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Write the selected rows to case_1.csv (relative path, so it lands in the kernel's working directory).
subset_data.to_csv(path_or_buf='case_1.csv')

## 3.3 Find relapse signatures using matched patients

In [None]:
# Count how many sample labels contain 'BM' and how many contain 'PB',
# and compare the sum with the total number of samples.
print(len(all_patients))
a=[]
b=[]
for sample_label in all_patients:
    if 'BM' in sample_label:
        a.append(sample_label)
    if 'PB' in sample_label:
        b.append(sample_label)
print(len(a),len(b),len(a)+len(b))

In [None]:
# Every sample whose third '_'-separated field is not '1' is treated as a relapse event.
relapse_events = sorted(pat for pat in all_patients if pat.split("_")[2]!='1')

# For each relapse event, look for the same patient's 'MMRF_<id>_1_BM' baseline sample.
# NOTE(review): the baseline is kept even when the relapse event itself is not a
# 'BM' sample, so baseline_patients can contain patients with no entry in
# relapse_patients — confirm this is intended for a "matched" comparison.
matched_baselines = set()
matched_relapses = set()
for event in relapse_events:
    baseline_label = 'MMRF_{}_1_BM'.format(event.split('_')[1])
    if baseline_label not in all_patients:
        continue
    matched_baselines.add(baseline_label)
    if 'BM' in event:
        matched_relapses.add(event)

# Unique labels, sorted.
baseline_patients = sorted(matched_baselines)
relapse_patients = sorted(matched_relapses)

print(baseline_patients,len(baseline_patients))
print(relapse_patients,len(relapse_patients))

In [None]:
# Phenotypes for the matched comparison: baseline first, relapse second.
phenotype1, phenotype2 = baseline_patients, relapse_patients

In [None]:
# Differential activity for the matched baseline/relapse samples.
# Copies of both matrices are passed; no savefile is given in this call.
volcano_data = miner.differentialActivity(
    regulon_matrix = regulonDf.copy(),
    reference_matrix = overExpressedMembersMatrix.copy(),
    baseline_patients = phenotype1,
    relapse_patients = phenotype2,
    maxRegulons = 3,
    minRegulons = 3,
    useAllRegulons = False,
)
volcano_data.head()

In [None]:
# -log10(p) cut-off corresponding to p = 0.05.
significance_cutoff = -numpy.log10(0.05)
neg_log_p = volcano_data["-log10(p)"]
log2_ratio = volcano_data['log2(phenotype2/phenotype1)']
above_cutoff = neg_log_p > significance_cutoff

# Index labels of the three groups: not significant, significant with a
# positive log2 ratio, significant with a negative log2 ratio.
insigvoldata_patients = volcano_data.index[neg_log_p <= significance_cutoff]
sigvoldata_patients_plus = volcano_data.index[above_cutoff & (log2_ratio > 0)]
sigvoldata_patients_minus = volcano_data.index[above_cutoff & (log2_ratio < 0)]

insigvoldata = volcano_data.loc[insigvoldata_patients,:]
sigvoldata_plus = volcano_data.loc[sigvoldata_patients_plus,:]
sigvoldata_minus = volcano_data.loc[sigvoldata_patients_minus,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

# One scatter layer per group, drawn in this order: red, blue, black.
scatter_layers = [
    (sigvoldata_plus, dict(color = 'red',alpha=0.2,s=75,linewidths=0)),
    (sigvoldata_minus, dict(color = 'blue',alpha=0.2,s=100,linewidths=0)),
    (insigvoldata, dict(color ='black',edgecolor = [0.1,0.1,0.1],alpha=0.2,s=75,linewidths=0)),
]
for layer_data, layer_style in scatter_layers:
    matplotlib.pyplot.scatter(
        layer_data["phenotype2_frequency"],
        numpy.array(layer_data["log2(phenotype2/phenotype1)"]),
        **layer_style
    )

# Highlight the ikzf1 row (label defined in an earlier cell) on top of the other layers.
matplotlib.pyplot.scatter(
    volcano_data.loc[ikzf1,"phenotype2_frequency"],
    numpy.array(volcano_data.loc[ikzf1,"log2(phenotype2/phenotype1)"]),
    color = 'green',s=75,linewidths=0
)

matplotlib.pyplot.xlim(-0.05,0.7)
matplotlib.pyplot.ylim(-3.5,3.5)
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Keep rows that pass all three cut-offs: significance (p = 0.05),
# log2 ratio above 0.5 and phenotype2 frequency above 0.55.
passes_significance = volcano_data["-log10(p)"] > -numpy.log10(0.05)
passes_log2_ratio = volcano_data['log2(phenotype2/phenotype1)'] > 0.5
passes_frequency = volcano_data['phenotype2_frequency'] > 0.55
subset_patients = volcano_data.index[passes_significance & passes_log2_ratio & passes_frequency]

subset_data = volcano_data.loc[subset_patients,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

# Same axes as the full matched plot, showing only the selected rows in orange.
matplotlib.pyplot.scatter(
    subset_data["phenotype2_frequency"],
    numpy.array(subset_data["log2(phenotype2/phenotype1)"]),
    color = 'orange',alpha=0.2,s=75,linewidths=0
)

matplotlib.pyplot.xlim(-0.05,0.7)
matplotlib.pyplot.ylim(-3.5,3.5)
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Write the selected rows to case_2.csv (relative path, so it lands in the kernel's working directory).
subset_data.to_csv(path_or_buf='case_2.csv')

## 3.4 Find relapse signatures using matched patients of specific cytogenetics

In [None]:
# Patients with the cytogenetic profile of interest that have expression samples.
target_patient_ids=['1783','1433']

# Baseline = visit-1 bone-marrow sample of each target patient.
baseline_patients=['MMRF_{}_1_BM'.format(patient_id) for patient_id in target_patient_ids]

# Relapse = every non-visit-1 'BM' sample of the same patients, read from the
# expression sample labels instead of being typed by hand. The hand-typed list
# contained 'MMRF_1783_1_BM', a visit-1 sample, which put the same sample in
# both phenotypes.
# NOTE(review): confirm the derived list matches the intended relapse samples.
relapse_patients=sorted(
    sample_label for sample_label in all_patients
    if sample_label.split('_')[1] in target_patient_ids
    and sample_label.split('_')[2]!='1'
    and 'BM' in sample_label
)

phenotype1 = baseline_patients
phenotype2 = relapse_patients

print(phenotype1,len(phenotype1))
print(phenotype2,len(phenotype2))

In [None]:
# Re-import the miner module so that edits made to it during this session are picked up.
importlib.reload(miner)

# Differential activity for the cytogenetics-specific samples; unlike the two
# earlier calls, useAllRegulons is True here.
volcano_data = miner.differentialActivity(
    regulon_matrix = regulonDf.copy(),
    reference_matrix = overExpressedMembersMatrix.copy(),
    baseline_patients = phenotype1,
    relapse_patients = phenotype2,
    maxRegulons = 3,
    minRegulons = 3,
    useAllRegulons = True,
)
volcano_data.head()

In [None]:
# -log10(p) cut-off corresponding to p = 0.05.
significance_cutoff = -numpy.log10(0.05)
neg_log_p = volcano_data["-log10(p)"]
log2_ratio = volcano_data['log2(phenotype2/phenotype1)']
above_cutoff = neg_log_p > significance_cutoff

# Index labels of the three groups: not significant, significant with a
# positive log2 ratio, significant with a negative log2 ratio.
insigvoldata_patients = volcano_data.index[neg_log_p <= significance_cutoff]
sigvoldata_patients_plus = volcano_data.index[above_cutoff & (log2_ratio > 0)]
sigvoldata_patients_minus = volcano_data.index[above_cutoff & (log2_ratio < 0)]

insigvoldata = volcano_data.loc[insigvoldata_patients,:]
sigvoldata_plus = volcano_data.loc[sigvoldata_patients_plus,:]
sigvoldata_minus = volcano_data.loc[sigvoldata_patients_minus,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

# One scatter layer per group, drawn in this order: red, blue, black.
scatter_layers = [
    (sigvoldata_plus, dict(color = 'red',alpha=0.2,s=75,linewidths=0)),
    (sigvoldata_minus, dict(color = 'blue',alpha=0.2,s=100,linewidths=0)),
    (insigvoldata, dict(color ='black',edgecolor = [0.1,0.1,0.1],alpha=0.2,s=75,linewidths=0)),
]
for layer_data, layer_style in scatter_layers:
    matplotlib.pyplot.scatter(
        layer_data["phenotype2_frequency"],
        numpy.array(layer_data["log2(phenotype2/phenotype1)"]),
        **layer_style
    )

# Highlight the ikzf1 row (label defined in an earlier cell) on top of the other layers.
matplotlib.pyplot.scatter(
    volcano_data.loc[ikzf1,"phenotype2_frequency"],
    numpy.array(volcano_data.loc[ikzf1,"log2(phenotype2/phenotype1)"]),
    color = 'green',s=75,linewidths=0
)

# Axis limits are left to matplotlib's automatic scaling in this figure.
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Keep rows with a log2 ratio above 0.25 and a phenotype2 frequency above 2/3.
# Note: no -log10(p) significance condition is applied in this selection.
passes_log2_ratio = volcano_data['log2(phenotype2/phenotype1)'] > 0.25
passes_frequency = volcano_data['phenotype2_frequency'] > 2/3
subset_patients = volcano_data.index[passes_log2_ratio & passes_frequency]

subset_data = volcano_data.loc[subset_patients,:]

fig = matplotlib.pyplot.figure(figsize=(20,10))

matplotlib.pyplot.scatter(
    subset_data["phenotype2_frequency"],
    numpy.array(subset_data["log2(phenotype2/phenotype1)"]),
    color = 'orange',alpha=0.2,s=75,linewidths=0
)

# Axis limits are left to matplotlib's automatic scaling in this figure.
matplotlib.pyplot.ylabel("log2 FC R/B gene activity ")
matplotlib.pyplot.xlabel("Relapse overexpression frequency")
matplotlib.pyplot.grid()
matplotlib.pyplot.tight_layout()

In [None]:
# Write the selected rows to case_3.csv (relative path, so it lands in the kernel's working directory).
subset_data.to_csv(path_or_buf='case_3.csv')

In [None]:
# consider finding DETs in relapse, patient by patient with specific cytogenetics, then map to regulons

# 4. Map patients to states