In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import string
import random
import re
import collections
import matplotlib.pyplot as plt
from scipy.signal import welch, hanning

In [None]:
# Load the tab-separated input table into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing it at a local copy of DATA.csv.
wps_table_path = '/Users/anamaria1/Desktop/Conferences/2019/Novartis/cfDNA/Hack9-DataAnalysisFolders/DATA.csv'
full_dataset = pd.read_csv(wps_table_path, sep='\t')

In [None]:
# Show the first two rows of the freshly loaded table.
full_dataset.head(2)

In [None]:
# Drop the two unused columns and give the remaining positional column names
# ('0'..'4') descriptive labels.
# Written as a non-mutating chain with errors='ignore' so the cell can be
# re-run safely: the original `del` statements raised KeyError on a second
# execution because the columns were already gone.
full_dataset = (
    full_dataset
    .drop(columns=['Unnamed: 0', '5'], errors='ignore')
    .rename(columns={'0': 'PatientId', '1': 'Chromosome', '2': 'ExonStart',
                     '3': 'ExonEnd', '4': 'WPS'})
)

In [None]:
# Report how many distinct PatientId values the table contains.
print('Number of  samples %s' % len(full_dataset['PatientId'].unique()))

In [None]:
# Show the first two rows again to inspect the table at this point.
full_dataset.head(2)

In [None]:
# Collapse the table to one row per (PatientId, Chromosome) pair by joining all
# of the group's WPS strings with commas. ','.join only accepts strings, so the
# WPS column must hold text at this point.
a = (
    full_dataset
    .groupby(['PatientId', 'Chromosome'])['WPS']
    .apply(','.join)
    .reset_index()
)

In [None]:
# Show the first rows of the per-(PatientId, Chromosome) table.
a.head()

In [None]:
def chr_profile(x):
    """Parse a comma-separated string of numbers into a list of floats.

    Parameters
    ----------
    x : str
        Text such as ``'1.5,-2,0'``.

    Returns
    -------
    list of float
        One float per comma-separated field, in the original order.

    Raises
    ------
    ValueError
        If any field cannot be converted by ``float`` (including an empty field).
    """
    return [float(field) for field in x.split(',')]

In [None]:
def len_wps(x):
    """Return the number of comma-separated fields in the string ``x``.

    The count is always one more than the number of commas, so an empty
    string yields 1.
    """
    return x.count(',') + 1

In [None]:
# Convert every joined WPS string in `a` into a list of floats; the resulting
# Series keeps the same index as `a`.
AllChrProfiles = a['WPS'].apply(chr_profile)

In [None]:
# Show the first parsed profiles (each entry is a list of floats).
AllChrProfiles.head()

In [None]:
# Index labels of the rows of `a` whose Chromosome is 'chr1'; used below to
# pick the matching entries out of AllChrProfiles.
ind_chr1 = a.loc[a['Chromosome'] == 'chr1'].index.values

In [None]:
# Stack the chr1 profiles into an array (one row per chr1 entry) and average
# across rows.
# NOTE(review): assumes every chr1 profile has the same length — confirm.
chr1_matrix = np.array([profile for profile in AllChrProfiles[ind_chr1]])
chr1_matrix_mean = chr1_matrix.mean(axis=0)

In [None]:
# Overlay up to 20 individual chr1 profiles (grey) with the chr1 mean (black).
# Slicing ind_chr1 instead of looping over a fixed range(0, 20) avoids an
# IndexError when fewer than 20 chr1 rows exist.
for row_id in ind_chr1[:20]:
    plt.plot(AllChrProfiles[row_id], color='0.75', linewidth=2)

# The mean and the axis decoration do not depend on the loop variable, so they
# are drawn once, after the samples, which keeps the mean line on top.
plt.plot(chr1_matrix_mean, color='k', linewidth=0.25)
plt.title('Chromosome 1')
plt.xlabel('Chromosome Position')
plt.ylabel('WPS')
plt.ylim((-15, 5))
plt.show()

In [None]:
# Read the per-sample annotation table, replace every missing value with the
# string 'N/A', and rename two columns so that 'PatientId' matches the key
# used in the WPS table.
# NOTE(review): fillna('N/A') is applied to all columns, so a numeric column
# containing NaN would become mixed-type — confirm that the purity/score
# columns compared in later cells have no missing values.
labels = (
    pd.read_csv('Data/20190819_WPS_NORM.hackathon_sample_info_updated.csv')
    .fillna('N/A')
    .rename(columns={'library': 'PatientId', 'hotspot': 'Oncogene'})
)

In [None]:
# Samples with an estimated purity of exactly 0 and a ctdna.score of at most 0.01.
zeroPurity = labels.loc[
    labels['estimated.purity'].eq(0) & labels['ctdna.score'].le(0.01)
]

In [None]:
# Samples whose estimated purity is at least 0.5.
highPurity = labels.loc[labels['estimated.purity'].ge(0.5)]

In [None]:
# Plain Python lists of the PatientId values in each purity cohort.
zero_ids, high_ids = (
    cohort['PatientId'].tolist() for cohort in (zeroPurity, highPurity)
)

In [None]:
# Restrict the per-(PatientId, Chromosome) table to each purity cohort.
# Row index labels from `a` are preserved.
zero_a = a.loc[a['PatientId'].isin(zero_ids)]
high_a = a.loc[a['PatientId'].isin(high_ids)]

In [None]:
# Number of distinct patients in the zero-purity cohort.
len(zero_a['PatientId'].unique())

In [None]:
# Number of distinct patients in the high-purity cohort.
len(high_a['PatientId'].unique())

In [None]:
# Remove X/Y chromosomes.
# high_a and zero_a were produced by boolean indexing on `a`, so dropping rows
# from them with inplace=True triggers pandas' SettingWithCopyWarning. Building
# new filtered frames avoids the warning, keeps the original row index labels
# (which later cells use to look up profiles), and is safe to re-run.
sex_chromosomes = ['chrX', 'chrY']
high_a = high_a[~high_a['Chromosome'].isin(sex_chromosomes)]
zero_a = zero_a[~zero_a['Chromosome'].isin(sex_chromosomes)]

In [None]:
# Parse the joined WPS strings of each cohort into lists of floats; each
# resulting Series keeps the index of its cohort frame.
Zero_ChrProfiles = zero_a['WPS'].apply(chr_profile)
High_ChrProfiles = high_a['WPS'].apply(chr_profile)

In [None]:
# n: number of distinct patients in the high-purity cohort.
n = len(high_a['PatientId'].unique())
# n random integers in [0, number of zero-purity patients).
# NOTE(review): r_ind is not referenced by any other cell in view, and no
# random seed is set, so its values differ between runs.
r_ind = np.random.randint(len(zero_a['PatientId'].unique()), size=n)
# j is a zero-based chromosome counter; j = 1 selects 'chr2'.
j = 1
label = 'chr%s' % (j + 1)
# Index labels of the selected chromosome's rows in each cohort.
k1 = high_a.loc[high_a['Chromosome'] == label].index.values
k2 = zero_a.loc[zero_a['Chromosome'] == label].index.values

In [None]:
# Stack the selected chromosome's profiles per cohort (tmp1 = high purity,
# tmp2 = zero purity) and average across samples.
# NOTE(review): assumes all profiles of the chromosome share one length — confirm.
tmp1_matrix = np.array([profile for profile in High_ChrProfiles[k1]])
tmp2_matrix = np.array([profile for profile in Zero_ChrProfiles[k2]])

tmp1_matrix_mean = tmp1_matrix.mean(axis=0)
tmp2_matrix_mean = tmp2_matrix.mean(axis=0)

In [None]:
from matplotlib.pyplot import style, savefig
# Plot side-by-side zero and high purity chromosome WPS data.
#
# Fixes relative to the previous version of this cell:
#   * Each panel now overlays the mean of its OWN cohort. Previously the
#     zero-purity panel showed tmp1_matrix_mean (high-purity mean) and the
#     high-purity panel showed tmp2_matrix_mean (zero-purity mean).
#   * At most n profiles per cohort are drawn via slicing, so a cohort with
#     fewer than n rows no longer raises IndexError.
#   * The mean line and the figure title are drawn once instead of n times.
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Chromosome %s' % (j + 1))

# (axis, per-sample profiles, row labels, cohort mean, panel title)
panels = (
    (ax1, Zero_ChrProfiles, k2, tmp2_matrix_mean, 'Zero Purity'),
    (ax2, High_ChrProfiles, k1, tmp1_matrix_mean, 'Above 50% Purity'),
)
for ax, profiles, row_ids, cohort_mean, panel_title in panels:
    for row_id in row_ids[:n]:
        ax.plot(profiles[row_id], color='0.75', linewidth=2)
    # Drawn last so the thin black mean stays on top of the grey samples.
    ax.plot(cohort_mean, color='k', linewidth=0.25)
    ax.set_xlabel('Chromosome Position')
    ax.set_ylim((-15, 5))
    ax.set_title(panel_title)
    ax.set_xticklabels([])

# Only the left panel carries the y-axis label.
ax1.set_ylabel('WPS')
#savefig('chr%s_plots.eps'%(j+1),bbox_inches='tight')
plt.show()

In [None]:
####################################################################################
############### Wavelet Transformations of the WPS #################################
####################################################################################

In [None]:
from scipy import signal 
import math 
import pywt

In [None]:
# Continuous wavelet transform (Morlet) of one zero-purity sample, shown as one
# image per chromosome (chr1..chr22) on a 5x5 grid.
#
# Changes relative to the previous version of this cell:
#   * The three grid cells beyond chr22 are switched off instead of being left
#     as empty framed axes.
#   * The CWT of the high-purity sample was computed on every iteration but
#     never displayed; it has been removed from this zero-purity figure.
#   * Magic numbers are named, and the figure is bound to `fig` rather than `_`.
fig, axs = plt.subplots(5, 5)
fig.suptitle('Zero purity sample')
scales = range(1, 128)
waveletname = 'morl'

n_chromosomes = 22   # chr1..chr22
sample_pos = 5       # position (within each chromosome's rows) of the sample shown
n_positions = 127    # only the first 127 columns of the coefficient matrix are drawn

# Row-major flattening: grid_axes[j] is the same axis as axs[j // 5, j % 5].
grid_axes = axs.ravel()
for j in range(n_chromosomes):
    label = 'chr%s' % (j + 1)
    k2 = zero_a[zero_a['Chromosome'] == label].index.values

    # NOTE(review): raises IndexError if a chromosome has fewer than
    # sample_pos + 1 zero-purity rows.
    coeff2, freq2 = pywt.cwt(Zero_ChrProfiles[k2[sample_pos]], scales, waveletname, 1)
    coeff2_ = coeff2[:, :n_positions]

    ax = grid_axes[j]
    # vmin=0 maps all negative coefficients to the lowest colour.
    ax.imshow(coeff2_, vmin=0, vmax=np.max(coeff2_))
    ax.set_title('Chr %s' % (j + 1))
    ax.axis('off')

# Hide the grid cells that have no chromosome assigned.
for ax in grid_axes[n_chromosomes:]:
    ax.axis('off')

fig.tight_layout()
#plt.savefig('ZeroPurity_CWT_PerChromosome.eps', dpi=80, bbox_inches='tight')
plt.show()