In [1]:
import requests
import gzip
import numpy as np
import pandas as pd
from Bio import SeqIO


In [2]:
GENOME_PATH = "hg38.fa"


In [3]:
class DataSource:
    # Sourced from https://github.com/meuleman/SynthSeqs/blob/main/make_data/source.py

    def __init__(self, data, filepath):
        self.raw_data = data
        self.filepath = filepath

    @property
    def data(self):
        return self.raw_data


class ReferenceGenome(DataSource):
    """Object for quickly loading and querying the reference genome."""

    @classmethod
    def from_path(cls, path):
        genome_dict = {record.id: str(record.seq).upper() for record in SeqIO.parse(path, "fasta")}
        return cls(genome_dict, path)

    @classmethod
    def from_dict(cls, data_dict):
        return cls(data_dict, filepath=None)

    @property
    def genome(self):
        return self.data

    def sequence(self, chrom, start, end):
        chrom_sequence = self.genome[chrom]

        assert end < len(chrom_sequence), (
            f"Sequence position bound out of range for chromosome {chrom}. "
            f"{chrom} length {len(chrom_sequence)}, requested position {end}."
        )
        return chrom_sequence[start:end]


genome = ReferenceGenome.from_path(GENOME_PATH)

In [4]:
#!wget https://www.meuleman.org/DHS_Index_and_Vocabulary_metadata.tsv

# Last row is empty
DHS_Index_and_Vocabulary_metadata = pd.read_table('./DHS_Index_and_Vocabulary_metadata.tsv').iloc[:-1]
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display(DHS_Index_and_Vocabulary_metadata)

Unnamed: 0,library order,Biosample name,Vocabulary representative,DCC Experiment ID,DCC Library ID,DCC Biosample ID,DCC File ID,Altius Aggregation ID,Altius Library ID,Altius Biosample ID,Replicate indicators,System,Subsystem,Organ,Biosample type,Biological state,Germ layer,Description,Growth stage,Age,Sex,Ethnicity,Donor ID,Unique cellular condition,Used in Figure 1b,Biosample protocol,Experiment protocol,Library kit method,Library cleanup,DNaseI units/mL,Amount Nucleic Acid (ng),Nuclei count,Protease inhibitor,Library sequencing date,Reads used,DCC SPOT score,Per-biosample peaks,DHSs in Index
0,1.0,GM06990,,ENCSR000EMQ,ENCLB435ZZZ,ENCBS057ENC,ENCFF983CTQ,AG5636,LN1203,DS7748,DS7784,Hematopoietic,Lymphoid,Blood,Lines,Immortalized,Mesoderm,Lymphoblastoid,Adult,41Y,F,,,0,,Biosample protocol,Experiment protocol,,Sucrose,,50,,,2009-02-23,142681590.0,0.6790,83639.0,82918.0
1,2.0,HepG2,,ENCSR000ENP,ENCLB480ZZZ,ENCBS114ENC,ENCFF419JVG,AG5635,LN1207,DS7764,DS7768,Hepatic,,Liver,Cancer,Cancer,Endoderm,hepatocellular carcinoma,Child,15Y,M,,,1,,Biosample protocol,Experiment protocol,,Sucrose,,50,,,2009-02-23,138826342.0,0.5858,89748.0,89235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,732.0,fPlacenta,,ENCSR552RKI,ENCLB423VBC,ENCBS565KNL,ENCFF084UVH,AG8805,LN45072C,DS37386C,,Fetal Life Support,,Placenta,Primary,Primary,Endoderm,placenta,Fetal,102D,U,,H26550,1,,Biosample protocol,Experiment protocol,Thruplex DNA-Seq Q,,,1.325,1050000.0,A+Sucrose,,203699532.0,0.3869,107611.0,106022.0
732,733.0,fPlacenta,Placental / trophoblast,ENCSR552XJI,ENCLB711ZZZ,ENCBS723HLT,ENCFF593AWN,AG7450,LN45076C,DS37716C,,Fetal Life Support,,Placenta,Primary,Primary,Endoderm,Placenta,Fetal,56D,U,,H26598,1,,Biosample protocol,Experiment protocol,Thruplex DNA-Seq Q,,,0.972,1380000.0,A+Sucrose,,206456483.0,0.4356,115898.0,114344.0


In [5]:
# Contains a 733 row (biosample) x 16 (component) peak presence/abscence matrix (not a binary matrix)
# Used later to map component number within metadata dataframe and find proportion for given component

# Downloading basis
#basis_array = requests.get("https://zenodo.org/record/3838751/files/2018-06-08NC16_NNDSVD_Basis.npy.gz?download=1")

#with open('2018-06-08NC16_NNDSVD_Basis.npy.gz', 'wb') as f:
#    f.write(basis_array.content)

!gzip -d 2018-06-08NC16_NNDSVD_Basis.npy.gz

# Converting npy file to csv
basis_array = np.load('2018-06-08NC16_NNDSVD_Basis.npy')
np.savetxt("2018-06-08NC16_NNDSVD_Basis.csv", basis_array, delimiter=",")

# Creating nmf_loadings matrix from csv
nmf_loadings = pd.read_csv('2018-06-08NC16_NNDSVD_Basis.csv', header=None)
nmf_loadings.columns = ['C' + str(i) for i in range(1, 17)]


# Joining metadata with component presence matrix
DHS_Index_and_Vocabulary_metadata = pd.concat([DHS_Index_and_Vocabulary_metadata, nmf_loadings], axis=1)


gzip: 2018-06-08NC16_NNDSVD_Basis.npy.gz: No such file or directory


In [6]:
COMPONENT_COLUMNS = [
    'C1',
    'C2',
    'C3',
    'C4',
    'C5',
    'C6',
    'C7',
    'C8',
    'C9',
    'C10',
    'C11',
    'C12',
    'C13',
    'C14',
    'C15',
    'C16',
]

DHS_Index_and_Vocabulary_metadata['component'] = (
    DHS_Index_and_Vocabulary_metadata[COMPONENT_COLUMNS].idxmax(axis=1).apply(lambda x: int(x[1:]))
)

In [7]:
DHS_Index_and_Vocabulary_metadata

Unnamed: 0,library order,Biosample name,Vocabulary representative,DCC Experiment ID,DCC Library ID,DCC Biosample ID,DCC File ID,Altius Aggregation ID,Altius Library ID,Altius Biosample ID,...,C8,C9,C10,C11,C12,C13,C14,C15,C16,component
0,1.0,GM06990,,ENCSR000EMQ,ENCLB435ZZZ,ENCBS057ENC,ENCFF983CTQ,AG5636,LN1203,DS7748,...,0.000000,0.000000,0.102685,0.000000,0.00000,0.026774,0.000000,0.000000,0.000000,5
1,2.0,HepG2,,ENCSR000ENP,ENCLB480ZZZ,ENCBS114ENC,ENCFF419JVG,AG5635,LN1207,DS7764,...,0.000000,0.074557,0.095928,0.000000,0.00000,3.190564,0.416094,0.000000,0.000000,13
2,3.0,hTH1,,ENCSR000EQC,ENCLB591ZZZ,ENCBS345AAA,ENCFF575KOF,AG5634,LN1222,DS7840,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,5
3,4.0,Hela,,ENCSR000ENO,ENCLB479ZZZ,ENCBS890POO,ENCFF503PAE,AG4219,LN1264,DS8200,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.407768,0.113676,0.000000,2.420549,16
4,5.0,CACO2,,ENCSR000EMI,ENCLB422ZZZ,ENCBS391ENC,ENCFF977BRD,AG4218,LN1269,DS8235,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.936955,0.000000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,729.0,fUmbilical_cord,,ENCSR512CWR,ENCLB771UER,ENCBS518LEK,ENCFF267RUD,AG7441,LN45036A,DS24820A,...,0.026686,1.119037,1.127675,0.374285,0.98447,0.439221,0.000000,0.000000,0.000000,10
729,730.0,fBone_femur,Musculoskeletal,ENCSR805XIF,ENCLB236BWV,ENCBS337FPV,ENCFF604WIO,AG7442,LN45038B,DS36206B,...,0.122105,0.632395,1.053348,0.000000,4.37714,0.045913,0.000000,0.176015,0.000000,12
730,731.0,fLiver,,ENCSR562FNN,ENCLB638FEH,ENCBS275VNY,ENCFF795ZXN,AG7443,LN45070C,DS37372C,...,0.000000,0.298063,0.148477,0.000000,0.00000,0.698113,0.000000,0.191680,0.000000,1
731,732.0,fPlacenta,,ENCSR552RKI,ENCLB423VBC,ENCBS565KNL,ENCFF084UVH,AG8805,LN45072C,DS37386C,...,0.000000,0.000000,0.154762,0.000000,0.00000,0.221821,0.000000,0.000000,0.000000,7


In [8]:
# File loaded from drive available from below link
#mixture_array = requests.get("https://zenodo.org/record/3838751/files/2018-06-08NC16_NNDSVD_Mixture.npy.gz?download=1")

# Downloading mixture array that contains 3.5M x 16 matrix of peak presence/absence decomposed into 16 components
#with open('2018-06-08NC16_NNDSVD_Mixture.npy.gz', 'wb') as f:
#    f.write(mixture_array.content)
!gzip -d 2018-06-08NC16_NNDSVD_Mixture.npy.gz

# Turning npy file into csv
mixture_array = np.load('2018-06-08NC16_NNDSVD_Mixture.npy').T
np.savetxt("2018-06-08NC16_NNDSVD_Mixture.csv", mixture_array, delimiter=",")

# Creating nmf_loadings matrix from csv and renaming columns
nmf_loadings = pd.read_csv('2018-06-08NC16_NNDSVD_Mixture.csv', header=None, names=COMPONENT_COLUMNS)
#

gzip: 2018-06-08NC16_NNDSVD_Mixture.npy.gz: No such file or directory


In [9]:
# Loading in DHS_Index_and_Vocabulary_metadata that contains the following information:
# seqname, start, end, identifier, mean_signal, numsaples, summit, core_start, core_end, component
!gunzip -d DHS_Index_and_Vocabulary_hg38_WM20190703.txt.gz

# Loading sequence metadata
sequence_metadata = pd.read_table('./DHS_Index_and_Vocabulary_hg38_WM20190703.txt', sep='\t')

# Dropping component column that contains associated tissue rather than component number (We will use the component number from DHS_Index_and_Vocabulary_metadata)
sequence_metadata = sequence_metadata.drop(columns=['component'], axis=1)

# Join metadata with component presence matrix
df = pd.concat([sequence_metadata, nmf_loadings], axis=1, sort=False)

gzip: DHS_Index_and_Vocabulary_hg38_WM20190703.txt.gz: No such file or directory


  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
# Functions used to create sequence column
def sequence_bounds(summit: int, start: int, end: int, length: int):
    """Calculate the sequence coordinates (bounds) for a given DHS.
    https://github.com/meuleman/SynthSeqs/blob/main/make_data/process.py
    """
    half = length // 2

    if (summit - start) < half:
        return start, start + length
    elif (end - summit) < half:
        return end - length, end

    return summit - half, summit + half


def add_sequence_column(df: pd.DataFrame, genome, length: int):
    """
    Query the reference genome for each DHS and add the raw sequences
    to the dataframe.
    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of DHS annotations and NMF loadings.
    genome : ReferenceGenome(DataSource)
        A reference genome object to query for sequences.
    length : int
        Length of a DHS.

    https://github.com/meuleman/SynthSeqs/blob/main/make_data/process.py
    """
    seqs = []
    for rowi, row in df.iterrows():
        l, r = sequence_bounds(row['summit'], row['start'], row['end'], length)
        seq = genome.sequence(row['seqname'], l, r)

        seqs.append(seq)

    df['sequence'] = seqs
    return df


# Recreating some of the columns from our original dataset
df['component'] = df[COMPONENT_COLUMNS].idxmax(axis=1).apply(lambda x: int(x[1:]))
df['proportion'] = df[COMPONENT_COLUMNS].max(axis=1) / df[COMPONENT_COLUMNS].sum(axis=1)
df['total_signal'] = df['mean_signal'] * df['numsamples']
df['proportion'] = df[COMPONENT_COLUMNS].max(axis=1) / df[COMPONENT_COLUMNS].sum(axis=1)
df['dhs_id'] = df[['seqname', 'start', 'end', 'summit']].apply(lambda x: '_'.join(map(str, x)), axis=1)
df['DHS_width'] = df['end'] - df['start']

# Creating sequence column
df = add_sequence_column(df, genome, 200)

# Changing seqname column to chr
df = df.rename(columns={'seqname': 'chr'})

# Reordering and unselecting columns
df = df[
    [
        'dhs_id',
        'chr',
        'start',
        'end',
        'DHS_width',
        'summit',
        'numsamples',
        'total_signal',
        'component',
        'proportion',
        'sequence',
        'C1',
        'C2',
        'C3',
        'C4',
        'C5',
        'C6',
        'C7',
        'C8',
        'C9',
        'C10',
        'C11',
        'C12',
        'C13',
        'C14',
        'C15',
        'C16',
    ]
]
df


Unnamed: 0,dhs_id,chr,start,end,DHS_width,summit,numsamples,total_signal,component,proportion,...,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16
0,chr1_16140_16200_16170,chr1,16140,16200,60,16170,1,0.129388,1,0.855153,...,0.000040,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000
1,chr1_51868_52040_51970,chr1,51868,52040,172,51970,1,0.080034,7,0.973545,...,0.011431,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000
2,chr1_57280_57354_57350,chr1,57280,57354,74,57350,4,1.093002,8,1.000000,...,0.000000,0.025745,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000
3,chr1_66370_66482_66430,chr1,66370,66482,112,66430,8,1.469725,3,0.332213,...,0.002904,0.001445,0.000000,0.000000,0.000000,0.0,0.003885,0.0,0.00000,0.000000
4,chr1_79100_79231_79150,chr1,79100,79231,131,79150,2,0.226098,7,0.501840,...,0.006965,0.000000,0.000208,0.001768,0.003912,0.0,0.000000,0.0,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591893,chrY_56882540_56882719_56882610,chrY,56882540,56882719,179,56882610,1,0.038079,5,0.803229,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000
3591894,chrY_56882864_56882980_56882930,chrY,56882864,56882980,116,56882930,1,0.115489,5,0.742349,...,0.000000,0.000000,0.000249,0.000000,0.000000,0.0,0.000035,0.0,0.00018,0.000146
3591895,chrY_56883733_56883960_56883830,chrY,56883733,56883960,227,56883830,5,2.456885,7,0.559734,...,0.017065,0.000000,0.000000,0.000177,0.000000,0.0,0.000000,0.0,0.00000,0.012509
3591896,chrY_56884440_56884580_56884510,chrY,56884440,56884580,140,56884510,1,0.053759,5,0.803229,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000


In [11]:
df = df.iloc[:500000,:]

In [12]:
import pandas as pd
import gc
gc.collect()

42

In [13]:
s = 0
for chunk in pd.read_table('/home/ali/Desktop/genome/dat_bin_FDR01_hg38.txt', header=None, chunksize=100000):
    gc.collect()
    
    if s == 0:
        x = chunk.astype('int16')
    else:
        x = pd.concat([x, chunk.astype('int16')], axis=0)
        print(x.shape)
        if(x.shape[0] == 500000):
            break
    s = s+1
    

(200000, 733)
(300000, 733)
(400000, 733)
(500000, 733)


In [14]:
celltype_encodeID = [
    row['Biosample name'] + "_" + row['DCC Library ID'] for _, row in DHS_Index_and_Vocabulary_metadata.iterrows()
]

# Renaming columns using celltype_encodeID list
x.columns = celltype_encodeID

In [15]:
x.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [16]:
cancer_dataset = pd.concat([df, x], axis=1, sort=False)

In [17]:
cancer_dataset

Unnamed: 0,dhs_id,chr,start,end,DHS_width,summit,numsamples,total_signal,component,proportion,...,fKidney_ENCLB005SRL,fKidney_ENCLB704GMQ,fKidney_ENCLB759USM,fLung_ENCLB594BSZ,fKidney_ENCLB049MNH,fUmbilical_cord_ENCLB771UER,fBone_femur_ENCLB236BWV,fLiver_ENCLB638FEH,fPlacenta_ENCLB423VBC,fPlacenta_ENCLB711ZZZ
0,chr1_16140_16200_16170,chr1,16140,16200,60,16170,1,0.129388,1,0.855153,...,0,0,0,0,0,0,0,0,0,0
1,chr1_51868_52040_51970,chr1,51868,52040,172,51970,1,0.080034,7,0.973545,...,0,0,0,0,0,0,0,0,0,0
2,chr1_57280_57354_57350,chr1,57280,57354,74,57350,4,1.093002,8,1.000000,...,0,0,0,0,0,0,0,0,0,0
3,chr1_66370_66482_66430,chr1,66370,66482,112,66430,8,1.469725,3,0.332213,...,0,0,0,0,0,0,0,0,0,0
4,chr1_79100_79231_79150,chr1,79100,79231,131,79150,2,0.226098,7,0.501840,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,chr11_8252120_8252520_8252370,chr11,8252120,8252520,400,8252370,5,0.835269,3,0.473127,...,1,0,0,0,0,0,0,0,0,0
499996,chr11_8252446_8252629_8252550,chr11,8252446,8252629,183,8252550,34,19.184663,6,0.473461,...,0,0,0,1,1,0,0,0,0,1
499997,chr11_8252520_8252840_8252710,chr11,8252520,8252840,320,8252710,11,1.538787,3,0.436661,...,1,0,0,1,1,0,1,0,0,0
499998,chr11_8252960_8253200_8253070,chr11,8252960,8253200,240,8253070,13,1.753646,3,0.526524,...,1,0,0,1,1,0,0,0,0,0


In [18]:
cancer_dataset.to_feather('cancer_dataset.ftr')

In [19]:
cancer_dataset = pd.read_feather('./cancer_dataset.ftr')


In [21]:
DHS_Index_and_Vocabulary_metadata['Biological state'].unique()

array(['Immortalized', 'Cancer', 'Primary', 'ESC', 'Stem Cells', 'iPSC'],
      dtype=object)

In [22]:
system_dict = {}
x = 0
for system in ['Primary', 'Cancer']:#get biosample belonging to cancer and primary
    system_dict[system] = []
for _, row in DHS_Index_and_Vocabulary_metadata.iterrows():
    if row['Biological state'] in ['Primary', 'Cancer']:
        system_dict[row['Biological state']].append(row['Biosample name'] + "_" + row['DCC Library ID'])
for key in system_dict.keys():
    print(key, len(system_dict[key]))
    x += len(system_dict[key])
print(x)

Primary 506
Cancer 121
627


In [25]:
for key in system_dict.keys():#obtain dhss belonging to cancer and primary based on biosamples code
    tmp = system_dict[key]
    cancer_dataset[key] = cancer_dataset[tmp].sum(axis=1)
    

['hTH1_ENCLB591ZZZ', 'HMEC_ENCLB493ZZZ', 'HUVEC_ENCLB533ZZZ', 'SAEC_ENCLB581ZZZ', 'SAEC_ENCLB582ZZZ', 'HRE_ENCLB523ZZZ', 'HRE_ENCLB524ZZZ', 'HRCE_ENCLB521ZZZ', 'HRCE_ENCLB522ZZZ', 'hTH17_ENCLB597ZZZ', 'NHEK_ENCLB567ZZZ', 'NHEK_ENCLB568ZZZ', 'HConF_ENCLB470ZZZ', 'HConF_ENCLB471ZZZ', 'HGF_ENCLB486ZZZ', 'HGF_ENCLB487ZZZ', 'fBrain_ENCLB217VMW', 'fBrain_ENCLB833XAU', 'NHDF_Neo_ENCLB565ZZZ', 'NHDF_Neo_ENCLB566ZZZ', 'NHBE_RA_ENCLB561ZZZ', 'NHBE_RA_ENCLB562ZZZ', 'PrEC_ENCLB575ZZZ', 'PrEC_ENCLB576ZZZ', 'AG04450_ENCLB404ZZZ', 'AG04450_ENCLB405ZZZ', 'AG09319_ENCLB408ZZZ', 'AG09319_ENCLB409ZZZ', 'AG04449_ENCLB402ZZZ', 'AG04449_ENCLB403ZZZ', 'AG09309_ENCLB406ZZZ', 'AG09309_ENCLB407ZZZ', 'AG10803_ENCLB410ZZZ', 'AG10803_ENCLB411ZZZ', 'HCPEpiC_ENCLB472ZZZ', 'HCPEpiC_ENCLB473ZZZ', 'HNPCEpiC_ENCLB512ZZZ', 'HNPCEpiC_ENCLB513ZZZ', 'HCF_ENCLB464ZZZ', 'HCF_ENCLB465ZZZ', 'fAdrenal_ENCLB551YSO', 'fHeart_ENCLB224ZRL', 'HRPEpiC_ENCLB527ZZZ', 'HRPEpiC_ENCLB528ZZZ', 'HCM_ENCLB468ZZZ', 'HCM_ENCLB469ZZZ', 'fKidney_

In [39]:
tmp1 = (cancer_dataset[['Primary']]>0).astype(int).reset_index(drop=True)

In [42]:
tmp2 = (cancer_dataset[['Cancer']]>0).astype(int).reset_index(drop=True)

In [50]:
tmp = pd.concat([tmp1, tmp2],axis=1)

In [58]:
cancer_dataset = cancer_dataset[(tmp['Primary'] + tmp['Cancer'])==1]#select those only belonging to cancer or primary

In [59]:
cancer_dataset['Primary'] = (cancer_dataset['Primary']>0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [60]:
cancer_dataset['Cancer'] = (cancer_dataset['Cancer']>0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [61]:
cancer_dataset['Primary'].sum()

203243

In [62]:
cancer_dataset['Cancer'].sum()

71886

In [63]:
cancer_dataset

Unnamed: 0,dhs_id,chr,start,end,DHS_width,summit,numsamples,total_signal,component,proportion,...,fKidney_ENCLB759USM,fLung_ENCLB594BSZ,fKidney_ENCLB049MNH,fUmbilical_cord_ENCLB771UER,fBone_femur_ENCLB236BWV,fLiver_ENCLB638FEH,fPlacenta_ENCLB423VBC,fPlacenta_ENCLB711ZZZ,Primary,Cancer
0,chr1_16140_16200_16170,chr1,16140,16200,60,16170,1,0.129388,1,0.855153,...,0,0,0,0,0,0,0,0,1,0
1,chr1_51868_52040_51970,chr1,51868,52040,172,51970,1,0.080034,7,0.973545,...,0,0,0,0,0,0,0,0,1,0
2,chr1_57280_57354_57350,chr1,57280,57354,74,57350,4,1.093002,8,1.000000,...,0,0,0,0,0,0,0,0,1,0
4,chr1_79100_79231_79150,chr1,79100,79231,131,79150,2,0.226098,7,0.501840,...,0,1,0,0,0,0,0,0,1,0
5,chr1_79430_79497_79431,chr1,79430,79497,67,79431,1,0.097585,13,0.869445,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499994,chr11_8251960_8252260_8252110,chr11,8251960,8252260,300,8252110,9,1.293517,3,0.702211,...,0,0,0,0,0,0,0,1,1,0
499995,chr11_8252120_8252520_8252370,chr11,8252120,8252520,400,8252370,5,0.835269,3,0.473127,...,0,0,0,0,0,0,0,0,1,0
499997,chr11_8252520_8252840_8252710,chr11,8252520,8252840,320,8252710,11,1.538787,3,0.436661,...,0,1,1,0,1,0,0,0,1,0
499998,chr11_8252960_8253200_8253070,chr11,8252960,8253200,240,8253070,13,1.753646,3,0.526524,...,0,1,1,0,0,0,0,0,1,0


In [65]:
cancer_dataset.reset_index(drop=True, inplace=True)

In [66]:
cancer_dataset

Unnamed: 0,dhs_id,chr,start,end,DHS_width,summit,numsamples,total_signal,component,proportion,...,fKidney_ENCLB759USM,fLung_ENCLB594BSZ,fKidney_ENCLB049MNH,fUmbilical_cord_ENCLB771UER,fBone_femur_ENCLB236BWV,fLiver_ENCLB638FEH,fPlacenta_ENCLB423VBC,fPlacenta_ENCLB711ZZZ,Primary,Cancer
0,chr1_16140_16200_16170,chr1,16140,16200,60,16170,1,0.129388,1,0.855153,...,0,0,0,0,0,0,0,0,1,0
1,chr1_51868_52040_51970,chr1,51868,52040,172,51970,1,0.080034,7,0.973545,...,0,0,0,0,0,0,0,0,1,0
2,chr1_57280_57354_57350,chr1,57280,57354,74,57350,4,1.093002,8,1.000000,...,0,0,0,0,0,0,0,0,1,0
3,chr1_79100_79231_79150,chr1,79100,79231,131,79150,2,0.226098,7,0.501840,...,0,1,0,0,0,0,0,0,1,0
4,chr1_79430_79497_79431,chr1,79430,79497,67,79431,1,0.097585,13,0.869445,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275124,chr11_8251960_8252260_8252110,chr11,8251960,8252260,300,8252110,9,1.293517,3,0.702211,...,0,0,0,0,0,0,0,1,1,0
275125,chr11_8252120_8252520_8252370,chr11,8252120,8252520,400,8252370,5,0.835269,3,0.473127,...,0,0,0,0,0,0,0,0,1,0
275126,chr11_8252520_8252840_8252710,chr11,8252520,8252840,320,8252710,11,1.538787,3,0.436661,...,0,1,1,0,1,0,0,0,1,0
275127,chr11_8252960_8253200_8253070,chr11,8252960,8253200,240,8253070,13,1.753646,3,0.526524,...,0,1,1,0,0,0,0,0,1,0


In [67]:
cancer_dataset.to_feather('cancer_cleaned.ftr')

In [68]:
cancer_dataset = pd.read_feather('cancer_cleaned.ftr')

In [71]:
cancer_dataset['sequence']

0         CGGGCATCCTGTGTGCAGATACTCCCTGCTTCCTCTCTAGCCCCCA...
1         GGCGACCCAGCGAGACTCCGCCTCAAAAAAAAAAAAAGAAGATTGA...
2         CTCAGTCATTCCGAACAATTCACACACTAAGATTACCCATGCTAAA...
3         CATTTCTCCAAGGAGGAAATACCAGAGTCAATTCACAACCACTGCA...
4         AGTTCTATCCATGCTGTCCTCAGGCTTGGAAAGAAACAAAGCGCCT...
                                ...                        
275124    TGTGTGAGTGTGTGTGTGAGCATGTGTGTGTGTCCACACATATTCA...
275125    GTTGTGAAGATGAAATGAAAGACTCCGAGACATCTGTGCCCTTCCC...
275126    CTTTCAGGAAGCAGGCAGCCACGTTGGAGAGGCCCATACGGCATGG...
275127    GAGAAGGGTCCTCTCCTCAGGCCATTGAGAGGGGGCCCTGCAGACA...
275128    ACCCAGGCCCCCTGCCCTCAGCTGGGCAGACGAGGAAAGTGGGAGA...
Name: sequence, Length: 275129, dtype: object