In [1]:
from scipy.io import mmread
import os
import pandas as pd
import numpy as np
 

## Load scATAC-seq peak count data

In [2]:
# import scATAC-seq peak count matrix
# NOTE: the original file is a .txt file in the MatrixMarket format
# I just changed the .txt extension to .mtx 
os.chdir('C:/Users/lexie/Documents/SMU/super-test/paper-code/scGCN_tensorflow2/A549/')
atac = mmread('GSM3271041_ATAC_sciCAR_A549_peak_countMAT.mtx')
atac

<189603x6085 sparse matrix of type '<class 'numpy.float64'>'
	with 1950459 stored elements in COOrdinate format>

In [3]:
# convert ATAC-seq sparse matrix to dense matrix
atac = atac.todense()

In [4]:
# check dimensions of ATAC-seq peak count matrix
atac.shape
type(atac)

numpy.matrix

## Load scATAC-seq peak information and cell labels

In [5]:
# import ATAC-seq peaks
# NOTE: original .txt files were imported and formatted in excel, then saved as csv files
# it was easier to separate columns by delimiters in excel than python
ATACpeak_lab = pd.read_csv('ATAC_peaks.csv')

# import ATAC-seq cell labels
ATACcell_lab = pd.read_csv('ATAC_cells.csv')
display(ATACcell_lab)

Unnamed: 0,sample,source,cell_name,treatment_time,experiment
0,sci-RNA-A-071.AGGTCTATGG,Mouse,293T,999,co_assay
1,sci-RNA-A-071.GCGGAGTCGA,Human,A549,3,co_assay
2,sci-RNA-A-071.TTGCAGCATT,Human,A549,1,co_assay
3,sci-RNA-A-071.GCCTGATATA,Mouse,293T,999,co_assay
4,sci-RNA-A-071.GCGGCCAATC,Human,A549,3,co_assay
...,...,...,...,...,...
6080,sci-RNA-E-022.GAGGCTTATT,Human,293T,999,co_assay
6081,sci-RNA-E-022.TCTGACGAGG,Mouse,293T,999,co_assay
6082,sci-RNA-E-022.GCCTCAGCAT,Human,293T,999,co_assay
6083,sci-RNA-E-022.TCGGCGTCGT,Mouse,293T,999,co_assay


## Filter out non-A549 cells in scATAC-seq data

In [6]:
# get the index of the cells that are A549
include = ATACcell_lab.index[[ATACcell_lab['cell_name'] == 'A549']].tolist()

# only keep the column labels corresponding to A549 cells
ATACcell_lab = ATACcell_lab.iloc[include]
print(ATACcell_lab.shape)


(4258, 5)


  result = getitem(key)


In [7]:
# save out reduced ATAC-seq labels; reduced version only includes A549 cells
# ATACcell_lab.to_csv('ATAC_cells_reduced.csv')

In [8]:
# only keep the column data correponding to gene counts for A549 cells
atac = atac[:,include]
print(atac.shape)
type(atac)

MemoryError: Unable to allocate 6.02 GiB for an array with shape (4258, 189603) and data type float64

##  Annotate scATAC-seq peaks with corresponding gene

In [None]:
# import annotations for the genes closest to each peak
# tsv file is stored locally in the same directory as the other A549 files
annot = pd.read_csv('peak_annot_df.tsv',sep = '\t')
display(annot)

In [None]:
# for each ATAC-seq peak recorded in the original data file, 
# find and add the name of the corresponding closest gene

matched_gene = []
for peak in range(ATACpeak_lab.shape[0]):
    thispeak = ATACpeak_lab.iloc[peak,:]
    temp_match = []
    match = []
    
    # match the start of the peak to entries in the annot file
    if any(thispeak['start']  == annot['start']):
        temp_match = annot.loc[thispeak['start']  == annot['start']]
        
        # of the potential matches for 'start', find the entry that matches 'end'
        match = temp_match[temp_match['end']  == thispeak['end']]
        # if there are multiple entries with same start and end but we just use the first of the matches
        if len(match)>1:
            match = match.iloc[[0]]
            
        # get the name of the closest gene and add it to list
        if match.empty:
            matched_gene.append('null')
        else:
            matched_gene.append(match['gene_name'].values[0]) 
        
    # if a peak does not have annot entry, add null
    else: 
        matched_gene.append('null')
            
    
ATACpeak_lab.insert(0,'gene', matched_gene)
ATACpeak_lab

# save out new data peaks file with annotations 
# ATACpeak_lab.to_csv('ATAC_peaks_data_annot.csv')
