In [32]:
import copy
import csv
from pathlib import Path

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy import sparse
# import rnanorm

In [36]:
def load_sparse_mat(filtered_filename, sample_number):
    """Load the count matrix stored in an HDF5 file as a dense 2-D array.

    Only the ``data``, ``indices``, ``indptr`` and ``shape`` datasets of the
    file's ``matrix`` group are read. Barcode and feature labels are not
    loaded because this function returns nothing but the numeric matrix.

    Parameters
    ----------
    filtered_filename : str or path-like
        HDF5 file to open in read-only mode.
    sample_number : str or int
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Dense array of shape ``(shape[1], shape[0])``, where ``shape`` is the
        two-element shape stored in the file.
    """
    with h5py.File(filtered_filename, 'r') as f:
        matrix = f['matrix']
        data = np.array(matrix['data'])
        indices = np.array(matrix['indices'])
        indptr = np.array(matrix['indptr'])
        shape = np.array(matrix['shape'])

    # The stored (data, indices, indptr) triplet is interpreted as CSR for the
    # swapped shape, so the file's second dimension becomes the row axis.
    n_rows, n_cols = int(shape[1]), int(shape[0])
    m = sparse.csr_matrix((data, indices, indptr), shape=(n_rows, n_cols))
    return m.toarray()

def load_data(samples=(151507, 151669, 151673), data_dir='../data'):
    """Assemble an AnnData object from the filtered HDF5 files of the given samples.

    Parameters
    ----------
    samples : sequence of int, optional
        Sample identifiers. A file is used when its name contains one of these
        identifiers, and the same identifiers select rows of
        ``column_metadata.csv`` through its ``sample_name`` column.
    data_dir : str or path-like, optional
        Directory holding ``column_metadata.csv``, ``row_metadata.csv`` and
        the ``*filt*.h5`` files.

    Returns
    -------
    AnnData
        ``X`` is the row-wise concatenation of the dense matrices returned by
        ``load_sparse_mat``; ``obs`` is the filtered column metadata and
        ``var`` is the row metadata.

    Raises
    ------
    ValueError
        If no ``*filt*.h5`` file in ``data_dir`` matches any sample.
    """
    data_dir = Path(data_dir)
    col_metadata = pd.read_csv(data_dir / 'column_metadata.csv', index_col=0)
    row_metadata = pd.read_csv(data_dir / 'row_metadata.csv', index_col=0)

    sample_tags = [str(sample) for sample in samples]
    # glob() yields files in filesystem order; sorting fixes the order in which
    # the per-sample matrices are stacked.
    # NOTE(review): the rows of column_metadata.csv must follow the same sample
    # order for X and obs to line up -- confirm against the metadata file.
    filtfiles = sorted(
        path for path in data_dir.glob('*filt*.h5')
        if any(tag in path.name for tag in sample_tags)
    )
    if not filtfiles:
        raise ValueError(
            f"no '*filt*.h5' file matching samples {sample_tags} found in {data_dir}"
        )

    counts = np.concatenate([
        load_sparse_mat(filtered_filename=path,
                        sample_number=path.stem.split('_')[0])
        for path in filtfiles
    ])
    obs = col_metadata.loc[col_metadata.sample_name.isin(list(samples))]
    # obs = rows      var = cols
    return sc.AnnData(X=counts, obs=obs, var=row_metadata)
adata_master = load_data()

# Keep only the genes whose summed count over all observations exceeds 100
gene_totals = adata_master.X.sum(axis=0)
high_count_genes = gene_totals > 100
adata_master = adata_master[:, high_count_genes]



In [44]:
# Normalize on an independent copy so that `adata_master` keeps its counts.
# AnnData.copy() returns a full, standalone object even when `adata_master`
# is a view produced by slicing, so normalize_total no longer has to convert
# a view into an actual object itself.
normtot = adata_master.copy()
sc.pp.normalize_total(normtot, exclude_highly_expressed=True, inplace=True)
# Swap the axes of the normalized object
# (presumably to match the genes-by-barcodes layout of dino_norm.csv -- confirm).
normtot = normtot.T

  view_to_actual(adata)


In [7]:
# Peek at the first 11 rows (first 10 fields each) of the Dino-normalized CSV.
# Forward slashes work on every platform; newline='' is what the csv module
# expects from files it reads.
with open('../data/dino_norm.csv', newline='') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row[:10])

        if i == 10:
            break

['', 'AAACAACGAATAGTTC.1', 'AAACAAGTATCTCCCA.1', 'AAACAATCTACTAGCA.1', 'AAACACCAATAACTGC.1', 'AAACAGCTTTCAGAAG.1', 'AAACAGGGTCTATATT.1', 'AAACAGTGTTCCTGGG.1', 'AAACATTTCCCGGATT.1', 'AAACCACTACACAGAT.1']
['ENSG00000237491', '0.045', '0', '0', '0', '0', '0.028', '0', '0', '0']
['ENSG00000188976', '0.275', '0.022', '0.186', '0.058', '0.064', '0.103', '0.175', '0.056', '0.094']
['ENSG00000187961', '0', '0.06', '0', '0', '0', '0.013', '0.004', '0.064', '0']
['ENSG00000188290', '0', '0.164', '0.217', '0.08', '0.086', '0.197', '0.212', '0.298', '0.133']
['ENSG00000187608', '0.8', '0.605', '0.15', '0.645', '0.255', '0.119', '0.045', '0.386', '0.137']
['ENSG00000188157', '0.226', '0.017', '0.099', '0.081', '0.2', '0.215', '0.124', '0.587', '0.083']
['ENSG00000131591', '0.002', '0.101', '0.089', '0', '0.074', '0.069', '0.015', '0', '0.069']
['ENSG00000186891', '0.057', '0.068', '0', '0.055', '0.055', '0', '0', '0.055', '0']
['ENSG00000186827', '0', '0', '0', '0', '0', '0.041', '0', '0', '0']
['E

In [8]:
# Load the Dino-normalized table; the first CSV column becomes the row index.
# Forward slashes keep the path usable on every platform.
dino = pd.read_csv('../data/dino_norm.csv', index_col=0)

In [30]:
# First column label cut to its first 16 characters (drops the trailing ".1")
dino.columns[0][:16]

'AAACAACGAATAGTTC'

In [34]:
# (number of rows, number of columns) of the table
dino.shape

(13585, 11526)