# PROCESS KIDNEY_DATA

Read in csv file and create anndata object which are then saved to disk

### Common Imports

In [2]:
import sys
import os

root_path = os.path.dirname(os.getcwd())
src_path = os.path.join(root_path, "src")
sys.path.insert(0, src_path)

%load_ext autoreload
%autoreload 2

# Read csv files into anndata objects

In [3]:
from tissue_purifier.io_utils.read import anndata_from_expression_csv
import pandas as pd

In [4]:
root_data_dir = '/home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed'

sub_dir_list = []
for file in os.listdir(root_data_dir):
    path = os.path.join(root_data_dir, file)
    if os.path.isdir(path):
        sub_dir_list.append(file)
        
print(sub_dir_list)

['Puck_200210_02', 'Puck_200104_07', 'Puck_191223_10', 'Puck_200115_02', 'Puck_200115_04', 'Puck_191109_18', 'Puck_200127_08', 'Puck_191223_21', 'Puck_200115_14', 'Puck_191204_22', 'Puck_191223_23', 'Puck_191109_09', 'Puck_200113_10', 'Puck_200104_18', 'Puck_200115_17', 'Puck_191223_11', 'Puck_191223_22', 'Puck_191109_14', 'Puck_200127_05', 'Puck_200115_01', 'Puck_191204_16', 'Puck_200115_11', 'Puck_191204_14', 'Puck_191109_20', 'Puck_191223_17', 'Puck_191206_02', 'Puck_200115_07', 'Puck_200131_20', 'Puck_200115_16', 'Puck_191223_09', 'Puck_200104_10', 'Puck_200104_15', 'Puck_191206_01', 'Puck_191204_13', 'Puck_200121_03', 'Puck_191204_15', 'Puck_191223_02', 'Puck_200104_05', 'Puck_200104_09', 'Puck_191204_23', 'Puck_200113_11', 'Puck_191206_04', 'Puck_191223_12', 'Puck_200131_25', 'Puck_191223_18', 'Puck_200131_22', 'Puck_191223_03', 'Puck_200131_23', 'Puck_200115_18', 'Puck_191204_20', 'Puck_191223_13', 'Puck_200127_10', 'Puck_191204_12', 'Puck_191204_05', 'Puck_191204_03', 'Puck_200

In [None]:
anndata_fnames = []
anndata_fnames_failed = []
n_rows = None

for my_dir in sub_dir_list:
    
    path = os.path.join(root_data_dir, my_dir)
    files = os.listdir(path)
    
    expression_file, location_file = None, None
    for file in files:
        if file.startswith("MappedDGE"):
            expression_file = os.path.join(root_data_dir, my_dir, file)
        elif file.startswith("BeadLocations"):
            location_file = os.path.join(root_data_dir, my_dir, file)
    filename = os.path.join(root_data_dir, "./anndata_"+my_dir+".h5ad")
        
    print("---")
    print("---")
    print("---")
    print("---")
    print(my_dir)
    print("location_file --->", location_file)
    print("expression_file ->", expression_file)
    print("filename ->", filename, os.path.exists(filename))
    
    if os.path.exists(filename):
        continue
    else:
        print("working on", filename)
        try:
            ## create the anndata object
            print("creating anndata object with the counts")
            anndata = anndata_from_expression_csv(expression_file, key='gene', transpose=True, top_n_rows=n_rows)
            
            ## get the metadata and add to the obs dataframe
            metadata_df = pd.read_csv(location_file, usecols=["barcodes", "xcoord", "ycoord", "cell_type"]).set_index("barcodes")
            
            # add the metadata into the anndata.obs padaframe. Note that I do not change the order of the entries.
            anndata.obs = anndata.obs.join(metadata_df)

            # save the anndata object to disk 
            filename = os.path.join(root_data_dir, "./anndata_"+my_dir+".h5ad")
            print("saving anndata to file", filename)
            anndata.write(filename=filename, compression=None)
            anndata_fnames.append(filename)
        except:
            anndata_fnames_failed.append(my_dir)

---
---
---
---
Puck_200210_02
location_file ---> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/Puck_200210_02/BeadLocationsForR.csv
expression_file -> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/Puck_200210_02/MappedDGEForR.csv
filename -> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/./anndata_Puck_200210_02.h5ad False
working on /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/./anndata_Puck_200210_02.h5ad
creating anndata object with the counts


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical


saving anndata to file /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/./anndata_Puck_200210_02.h5ad
---
---
---
---
Puck_200104_07
location_file ---> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/Puck_200104_07/BeadLocationsForR.csv
expression_file -> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/Puck_200104_07/MappedDGEForR.csv
filename -> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/./anndata_Puck_200104_07.h5ad False
working on /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/./anndata_Puck_200104_07.h5ad
creating anndata object with the counts
---
---
---
---
Puck_191223_10
location_file ---> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/Puck_191223_10/BeadLocationsForR.csv
expression_file -> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/Puck_191223_10/MappedDGEForR.csv
filename -> /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/./anndata_Puck_191223_10.h5ad False
working on /home/jupyter/REPOS/ML_for_slideseq/DATA/kidney/QCed/./an