# PROCESS TESTIS_DATA

Read in csv file and create anndata object which are then saved to disk

### Common Imports

In [2]:
import sys
import os

root_path = os.path.dirname(os.getcwd())
src_path = os.path.join(root_path, "src")
sys.path.insert(0, src_path)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read csv files into anndata objects

In [6]:
from tissue_purifier.io_utils.read import anndata_from_expression_csv
import pandas as pd

In [57]:
root_data_dir = '/Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED'

dir_tmp = os.listdir(root_data_dir)
anndata_fnames = []
n_rows = None

for my_dir in dir_tmp:
    
    path = os.path.join(root_data_dir, my_dir)
    files = os.listdir(path)
       
    for file in files:
        if file.startswith("sick_"):
            cell_type_file = os.path.join(root_data_dir, my_dir, file)
        elif file.startswith("wt_"):
            cell_type_file = os.path.join(root_data_dir, my_dir, file)
        elif file.startswith("Mapped"):
            expression_file = os.path.join(root_data_dir, my_dir, file)
        elif file.startswith("BeadLocation"):
            location_file = os.path.join(root_data_dir, my_dir, file)
    
    print("---")
    print("---")
    print("---")
    print("---")
    print(my_dir)
    print("location_file --->", location_file)
    print("cell_type_file -->", cell_type_file)
    print("expression_file ->", expression_file)
    
    # create the anndata object
    print("creating anndata object with the counts")
    anndata = anndata_from_expression_csv(expression_file, observation_key='barcode', top_n_rows=n_rows)
    
    # get the metadata and add to the obs dataframe
    locations_df = pd.read_csv(location_file, usecols=["barcode", "x", "y"])
    cell_types_df = pd.read_csv(cell_type_file, usecols=["barcode", "cell_type"])
    metadata_df = locations_df.merge(cell_types_df).set_index("barcode")
    
    # add the metadata into the anndata.obs padaframe. Note that I do not change the order of the entries.
    anndata.obs = anndata.obs.join(metadata_df)

    # save the anndata object to disk
    filename = "./anndata_"+my_dir+".h5ad"
    print("saving anndata to file", filename)
    anndata.write(filename=filename, compression=None)
    anndata_fnames.append(filename)

---
---
---
---
sick1
location_file ---> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick1/BeadLocationsForR_T4_Trimmed.csv
cell_type_file --> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick1/sick_1.csv
expression_file -> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick1/MappedDGEForR_T4_Trimmed.csv
creating anndata object with the counts


... storing 'cell_type' as categorical


saving anndata to file ./anndata_sick1.h5ad
---
---
---
---
wt2
location_file ---> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt2/BeadLocationsForR_Puck24_Trimmed_cleaned.csv
cell_type_file --> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt2/wt_2.csv
expression_file -> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt2/MappedDGEForR_Puck24_Trimmed.csv
creating anndata object with the counts


... storing 'cell_type' as categorical


saving anndata to file ./anndata_wt2.h5ad
---
---
---
---
wt3
location_file ---> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt3/BeadLocationsForR_Normal_Puck7_Trimmed.csv
cell_type_file --> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt3/wt_3.csv
expression_file -> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt3/MappedDGEForR_Normal_Puck7_Trimmed.csv
creating anndata object with the counts


... storing 'cell_type' as categorical


saving anndata to file ./anndata_wt3.h5ad
---
---
---
---
sick2
location_file ---> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick2/BeadLocationsForR_Diabetes_Puck10_Trimmed.csv
cell_type_file --> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick2/sick_2.csv
expression_file -> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick2/MappedDGEForR_Diabetes_Puck10_Trimmed.csv
creating anndata object with the counts


... storing 'cell_type' as categorical


saving anndata to file ./anndata_sick2.h5ad
---
---
---
---
sick3
location_file ---> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick3/BeadLocationsForR_Diabetes_Puck11_Trimmed.csv
cell_type_file --> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick3/sick_3.csv
expression_file -> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/sick3/MappedDGEForR_Diabetes_Puck11_Trimmed.csv
creating anndata object with the counts


... storing 'cell_type' as categorical


saving anndata to file ./anndata_sick3.h5ad
---
---
---
---
wt1
location_file ---> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt1/BeadLocationsForR_T3_Trimmed.csv
cell_type_file --> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt1/wt_1.csv
expression_file -> /Users/ldalessi/REPOS/ML_for_slideseq/TESTIS_data/PROCESSED/wt1/MappedDGEForR_T3_Trimmed.csv
creating anndata object with the counts


... storing 'cell_type' as categorical


saving anndata to file ./anndata_wt1.h5ad
