## Import packages

In [None]:
import os, sys

sys.path.append('../')

import src.h5ld as h5ld

## Set the path to the file of interest

In [None]:
# Path to the .h5ad file examined throughout this notebook; later cells pass it to h5ld.
data_dir = '/nfs/team298/ar32/repos/H5py_anndata_checker/dummy_data/test_adata.h5ad'
# Echo the path so the cell output records which file is being inspected.
print(data_dir)

## See what is in the file

In [None]:
# Ask h5ld for its report on the file at data_dir (an overview of what the
# file contains, per this section's heading).
h5ld.generate_anndata_report(data_dir)

## See unique values in columns of interest

In [None]:
# Table to inspect, and the columns in it whose categories should be listed.
dataframe = 'obs'
columns = ['anno_LVL1', 'anno_LVL2', 'biological_unit']

# Bold header (ANSI escape codes) announcing which table is being shown.
print(f"\033[1mDataFrame output to see all unique values for each column of interest in {dataframe}:\033[0m\n")

# Kept as the cell's final expression so the notebook shows whatever the call returns.
h5ld.inspect_column_categories(data_dir, dataframe, columns)

In [None]:
# Table to inspect ('obs' or 'var'), and the columns whose categories should be listed.
dataframe = 'var'
columns = ['hgnc', 'high_var']

# Bold header (ANSI escape codes) announcing which table is being shown.
print(f"\033[1mDataFrame output to see all unique values for each column of interest in {dataframe}:\033[0m\n")

# Kept as the cell's final expression so the notebook shows whatever the call returns.
h5ld.inspect_column_categories(data_dir, dataframe, columns)

## Note: Displaying Column-Level Unique Categories, Not the Entire DataFrame!

If a column originally had 5 unique categories and you removed the rows for 2 of them, leaving only 3 in the data, the column-level metadata may still list all 5 categories, and that is what h5py will report to the user.

This discrepancy can occur if you haven't updated the column metadata by removing unused categories using the following code:

```python
# Update the column-level metadata to remove unused categories
df_subset['column'] = df_subset['column'].cat.remove_unused_categories()
```

To account for this, before loading the data, check that all the unique categories of interest are actually still present in the data and haven't been sliced out.

## Make a pandas dataframe for only data of interest and selected columns from obs

In [None]:
# Subset the obs table. filter_dict maps a column name to the values of
# interest in that column.
dataframe = 'obs'
filter_dict = {'anno_LVL1': ['haematopoetic'], 'anno_LVL2': ['macrophage']}

# Columns passed alongside the filtered ones (presumably retained in the
# result without being filtered on -- confirm against h5ld).
additional_cols_keep = ['biological_unit']

# How the per-column filters are combined; 'union' is the other option used
# in this notebook.
filter_method = 'intersection'

# Bold header (ANSI escape codes) describing the subset being shown.
print(f"\033[1mDataFrame output of {dataframe} subset by columns {list(filter_dict.keys())} by {filter_method} for values of interest:\033[0m\n")

# Kept as the cell's final expression so the notebook shows whatever the call returns.
h5ld.create_dataframe_subset(data_dir, dataframe, filter_dict, additional_cols_keep, filter_method)

In [None]:
# Subset the obs table. filter_dict maps a column name to the values of
# interest in that column.
dataframe = 'obs'
filter_dict = {'anno_LVL1': ['haematopoetic'], 'anno_LVL2': ['hepatocyte']}

# Columns passed alongside the filtered ones (presumably retained in the
# result without being filtered on -- confirm against h5ld).
additional_cols_keep = ['biological_unit']

# How the per-column filters are combined; 'intersection' is the other
# option used in this notebook.
filter_method = 'union'

# Bold header (ANSI escape codes) describing the subset being shown.
print(f"\033[1mDataFrame output of {dataframe} subset by columns {list(filter_dict.keys())} by {filter_method} for values of interest:\033[0m\n")

# Kept as the cell's final expression so the notebook shows whatever the call returns.
h5ld.create_dataframe_subset(data_dir, dataframe, filter_dict, additional_cols_keep, filter_method)

## Make a pandas dataframe for only data of interest and selected columns from var

In [None]:
# Subset the var table. filter_dict maps a column name to the values of
# interest in that column.
dataframe = 'var'
filter_dict = {'high_var': ['yes']}

# No extra columns are requested for this subset.
additional_cols_keep = []

# How the per-column filters are combined; 'union' is the other option used
# in this notebook.
filter_method = 'intersection'

# Bold header (ANSI escape codes) describing the subset being shown.
print(f"\033[1mDataFrame output of {dataframe} subset by columns {list(filter_dict.keys())} by {filter_method} for values of interest:\033[0m\n")

# Kept as the cell's final expression so the notebook shows whatever the call returns.
h5ld.create_dataframe_subset(data_dir, dataframe, filter_dict, additional_cols_keep, filter_method)

## Load only a slice of the original data into memory 

In [None]:
# Keyword arguments for h5ld.create_anndata_subset, grouped by purpose.
input_settings = {
    # File to read from.
    'data_dir': data_dir,

    # obs filtering: column -> values of interest, extra columns, combine mode.
    'obs_filter_dict': {'anno_LVL1': ['haematopoetic'], 'anno_LVL2': ['hepatocyte']},
    'obs_additional_cols_keep': [],
    'obs_filter_method': 'union',

    # var filtering: column -> values of interest, extra columns, combine mode.
    'var_filter_dict': {'high_var': ['yes']},
    'var_additional_cols_keep': [],
    'var_filter_method': 'intersection',

    # Per-slot filter lists (all empty here).
    'filter_layers': [],
    'filter_obsm': [],
    'filter_obsp': [],
    'filter_varm': [],
    'filter_varp': [],
    'filter_uns': [],

    # Per-slot keep flags (all switched off here).
    'keep_layers': False,
    'keep_obsm': False,
    'keep_obsp': False,
    'keep_varm': False,
    'keep_varp': False,
    'keep_uns': False,
}

# Build the subset object from the settings above.
adata = h5ld.create_anndata_subset(**input_settings)

# Success banner followed by the notebook's rich rendering of the whole object.
print("")
print("")
print("\033[1mSubset anndata object generated successfully\033[0m\n")
print("\033[1m" + "Anndata whole preview:" + "\033[0m")
display(adata)

# Headline numbers: overall shape and the smallest/largest values stored in X.
print("")
print("")
print("\033[1mQuick view of the anndata object generated\033[0m\n")
print(f"Overall shape: {adata.shape}")
print(f"Min count: {adata.X.min()}")
print(f"Max count: {adata.X.max()}")
print("")

# Same three steps for each table: bold title, rendered table, blank spacer line.
for label, table in (("obs preview:", adata.obs), ("var preview:", adata.var)):
    print("\033[1m" + label + "\033[0m")
    display(table)
    print("")