## Check whether the data are raw counts
- The max should be an integer and quite high, e.g. >5000
- The min should be 0
- Check the gene distributions
- Check the cell distributions

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


# Check gene distributions
#
# For a handful of marker genes, print the global max/min of adata.X and plot
# a kernel density estimate (KDE) of each gene's values across all cells.
# NOTE(review): `adata` is not defined in this cell; it is assumed to be an
# AnnData object created earlier in the notebook -- confirm.

genes = ['FTL', 'C1QA', 'ACTB', 'CD14', 'CD19', 'SOX2', 'MKI67', 'MAP2']  # Add all the genes in your list

# Check for missing genes
missing_genes = [gene for gene in genes if gene not in adata.var_names]

if missing_genes:
    print(f"Warning: The following genes are not present in adata.var_names: {missing_genes}")
    print('')

# Keep the genes that are present, in the order they were listed in `genes`
# (iterating over `genes`, not over adata.var_names, is what preserves it).
genes_in_var_names = [gene for gene in genes if gene in adata.var_names]

# Column position of each selected gene (first match if a name is duplicated)
gene_indices = [np.where(adata.var_names == gene)[0][0] for gene in genes_in_var_names]

# Pull the selected columns out as a dense array. Objects exposing
# `.toarray()` (scipy sparse) are densified; anything else is taken as-is,
# so a dense adata.X no longer fails on a missing `.todense()`.
gene_matrix = adata.X[:, gene_indices]
if hasattr(gene_matrix, 'toarray'):
    gene_matrix = gene_matrix.toarray()
data1 = pd.DataFrame(np.asarray(gene_matrix), columns=genes_in_var_names)
print(f'Max count: {np.max(adata.X)}')
print(f'Min count: {np.min(adata.X)}')

num_genes = len(genes_in_var_names)

if num_genes == 0:
    # Nothing to draw; a figure with zero rows would be invalid.
    print('None of the requested genes are present in adata.var_names; skipping gene plots.')
else:
    # Three plots per row; ceiling division gives the number of rows
    num_rows = (num_genes + 2) // 3

    plt.figure(figsize=(15, 5 * num_rows))

    # One KDE curve per gene, laid out on a grid
    for i, gene in enumerate(genes_in_var_names, start=1):
        plt.subplot(num_rows, 3, i)
        values = data1[gene].to_numpy()

        if values.size < 2 or values.min() == values.max():
            # gaussian_kde cannot be fitted to constant data (e.g. a gene
            # that is zero in every cell) or to fewer than two points, so
            # annotate the panel instead of crashing the whole cell.
            plt.text(0.5, 0.5, 'KDE undefined\n(constant or too few values)',
                     ha='center', va='center', transform=plt.gca().transAxes)
        else:
            kde = gaussian_kde(values)
            x_vals = np.linspace(values.min(), values.max(), 100)
            plt.plot(x_vals, kde(x_vals), label=gene)

        plt.title(gene)

    plt.suptitle('Histograms for inputted genes', fontsize=30)
    # Leave room at the top of the figure for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
    plt.close()


# Check cell distributions
#
# Pick up to nine random cells (rows of adata.X) and, for each one, plot a
# KDE of its values together with a small summary box.
# NOTE(review): `adata` is not defined in this cell; it is assumed to be an
# AnnData object created earlier in the notebook -- confirm.

# Sample without replacement; cap at the number of cells so that datasets
# with fewer than nine cells do not raise.
n_cells = adata.X.shape[0]
n_panels = min(9, n_cells)
random_cell_indices = np.random.choice(n_cells, size=n_panels, replace=False)

# Create a 3x3 grid of subplots
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
panels = list(axes.flat)

for ax, random_cell_idx in zip(panels, random_cell_indices):
    # Extract the selected row as a flat dense vector. Objects exposing
    # `.toarray()` (scipy sparse) are densified; a dense row is used as-is.
    row = adata.X[random_cell_idx, :]
    if hasattr(row, 'toarray'):
        row = row.toarray()
    raw_counts = np.asarray(row).ravel()

    # Raw counts are whole numbers, so any fractional value means the data
    # have been transformed (normalised / logged). Testing `min <= 0` cannot
    # tell the two apart, because raw and logged cells alike contain zeros.
    is_logged = not np.all(np.mod(raw_counts, 1) == 0)

    # Sum of expm1(values): undoes a log1p transform if one was applied.
    # On genuinely raw counts this overflows to inf, which is expected, so
    # the overflow warning is silenced.
    with np.errstate(over='ignore'):
        row_sum = np.expm1(raw_counts.astype(np.float64)).sum()

    if raw_counts.size < 2 or raw_counts.min() == raw_counts.max():
        # gaussian_kde cannot be fitted to constant data or to fewer than
        # two points; annotate the panel instead of raising.
        ax.text(0.5, 0.5, 'KDE undefined\n(constant or too few values)',
                ha='center', va='center', transform=ax.transAxes)
    else:
        # Plot the distribution of the cell's values as a KDE line
        kde = gaussian_kde(raw_counts)
        x_vals = np.linspace(np.min(raw_counts), np.max(raw_counts), 100)
        ax.plot(x_vals, kde(x_vals), color='blue', linestyle='-')

    ax.set_title(f'Distribution of Raw Counts for Cell Index {random_cell_idx}')
    ax.set_xlabel('Raw Counts')
    ax.set_ylabel('Density')

    # Add text box with information
    info_text = f"Cell Index: {random_cell_idx}\nTotal Counts: {np.sum(raw_counts):.5f}\nLogged: {is_logged}\nRow Sum (expm1): {row_sum:.5f}"
    ax.text(0.55, 0.95, info_text, transform=ax.transAxes, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Hide any grid positions left empty when fewer than nine cells exist
for ax in panels[n_panels:]:
    ax.set_visible(False)

# Adjust layout to prevent overlapping
plt.suptitle('Histograms for random selected cells', fontsize=30)
plt.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=2.0)
plt.show()
plt.close()