# Analysis

## Imports

In [1]:
import sys
import os
import pandas as pd

# Resolve the project's source folder (../ncp/src) relative to the current
# working directory, which is presumably the notebook's folder — TODO confirm
# when running the notebook from elsewhere.
notebook_dir = os.path.abspath(os.getcwd())
ncp_src_path = os.path.abspath(os.path.join(notebook_dir, os.pardir, "ncp", "src"))

# Register that folder on the import path only once, so re-running this cell
# does not pile up duplicate entries.
if not any(entry == ncp_src_path for entry in sys.path):
    sys.path.append(ncp_src_path)

# Now you can import the modules
from analysis import split_data, mann_whitney_u_test, logistic_regression
from visualization import visualize_channels


# Suppressing warnings for cleaner output
import warnings

warnings.filterwarnings("ignore")

## Functions

## Load and clean data

In [2]:
data_path = "output/ncp_augmented.parquet"

# Read the parquet table and keep only the rows whose Metadata_line_source
# equals "human".
df = pd.read_parquet(data_path).query("Metadata_line_source == 'human'")

## Analyze

In [3]:
def analyze_by_category(
    df, category_col, target_col, target_col_mapping_dict, feature_cols
):
    """
    Analyze the data by category using logistic regression and Mann-Whitney U-test.

    For every distinct value of `category_col`, the matching rows are passed to
    `split_data` (with "Metadata_line_ID" as the group-split column), scored with
    `logistic_regression`, and tested with `mann_whitney_u_test`. Rows of the
    test result with `q_value < 0.05` are treated as significant; their
    `feature` values are counted and handed to `visualize_channels`.

    The caller's DataFrame is left untouched: the encoded target column
    (`"<target_col>_encoded"`) is added to a copy that only lives inside this
    function.

    Parameters:
    - df: DataFrame, the data.
    - category_col: str, the name of the column to define categories.
    - target_col: str, the name of the column to define groups for logistic regression and U-test.
    - target_col_mapping_dict: dict, a dictionary mapping target_col values to integers.
      Values missing from the mapping are encoded as -1.
    - feature_cols: list, the list of feature columns to use for analysis.

    Returns:
    - None. Results are reported through printed output and the plots produced
      by `visualize_channels`.
    """
    target_col_encoded = f"{target_col}_encoded"

    # Build the encoded target on a new frame instead of assigning into `df`.
    # Assigning in place would silently add a column to the caller's frame and,
    # when `df` is a filtered slice, is a SettingWithCopy assignment.
    # NOTE(review): rows whose target value is not in the mapping get -1 and are
    # still passed to the helpers below — confirm this is intended.
    encoded_df = df.assign(
        **{target_col_encoded: df[target_col].map(target_col_mapping_dict).fillna(-1)}
    )

    for category in encoded_df[category_col].unique():
        print(f"\nAnalyzing category: {category}")
        category_df = encoded_df[encoded_df[category_col] == category]

        # Prepare data for logistic regression
        X_train, X_test, y_train, y_test = split_data(
            category_df,
            feature_cols=feature_cols,
            target_col=target_col_encoded,
            group_split_col="Metadata_line_ID",
        )

        # Perform logistic regression; only the score is reported here, the
        # second return value is not used.
        score, _feature_weights = logistic_regression(X_train, y_train, X_test, y_test)
        print(f"Logistic Regression Accuracy Score for {category}: {score:.4f}")

        # Perform Mann-Whitney U-test
        results = mann_whitney_u_test(
            category_df, feature_cols=feature_cols, target_col=target_col_encoded
        )

        # Keep the features whose q-value falls below the 0.05 threshold
        significant_features = results.query("q_value < 0.05")["feature"].tolist()

        print(
            f"Number of significant features for {category}: {len(significant_features)}"
        )

        # Only plot when there is something to show
        if significant_features:
            visualize_channels(significant_features)

        print(70 * "=")

In [4]:
# Feature columns are those whose name contains "Cells_", "Cytoplasm_" or "Nuclei_".
is_feature_col = df.columns.str.contains("Cells_|Cytoplasm_|Nuclei_")

# HACK to reduce the number of features: keep only the first 30 matches
feature_cols = df.columns[is_feature_col][:30].tolist()

# Run the analysis once per cell type, encoding "control" as 0 and "deletion" as 1
analyze_by_category(
    df,
    "Metadata_cell_type",
    "Metadata_line_condition",
    {"control": 0, "deletion": 1},
    feature_cols,
)


Analyzing category: stem
Logistic Regression Accuracy Score for stem: 0.8056
Number of significant features for stem: 20


NameError: name 'sns' is not defined

<Figure size 1000x500 with 0 Axes>