## Detect negative controls

In [32]:
import pandas as pd

In [33]:
# Load the filtered raw profiles written by the upstream "1.load" step and
# preview the first rows.
profiles_path = "../1.load/output/raw_filtered_profiles.parquet"
profiles = pd.read_parquet(profiles_path)
profiles.head()

In [None]:
# Show the dimensions of the loaded table as (n_rows, n_columns)
profiles.shape

In [None]:
# Create a binary label column `Metadata_SymbolX`: rows whose `Metadata_Symbol`
# is one of `selected_negcons` are labelled "negcon"; every other row
# (including rows with a missing symbol, which `isin` treats as not matching)
# is labelled "other".

selected_negcons = ["BFP", "HcRed", "LUCIFERASE"]

# Compute the membership mask once and reuse it.
is_negcon = profiles.Metadata_Symbol.isin(selected_negcons)

# Default every row to "other", then override the negative-control rows.
profiles["Metadata_SymbolX"] = "other"
profiles.loc[is_negcon, "Metadata_SymbolX"] = "negcon"

# Now report counts of Metadata_SymbolX

profiles.Metadata_SymbolX.value_counts()

In [None]:
# Reduce the table to the label column `Metadata_SymbolX` followed by the
# feature columns, i.e. those whose name starts with `Cells_`, `Nuclei_`,
# `Cytoplasm_` or `Image_`. Original column order is preserved.

prefixes = ["Cells_", "Nuclei_", "Cytoplasm_", "Image_"]

# `str.startswith` accepts a tuple of alternatives, so one call per column
# replaces the explicit loop over prefixes.
feature_columns = [
    col for col in profiles.columns if col.startswith(tuple(prefixes))
]
profiles = profiles[["Metadata_SymbolX", *feature_columns]]

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample

In [None]:
# Train a random-forest classifier that separates negative controls
# ("negcon") from all other perturbations ("other"), using a class-balanced
# training set and an untouched held-out test set.

# Define features and target
X = profiles.drop("Metadata_SymbolX", axis=1)
y = profiles["Metadata_SymbolX"]

# Hold out 20% of the rows for evaluation. The split happens BEFORE any
# resampling so duplicated minority rows can never leak into the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-attach the labels to the training features so that whole rows can be
# resampled per class.
# NOTE: this rebinds `X`; from here on it holds the training features plus
# the `Metadata_SymbolX` label column, not the full feature matrix.
X = pd.concat([X_train, y_train], axis=1)

# Separate majority and minority classes in the training dataset
df_majority = X[X.Metadata_SymbolX == "other"]
df_minority = X[X.Metadata_SymbolX == "negcon"]

if df_majority.empty or df_minority.empty:
    raise ValueError(
        "Training split must contain both 'other' and 'negcon' rows; got "
        f"{len(df_majority)} 'other' and {len(df_minority)} 'negcon'."
    )

# Rows to keep per class. Sampling without replacement cannot draw more rows
# than exist, so the cap is clamped to the number of available majority rows;
# with >= 3200 majority rows this is exactly 3200 per class.
max_samples_per_class = 3200
n_samples_per_class = min(max_samples_per_class, len(df_majority))

# Downsample the majority class (without replacement) ...
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=n_samples_per_class,
    random_state=123,  # reproducible results
)

# ... and upsample the minority class (with replacement) to the same size.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=n_samples_per_class,
    random_state=123,  # reproducible results
)

# Combine majority and minority class in the training dataset
df_balanced = pd.concat([df_majority_downsampled, df_minority_upsampled])

# Display new class counts
print(df_balanced.Metadata_SymbolX.value_counts())

# Get our features and labels back
y_train = df_balanced.Metadata_SymbolX
X_train = df_balanced.drop("Metadata_SymbolX", axis=1)

# Fit on the balanced training set and evaluate on the original
# (still imbalanced) test set.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Fix the label order explicitly so the matrix layout is stable:
# rows are true labels, columns are predicted labels, both in `class_labels`
# order.
class_labels = ["negcon", "other"]
print(confusion_matrix(y_test, y_pred, labels=class_labels))
print(classification_report(y_test, y_pred))