## Detect negative controls

In [8]:
import pandas as pd

In [9]:
# Load the filtered raw profiles written by the previous step and preview
# the first rows.
profiles = pd.read_parquet(
    path="../1.load/output/raw_filtered_profiles.parquet",
)
profiles.head(n=5)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_Batch
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,82.875999,76.996002,77.473999,76.582001,77.233002,78.186996,80.055,77.632004,79.955002,2021_06_21_Batch7
1,JCP2022_900011,ccsbBroad304_00013,ORF009063.1_TRC304.1,pLX_304,NM_001612.6,ACRV1,56,9606,acrosomal vesicle protein 1,100.0,...,93.607002,88.196999,89.211998,88.081001,89.154999,89.897003,92.719002,89.843002,92.597,2021_06_21_Batch7
2,JCP2022_900033,ccsbBroad304_00037,ORF015627.1_TRC304.1,pLX_304,NM_001136.5,AGER,177,9606,advanced glycosylation end-product specific re...,100.0,...,133.380005,126.150002,127.25,125.769997,127.25,128.429993,131.880005,127.940002,131.960007,2021_06_21_Batch7
3,JCP2022_900063,ccsbBroad304_00069,ORF005433.1_TRC304.1,pLX_304,NM_001153.5,ANXA4,307,9606,annexin A4,100.0,...,84.871002,80.910004,81.814003,80.850998,81.926003,82.567001,85.179001,82.646004,85.292999,2021_06_21_Batch7
4,JCP2022_900084,ccsbBroad304_00091,ORF014376.1_TRC304.1,pLX_304,NM_001651.4,AQP5,362,9606,aquaporin 5,100.0,...,91.669998,87.241997,87.132004,86.538002,87.476997,88.224998,90.223,87.663002,90.227997,2021_06_21_Batch7


In [10]:
# Size of the loaded table as (number of rows, number of columns)
profiles.shape

(79560, 4780)

In [11]:
# Create a binary label column `Metadata_SymbolX`: "negcon" when
# `Metadata_Symbol` is one of `selected_negcons`, "other" for every other row
# (rows with a missing symbol also fall into "other", since `isin` is False
# for them).

selected_negcons = ["BFP", "HcRed", "LUCIFERASE"]

# Compute the membership mask once and map it straight to the two labels;
# no intermediate copy of `Metadata_Symbol` is needed because every row
# receives one of the two labels.
is_negcon = profiles.Metadata_Symbol.isin(selected_negcons)
profiles["Metadata_SymbolX"] = is_negcon.map({True: "negcon", False: "other"})

# Now report counts of Metadata_SymbolX

profiles.Metadata_SymbolX.value_counts()

other     76800
negcon     2760
Name: Metadata_SymbolX, dtype: int64

In [12]:
# Reduce the table to the label column followed by every column whose name
# begins with one of the listed prefixes; all other columns are dropped.

prefixes = ["Cells_", "Nuclei_", "Cytoplasm_", "Image_"]

# str.startswith accepts a tuple and matches if any entry is a prefix.
feature_columns = [
    name for name in profiles.columns if name.startswith(tuple(prefixes))
]
profiles = profiles[["Metadata_SymbolX", *feature_columns]]

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample

In [14]:
# Train a random forest to tell "negcon" profiles apart from "other" profiles.
# Only the training split is rebalanced; the test split is left untouched so
# the reported metrics reflect the original class ratio.

# Number of training rows kept per class after resampling (the majority class
# is downsampled to this size, the minority class is upsampled to it).
N_PER_CLASS = 3200

# Define features and target
X = profiles.drop("Metadata_SymbolX", axis=1)
y = profiles["Metadata_SymbolX"]

# Split the data into a training set (80%) and a test set (20%)
# NOTE(review): the split is not stratified; consider `stratify=y` so both
# splits are guaranteed the same class ratio (this would change the numbers).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-attach the labels to the training features so that rows and labels are
# resampled together. A dedicated name is used so `X` keeps meaning
# "features only" for the rest of the notebook.
train_frame = pd.concat([X_train, y_train], axis=1)

# Separate majority and minority classes in the training dataset
df_majority = train_frame[train_frame.Metadata_SymbolX == "other"]
df_minority = train_frame[train_frame.Metadata_SymbolX == "negcon"]

# Downsample the majority class (without replacement)
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=N_PER_CLASS,
    random_state=123,  # reproducible results
)

# Upsample the minority class (with replacement, so rows may repeat)
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=N_PER_CLASS,
    random_state=123,  # reproducible results
)

# Combine majority and minority class in the training dataset
df_balanced = pd.concat([df_majority_downsampled, df_minority_upsampled])

# Display new class counts
print(df_balanced.Metadata_SymbolX.value_counts())

# Get our features and labels back
y_train = df_balanced.Metadata_SymbolX
X_train = df_balanced.drop("Metadata_SymbolX", axis=1)

# Fit on the balanced training set and predict the held-out test set
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Rows are true classes and columns are predicted classes, both in the
# explicit order given by `labels` (negcon first, then other).
print(confusion_matrix(y_test, y_pred, labels=["negcon", "other"]))
print(classification_report(y_test, y_pred))

other     3200
negcon    3200
Name: Metadata_SymbolX, dtype: int64
[[  151   411]
 [ 1817 13533]]
              precision    recall  f1-score   support

      negcon       0.08      0.27      0.12       562
       other       0.97      0.88      0.92     15350

    accuracy                           0.86     15912
   macro avg       0.52      0.58      0.52     15912
weighted avg       0.94      0.86      0.90     15912

