## Detect negative controls

In [1]:
import pandas as pd

In [2]:
profiles = pd.read_parquet("../1.load/output/raw_filtered_profiles.parquet")
profiles.head()

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_10_02_256,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256
0,JCP2022_900031,ccsbBroad304_00035,ORF008140.1_TRC304.1,pLX_304,NM_001133.2,AFM,173,9606,afamin,100.0,...,45.397999,45.061001,42.069,42.476002,42.023998,42.422001,42.803001,44.071999,42.768002,43.882999
1,JCP2022_900031,ccsbBroad304_00035,ORF008140.1_TRC304.1,pLX_304,NM_001133.2,AFM,173,9606,afamin,100.0,...,65.134003,64.718002,60.271,60.638,60.160999,60.575001,61.202,62.817001,61.082001,62.860001
2,JCP2022_900031,ccsbBroad304_00035,ORF008140.1_TRC304.1,pLX_304,NM_001133.2,AFM,173,9606,afamin,100.0,...,61.777,60.708,57.853001,57.944,57.993,58.472,58.494999,59.888,58.701,60.189999
3,JCP2022_900031,ccsbBroad304_00035,ORF008140.1_TRC304.1,pLX_304,NM_001133.2,AFM,173,9606,afamin,100.0,...,57.150002,56.278,53.229,53.683998,53.313,53.849998,54.125999,55.486,54.18,55.615002
4,JCP2022_900031,ccsbBroad304_00035,ORF008140.1_TRC304.1,pLX_304,NM_001133.2,AFM,173,9606,afamin,100.0,...,50.901001,50.551998,47.576,47.773998,47.596001,48.008999,48.313,49.362999,48.235001,49.581001


In [3]:
profiles.shape

(5365, 4777)

In [4]:
# Create a new column Metadata_SymbolX which is equal to Metadata_Symbol, by only if the values are in the list `selected_negcons`, otherwise it is set to "other"

selected_negcons = ["BFP", "HcRed", "LUCIFERASE"]
profiles["Metadata_SymbolX"] = profiles.Metadata_Symbol
profiles.loc[~profiles.Metadata_Symbol.isin(selected_negcons), "Metadata_SymbolX"] = "other"

# Now report counts of Metadata_SymbolX

profiles.Metadata_SymbolX.value_counts()

other         5170
BFP             65
HcRed           65
LUCIFERASE      65
Name: Metadata_SymbolX, dtype: int64

In [5]:
# Keep only `Metadata_SymbolX` and columns that start with `Cells_` or `Nuclei_` or `Cytoplasm_` or `Image_`

prefixes = ["Cells_", "Nuclei_", "Cytoplasm_", "Image_"]
profiles = profiles[
    ["Metadata_SymbolX"]
    + [col for col in profiles.columns if any(col.startswith(prefix) for prefix in prefixes)]
]



I have a dataframe a column `Metadata_SymbolX` and several feature columns. 

`Metadata_SymbolX` contains the class label

I want to create a classifier using the features to predict the class label.

Follow machine learning best practice and come up with a classifier that is robust to overfitting.

Then report the performance of the classifier on the test set.



In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Splitting the dataset into features and target
features = profiles.drop('Metadata_SymbolX', axis=1)
target = profiles['Metadata_SymbolX']

# Splitting the dataset into training and testing sets
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# Creating the classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the classifier
clf.fit(features_train, target_train)

# Predicting the classes for the test set
target_pred = clf.predict(features_test)

# Reporting the performance
print(f"Accuracy: {accuracy_score(target_test, target_pred)}")
print(classification_report(target_test, target_pred))



Accuracy: 0.9645852749301025
              precision    recall  f1-score   support

         BFP       1.00      0.08      0.14        13
       HcRed       0.00      0.00      0.00        13
  LUCIFERASE       0.00      0.00      0.00        13
       other       0.96      1.00      0.98      1034

    accuracy                           0.96      1073
   macro avg       0.49      0.27      0.28      1073
weighted avg       0.94      0.96      0.95      1073



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
