## Classify eGFP vs. negcons

In [33]:
import pandas as pd

In [34]:
profiles = pd.read_parquet("../1.load/output/raw_filtered_profiles_with_poscon.parquet")
profiles.head()

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_Batch
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,82.875999,76.996002,77.473999,76.582001,77.233002,78.186996,80.055,77.632004,79.955002,2021_06_21_Batch7
1,JCP2022_900011,ccsbBroad304_00013,ORF009063.1_TRC304.1,pLX_304,NM_001612.6,ACRV1,56,9606,acrosomal vesicle protein 1,100.0,...,93.607002,88.196999,89.211998,88.081001,89.154999,89.897003,92.719002,89.843002,92.597,2021_06_21_Batch7
2,JCP2022_900033,ccsbBroad304_00037,ORF015627.1_TRC304.1,pLX_304,NM_001136.5,AGER,177,9606,advanced glycosylation end-product specific re...,100.0,...,133.380005,126.150002,127.25,125.769997,127.25,128.429993,131.880005,127.940002,131.960007,2021_06_21_Batch7
3,JCP2022_900063,ccsbBroad304_00069,ORF005433.1_TRC304.1,pLX_304,NM_001153.5,ANXA4,307,9606,annexin A4,100.0,...,84.871002,80.910004,81.814003,80.850998,81.926003,82.567001,85.179001,82.646004,85.292999,2021_06_21_Batch7
4,JCP2022_900084,ccsbBroad304_00091,ORF014376.1_TRC304.1,pLX_304,NM_001651.4,AQP5,362,9606,aquaporin 5,100.0,...,91.669998,87.241997,87.132004,86.538002,87.476997,88.224998,90.223,87.663002,90.227997,2021_06_21_Batch7


In [35]:
profiles.shape

(81490, 4780)

In [36]:
# Create a new column Metadata_SymbolX which is equal to Metadata_Symbol, by only if the values are in the list `selected_negcons`, otherwise it is set to "other"

selected_negcons = ["BFP", "HcRed", "LUCIFERASE", "LacZ"]
selected_poscons = ["eGFP"]
profiles["Metadata_SymbolX"] = profiles.Metadata_Symbol
profiles.loc[
    profiles.Metadata_Symbol.isin(selected_poscons), "Metadata_SymbolX"
] = "poscon"
profiles.loc[
    profiles.Metadata_Symbol.isin(selected_negcons), "Metadata_SymbolX"
] = "negcon"

# drop rows where Metadata_SymbolX is neither poscon nor negcon

profiles = profiles.loc[
    profiles.Metadata_SymbolX.isin(["poscon", "negcon"]), :
].reset_index(drop=True)


# Now report counts of Metadata_SymbolX

profiles.Metadata_SymbolX.value_counts()

negcon    3680
poscon    1930
Name: Metadata_SymbolX, dtype: int64

In [37]:
# Keep only `Metadata_SymbolX` and columns that start with `Cells_` or `Nuclei_` or `Cytoplasm_` or `Image_`

prefixes = ["Cells_", "Nuclei_", "Cytoplasm_", "Image_"]
profiles = profiles[
    ["Metadata_SymbolX"]
    + [
        col
        for col in profiles.columns
        if any(col.startswith(prefix) for prefix in prefixes)
    ]
]

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [39]:
# Define features and target
X = profiles.drop("Metadata_SymbolX", axis=1)
y = profiles["Metadata_SymbolX"]

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [40]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[702  44]
 [ 73 303]]
              precision    recall  f1-score   support

      negcon       0.91      0.94      0.92       746
      poscon       0.87      0.81      0.84       376

    accuracy                           0.90      1122
   macro avg       0.89      0.87      0.88      1122
weighted avg       0.89      0.90      0.89      1122



In [46]:
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel

from sklearn import preprocessing

# Assuming your data is already clean
X = profiles.drop("Metadata_SymbolX", axis=1)
y = profiles["Metadata_SymbolX"]

# Encode labels of y
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Train a model to do feature selection
selection = SelectFromModel(xgb.XGBClassifier(n_estimators=100, eval_metric="mlogloss"))
selection.fit(X_train, y_train)

# Transform train and test sets
select_X_train = selection.transform(X_train)
select_X_test = selection.transform(X_test)

# Train the model
clf = xgb.XGBClassifier(n_estimators=100, eval_metric="mlogloss", random_state=42)
clf.fit(select_X_train, y_train)

# Predict the labels
y_pred = clf.predict(select_X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[722  24]
 [ 34 342]]
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       746
           1       0.93      0.91      0.92       376

    accuracy                           0.95      1122
   macro avg       0.94      0.94      0.94      1122
weighted avg       0.95      0.95      0.95      1122



In [48]:
from sklearn.model_selection import RandomizedSearchCV

# Parameter Tuning
param_dist = {
    "n_estimators": range(80, 200, 20),
    "max_depth": range(2, 10, 1),
    "learning_rate": [0.1, 0.01, 0.05],
}

rsearch = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(eval_metric="mlogloss"),
    param_distributions=param_dist,
    scoring="accuracy",
    cv=5,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
    verbose=4,
)
rsearch.fit(select_X_train, y_train)

# Print best parameters
print(rsearch.best_params_)

# Train the model using the best parameters
clf_best = xgb.XGBClassifier(
    n_estimators=rsearch.best_params_["n_estimators"],
    max_depth=rsearch.best_params_["max_depth"],
    learning_rate=rsearch.best_params_["learning_rate"],
    eval_metric="mlogloss",
    random_state=42,
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[CV 1/5] END learning_rate=0.05, max_depth=2, n_estimators=100;, score=0.890 total time=  27.5s
[CV 2/5] END learning_rate=0.05, max_depth=2, n_estimators=100;, score=0.895 total time=  36.1s
[CV 3/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.958 total time= 1.6min
[CV 1/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.947 total time= 1.6min
[CV 4/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.920 total time= 1.6min
[CV 5/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.941 total time= 1.6min
[CV 2/5] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.931 total time= 1.6min
[CV 3/5] END learning_rate=0.05, max_depth=2, n_estimators=100;, score=0.893 total time= 1.3min
[CV 4/5] END learning_rate=0.05, max_depth=2, n_estimators=100;, score=0.873 total time= 1.4min
[CV 5/5] END learning_rate=0.05, max_depth=2, n_estimators=100;, score=0.876 total time= 1.4min
[CV 5/5] END learning_rate=0.05, max_depth=5,

In [49]:
clf_best.fit(select_X_train, y_train)

# Predict the labels for the best model
y_pred_best = clf_best.predict(select_X_test)

# Evaluate the best model
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))

[[727  19]
 [ 36 340]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       746
           1       0.95      0.90      0.93       376

    accuracy                           0.95      1122
   macro avg       0.95      0.94      0.94      1122
weighted avg       0.95      0.95      0.95      1122

