In [1]:
# %tb

# Import Cell Profiler Dependencies
import cellprofiler
import cellprofiler_core.image
import cellprofiler_core.measurement
import cellprofiler_core.object
import cellprofiler_core.pipeline
import cellprofiler_core.preferences
import cellprofiler_core.workspace
from cellprofiler_core.utilities.java import start_java, stop_java




# # Inject Image module used to inject OMERO image planes into Cell Profiler Pipeline
# from cellprofiler_core.modules.injectimage import InjectImage

# Import OMERO Python BlitzGateway
# import omero
# from omero.gateway import BlitzGateway

# Import Numpy
import numpy as np

# Import Python System Packages
import os
import tempfile
import warnings


import io
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio
import skimage.io

pio.renderers.default = "png"

# mura model imports
import requests
import math
import matplotlib.pyplot as plt
import shutil
from getpass import getpass
from PIL import Image, UnidentifiedImageError
from requests.exceptions import HTTPError
from io import BytesIO
from pathlib import Path
import torch
import pytorch_lightning as pl
# from huggingface_hub import HfApi, HfFolder, Repository, notebook_login
from torch.utils.data import DataLoader
from torchmetrics import Accuracy
from torchvision.datasets import ImageFolder
# from transformers import ViTFeatureExtractor, ViTForImageClassification
from transformers import ViTImageProcessor, ViTForImageClassification

set output directory

In [2]:
# Helper templates: S3 object locations inside the cellpainting-gallery bucket.
# Each template is completed with str.format, using a plate's Metadata_Source,
# Metadata_Batch and Metadata_Plate fields.

# Location of a plate's profile table.
profile_formatter = "".join(
    [
        "s3://cellpainting-gallery/cpg0016-jump/",
        "{Metadata_Source}/workspace/profiles/",
        "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet",
    ]
)

# Location of a plate's load_data table (the one that includes illumination files).
loaddata_formatter = "".join(
    [
        "s3://cellpainting-gallery/cpg0016-jump/",
        "{Metadata_Source}/workspace/load_data_csv/",
        "{Metadata_Batch}/{Metadata_Plate}/load_data_with_illum.parquet",
    ]
)

pull images

In [3]:
# Locate the local clone of the datasets repository, which holds the metadata
# tables for the entire dataset.
#
# When WORKSPACE_BUCKET is set, this notebook is running on Terra: notebook
# 'workspace_setup.ipynb' cloned the git repo to the first directory under $HOME.
# If you cloned this repository manually to a different directory, edit that
# value to reflect the location.
GIT_CLONE_DIR = (
    "~/jump-cellpainting-datasets"
    if "WORKSPACE_BUCKET" in os.environ
    else "./datasets/"
)

In [4]:
# METADATA
# Read the four gzip-compressed metadata tables from the repository clone,
# in the order plate, well, compound, orf.
plates, wells, compound, orf = (
    pd.read_csv(os.path.join(GIT_CLONE_DIR, relative_path))
    for relative_path in (
        "metadata/plate.csv.gz",
        "metadata/well.csv.gz",
        "metadata/compound.csv.gz",
        "metadata/orf.csv.gz",
    )
)

In [5]:
# Draw a reproducible random subset of ten plates whose plate type is COMPOUND.
sample = plates.loc[plates["Metadata_PlateType"] == "COMPOUND"].sample(
    n=10, random_state=52
)
sample

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType
25,source_1,Batch4_20221012,UL000083,COMPOUND
1898,source_6,p211123CPU2OS48hw384exp036JUMP,110000297102,COMPOUND
895,source_3,CP_25_all_Phenix1,C13451cW,COMPOUND
1965,source_7,20210723_Run2,CP2-SC1-17,COMPOUND
893,source_3,CP_25_all_Phenix1,C13443dW,COMPOUND
138,source_10,2021_06_22_U2OS_48_hr_run8,Dest210622-144809,COMPOUND
2255,source_8,J4,A1166167,COMPOUND
1429,source_5,JUMPCPE-20210628-Run02_20210628_170203,APTJUM114,COMPOUND
435,source_11,Batch5,EC000083,COMPOUND
309,source_11,Batch2,EC000037,COMPOUND


In [6]:
# Load the profile table of every sampled plate.
# Only the columns that identify a well are read from each parquet file.
# columns = ["Cells_AreaShape_Exent"]
# # ["Count"]
columns = [
    "Metadata_Source",
    "Metadata_Plate",
    "Metadata_Well",
]
u = len(sample)
plate_profiles = []
for i, (_, row) in enumerate(sample.iterrows(), start=1):
    # Fill the S3 template from this plate's metadata fields.
    s3_path = profile_formatter.format(**row.to_dict())
    plate_profiles.append(
        pd.read_parquet(s3_path, columns=columns, storage_options={"anon": True})
    )
    print(f"profile {i} of {u} complete")

# Stack the per-plate tables into a single frame.
dframes = pd.concat(plate_profiles)

profile 1 of 10 complete
profile 2 of 10 complete
profile 3 of 10 complete
profile 4 of 10 complete
profile 5 of 10 complete
profile 6 of 10 complete
profile 7 of 10 complete
profile 8 of 10 complete
profile 9 of 10 complete
profile 10 of 10 complete


In [7]:
# Display the concatenated well identifiers of the sampled plates.
dframes

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well
0,source_1,UL000083,A02
1,source_1,UL000083,A03
2,source_1,UL000083,A04
3,source_1,UL000083,A05
4,source_1,UL000083,A06
...,...,...,...
379,source_11,EC000037,P20
380,source_11,EC000037,P21
381,source_11,EC000037,P22
382,source_11,EC000037,P23


In [8]:
# Annotate the wells of the sampled plates with their compounds:
# 1. join compounds to wells on the shared Metadata_JCP2022 identifier;
# 2. join that metadata to the plate list (dframes) on the (source, plate, well) key.
metadata = pd.merge(compound, wells, on="Metadata_JCP2022")
well_key = ["Metadata_Source", "Metadata_Plate", "Metadata_Well"]
ann_dframe = pd.merge(metadata, dframes, on=well_key)

In [9]:
# Display the compound-annotated wells of the sampled plates.
ann_dframe

Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,Metadata_Source,Metadata_Plate,Metadata_Well
0,JCP2022_000013,AABSTWCOLWSFRA-UHFFFAOYSA-N,InChI=1S/C17H19N5O2S/c1-11-20-14(16-22(11)7-8-...,source_10,Dest210622-144809,O03
1,JCP2022_000042,AAESJVQAFSFYJD-UHFFFAOYSA-N,InChI=1S/C17H10ClF3N2S/c18-13-6-4-11(5-7-13)15...,source_8,A1166167,N06
2,JCP2022_000059,AAHBFIAFTLUMPU-UHFFFAOYSA-N,InChI=1S/C13H23N3O3S/c1-11-7-14-13(10-19-2)16(...,source_11,EC000037,J19
3,JCP2022_000063,AAHPBNNYWHITSB-UHFFFAOYSA-N,InChI=1S/C16H12F2N4S2/c17-11-5-1-9(2-6-11)13-1...,source_8,A1166167,G09
4,JCP2022_000073,AAIVITOVJTVMAM-UHFFFAOYSA-N,InChI=1S/C15H20N4O/c20-15(17-7-10-19-8-5-16-6-...,source_11,EC000083,M20
...,...,...,...,...,...,...
4830,JCP2022_116745,ZZYXOZDOAUYJMM-UHFFFAOYSA-N,InChI=1S/C14H16N4O5/c1-22-11-4-3-9(7-12(11)23-...,source_8,A1166167,F09
4831,JCP2022_999999,,,source_10,Dest210622-144809,A01
4832,JCP2022_999999,,,source_10,Dest210622-144809,E24
4833,JCP2022_999999,,,source_10,Dest210622-144809,I01


In [10]:
# Gather the metadata associated with the images: one load_data table per
# sampled plate, stacked into a single frame.
u = len(sample)
plate_load_data = []
for i, (_, row) in enumerate(sample.iterrows(), start=1):
    # Fill the S3 template from this plate's metadata fields.
    s3_path = loaddata_formatter.format(**row.to_dict())
    plate_load_data.append(pd.read_parquet(s3_path, storage_options={"anon": True}))
    print(f"profile {i} of {u} complete")

load_data = pd.concat(plate_load_data)

profile 1 of 10 complete
profile 2 of 10 complete
profile 3 of 10 complete
profile 4 of 10 complete
profile 5 of 10 complete
profile 6 of 10 complete
profile 7 of 10 complete
profile 8 of 10 complete
profile 9 of 10 complete
profile 10 of 10 complete


In [11]:
# Link the compound annotations to the image file paths by joining the
# image-level table with the annotated wells on the (source, plate, well) key.
linked = load_data.merge(
    ann_dframe, on=["Metadata_Source", "Metadata_Plate", "Metadata_Well"]
)
# Show which well/site each linked row refers to.
linked.loc[:, ["Metadata_Well", "Metadata_Site"]]

Unnamed: 0,Metadata_Well,Metadata_Site
0,A02,1
1,A02,2
2,A02,3
3,A02,4
4,A03,1
...,...,...
35266,P24,5
35267,P24,6
35268,P24,7
35269,P24,8


In [12]:
# import os
# import requests
# from io import BytesIO
# from matplotlib import pyplot as plt
# from matplotlib import image as mpimg
# import boto3
# from botocore import UNSIGNED
# from botocore.config import Config
# i = 0
# u = 50 #len(linked)
# test_images = []
# test_targets = []
# train_images = []
# train_targets = []
# all_targets = []
# all_images = []
# for _, row in linked.iterrows():
#     image_url = os.path.join(
#         row.PathName_OrigDNA, row.FileName_OrigDNA
#     )
#     s3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
#     response = s3_client.get_object(
#         Bucket=image_url.split("/")[2], Key="/".join(image_url.split("/")[3:])
#     )
#     image = mpimg.imread(BytesIO(response["Body"].read()), format="tiff")
#     target = row.Metadata_InChIKey

#     plt.imshow(image, cmap = "gray") # , cmap="gray"
#     image_url
#     print(len(image), len(image[0]))
#     all_targets.append(target)
# #     print(os.stat(image_url).st_size)
#     all_images.append(image)
#     if i < u/2:
#         train_images.append(image)
#         train_targets.append(target)
#     else:
#         test_images.append(image)
#         test_targets.append(target)
        
#     i+=1
#     print("image " + str(i) + " of " + str(u) + " complete")
#     if i == u:
#         break
    

In [13]:
# print(linked[["PathName_OrigDNA", "FileName_OrigDNA"]])
# print(linked["PathName_OrigDNA"][0])
# print(linked["FileName_OrigDNA"][0])
# print(linked["FileName_OrigER"][0])
# print(linked["FileName_OrigRNA"][0])
# print(linked["FileName_OrigAGP"][0])
# print(linked["FileName_OrigMito"][0])
# print('')
# print(linked["FileName_OrigDNA"][1])
# print(linked["FileName_OrigER"][1])
# print(linked["FileName_OrigRNA"][1])
# print(linked["FileName_OrigAGP"][1])
# print(linked["FileName_OrigMito"][1])
# print('')
# print(linked["FileName_OrigDNA"][2])
# print(linked["FileName_OrigER"][2])
# print(linked["FileName_OrigRNA"][2])
# print(linked["FileName_OrigAGP"][2])
# print(linked["FileName_OrigMito"][2])
# print('')
# print(linked["FileName_OrigBrightfield"][2])
# print(linked["Metadata_InChIKey"][0])
# print(linked)

# dna_path = linked["PathName_OrigDNA"][0]
# dna_file = linked["FileName_OrigDNA"][0]
# er_path = linked["PathName_OrigER"][0]
# er_file = linked["FileName_OrigER"][0]
# rna_path = linked["PathName_OrigRNA"][0]
# rna_file = linked["FileName_OrigRNA"][0]
# agp_path = linked["PathName_OrigAGP"][0]
# agp_file = linked["FileName_OrigAGP"][0]
# mito_path = linked["PathName_OrigMito"][0]
# mito_file = linked["FileName_OrigMito"][0]

# illum_dna_path = linked["PathName_IllumDNA"][0]
# illum_dna_file = linked["FileName_IllumDNA"][0]
# illum_er_path = linked["PathName_IllumER"][0]
# illum_er_file = linked["FileName_IllumER"][0]
# illum_rna_path = linked["PathName_IllumRNA"][0]
# illum_rna_file = linked["FileName_IllumRNA"][0]
# illum_agp_path = linked["PathName_IllumAGP"][0]
# illum_agp_file = linked["FileName_IllumAGP"][0]
# illum_mito_path = linked["PathName_IllumMito"][0]
# illum_mito_file = linked["FileName_IllumMito"][0]
# both = path + file

# print(illum_dna_path, illum_dna_file)

# Display the linked table (image file names/paths together with compound annotations).
linked

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_Well,Metadata_Site,FileName_IllumAGP,FileName_IllumDNA,FileName_IllumER,FileName_IllumMito,FileName_IllumRNA,...,PathName_IllumMito,PathName_IllumRNA,PathName_OrigAGP,PathName_OrigDNA,PathName_OrigER,PathName_OrigMito,PathName_OrigRNA,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI
0,source_1,Batch4_20221012,UL000083,A02,1,UL000083_IllumAGP.npy,UL000083_IllumDNA.npy,UL000083_IllumER.npy,UL000083_IllumMito.npy,UL000083_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3
1,source_1,Batch4_20221012,UL000083,A02,2,UL000083_IllumAGP.npy,UL000083_IllumDNA.npy,UL000083_IllumER.npy,UL000083_IllumMito.npy,UL000083_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3
2,source_1,Batch4_20221012,UL000083,A02,3,UL000083_IllumAGP.npy,UL000083_IllumDNA.npy,UL000083_IllumER.npy,UL000083_IllumMito.npy,UL000083_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3
3,source_1,Batch4_20221012,UL000083,A02,4,UL000083_IllumAGP.npy,UL000083_IllumDNA.npy,UL000083_IllumER.npy,UL000083_IllumMito.npy,UL000083_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3
4,source_1,Batch4_20221012,UL000083,A03,1,UL000083_IllumAGP.npy,UL000083_IllumDNA.npy,UL000083_IllumER.npy,UL000083_IllumMito.npy,UL000083_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_085227,SRVFFFJZQVENJC-UHFFFAOYSA-N,InChI=1S/C17H30N2O5/c1-6-23-17(22)14-13(24-14)...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35266,source_11,Batch2,EC000037,P24,5,EC000037_IllumAGP.npy,EC000037_IllumDNA.npy,EC000037_IllumER.npy,EC000037_IllumMito.npy,EC000037_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_046054,KPBNHDGDUADAGP-UHFFFAOYSA-N,InChI=1S/C24H29N3O2/c28-23(12-11-21-8-6-15-25-...
35267,source_11,Batch2,EC000037,P24,6,EC000037_IllumAGP.npy,EC000037_IllumDNA.npy,EC000037_IllumER.npy,EC000037_IllumMito.npy,EC000037_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_046054,KPBNHDGDUADAGP-UHFFFAOYSA-N,InChI=1S/C24H29N3O2/c28-23(12-11-21-8-6-15-25-...
35268,source_11,Batch2,EC000037,P24,7,EC000037_IllumAGP.npy,EC000037_IllumDNA.npy,EC000037_IllumER.npy,EC000037_IllumMito.npy,EC000037_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_046054,KPBNHDGDUADAGP-UHFFFAOYSA-N,InChI=1S/C24H29N3O2/c28-23(12-11-21-8-6-15-25-...
35269,source_11,Batch2,EC000037,P24,8,EC000037_IllumAGP.npy,EC000037_IllumDNA.npy,EC000037_IllumER.npy,EC000037_IllumMito.npy,EC000037_IllumRNA.npy,...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,s3://cellpainting-gallery/cpg0016-jump/source_...,JCP2022_046054,KPBNHDGDUADAGP-UHFFFAOYSA-N,InChI=1S/C24H29N3O2/c28-23(12-11-21-8-6-15-25-...


In [14]:
# Folder in which downloaded image files are renamed before segmentation.
# NOTE(review): absolute, machine-specific path — adjust it when running on another machine.
temp_image_path = "/Users/abestroka/Argonne/git_repos/JUMP_vision_model/image_temp"

Loop through the rows of the `linked` table: download each site's images, segment them with the CellProfiler pipeline, then train the model.

In [15]:
i = 0
for _, row in linked.iterrows():
    dna_path = linked["PathName_OrigDNA"][i]
    dna_file = linked["FileName_OrigDNA"][i]
    er_path = linked["PathName_OrigER"][i]
    er_file = linked["FileName_OrigER"][i]
    rna_path = linked["PathName_OrigRNA"][i]
    rna_file = linked["FileName_OrigRNA"][i]
    agp_path = linked["PathName_OrigAGP"][i]
    agp_file = linked["FileName_OrigAGP"][i]
    mito_path = linked["PathName_OrigMito"][i]
    mito_file = linked["FileName_OrigMito"][i]

    illum_dna_path = linked["PathName_IllumDNA"][i]
    illum_dna_file = linked["FileName_IllumDNA"][i]
    illum_er_path = linked["PathName_IllumER"][i]
    illum_er_file = linked["FileName_IllumER"][i]
    illum_rna_path = linked["PathName_IllumRNA"][i]
    illum_rna_file = linked["FileName_IllumRNA"][i]
    illum_agp_path = linked["PathName_IllumAGP"][i]
    illum_agp_file = linked["FileName_IllumAGP"][i]
    illum_mito_path = linked["PathName_IllumMito"][i]
    illum_mito_file = linked["FileName_IllumMito"][i]
    
    target = linked["Metadata_InChIKey"][i]
    print(i)
    print('TARGET', target)
    
    ! aws s3 cp \--no-sign-request \{dna_path}{dna_file} image_temp
    os.rename(temp_image_path+"/"+dna_file, temp_image_path+"/"+"dna.tiff")
    ! aws s3 cp \--no-sign-request \{er_path}{er_file} image_temp
    os.rename(temp_image_path+"/"+er_file, temp_image_path+"/"+"er.tiff")
    ! aws s3 cp \--no-sign-request \{rna_path}{rna_file} image_temp
    os.rename(temp_image_path+"/"+rna_file, temp_image_path+"/"+"rna.tiff")
    ! aws s3 cp \--no-sign-request \{agp_path}{agp_file} image_temp
    os.rename(temp_image_path+"/"+agp_file, temp_image_path+"/"+"agp.tiff")
    ! aws s3 cp \--no-sign-request \{mito_path}{mito_file} image_temp
    os.rename(temp_image_path+"/"+mito_file, temp_image_path+"/"+"mito.tiff")


    ! aws s3 cp \--no-sign-request \{illum_dna_path}/{illum_dna_file} image_temp
    os.rename(temp_image_path+"/"+illum_dna_file, temp_image_path+"/"+"illum_dna.npy")
    ! aws s3 cp \--no-sign-request \{illum_er_path}/{illum_er_file} image_temp
    os.rename(temp_image_path+"/"+illum_er_file, temp_image_path+"/"+"illum_er.npy")
    ! aws s3 cp \--no-sign-request \{illum_rna_path}/{illum_rna_file} image_temp
    os.rename(temp_image_path+"/"+illum_rna_file, temp_image_path+"/"+"illum_rna.npy")
    ! aws s3 cp \--no-sign-request \{illum_agp_path}/{illum_agp_file} image_temp
    os.rename(temp_image_path+"/"+illum_agp_file, temp_image_path+"/"+"illum_agp.npy")
    ! aws s3 cp \--no-sign-request \{illum_mito_path}/{illum_mito_file} image_temp
    os.rename(temp_image_path+"/"+illum_mito_file, temp_image_path+"/"+"illum_mito.npy")


    pipeline_filename = "/Users/abestroka/Argonne/git_repos/JUMP_vision_model/my_pipeline.cppipe"
    ! cellprofiler -c -r -p /Users/abestroka/Argonne/git_repos/JUMP_vision_model/my_project_421.cppipe -i /Users/abestroka/Argonne/git_repos/JUMP_vision_model/image_temp -o /Users/abestroka/Argonne/git_repos/JUMP_vision_model/segmented_image_temp/{target}
    cells_path = "/Users/abestroka/Argonne/git_repos/JUMP_vision_model/segmented_image_temp"

    
    # clear folder with images
    
    if i == 5:
        break
    else:
        i+=1

3
TARGET IAZDPXIOMUYVGZ-UHFFFAOYSA-N
download: s3://cellpainting-gallery/cpg0016-jump/source_1/images/Batch4_20221012/images/UL000083__2022-10-12T17_46_26-Measurement1/Images/r01c02f04p01-ch3sk1fk1fl1.tiff to image_temp/r01c02f04p01-ch3sk1fk1fl1.tiff
download: s3://cellpainting-gallery/cpg0016-jump/source_1/images/Batch4_20221012/images/UL000083__2022-10-12T17_46_26-Measurement1/Images/r01c02f04p01-ch1sk1fk1fl1.tiff to image_temp/r01c02f04p01-ch1sk1fk1fl1.tiff
download: s3://cellpainting-gallery/cpg0016-jump/source_1/images/Batch4_20221012/images/UL000083__2022-10-12T17_46_26-Measurement1/Images/r01c02f04p01-ch5sk1fk1fl1.tiff to image_temp/r01c02f04p01-ch5sk1fk1fl1.tiff
download: s3://cellpainting-gallery/cpg0016-jump/source_1/images/Batch4_20221012/images/UL000083__2022-10-12T17_46_26-Measurement1/Images/r01c02f04p01-ch4sk1fk1fl1.tiff to image_temp/r01c02f04p01-ch4sk1fk1fl1.tiff
download: s3://cellpainting-gallery/cpg0016-jump/source_1/images/Batch4_20221012/images/UL000083__2022-10-1

In [16]:
    # Initialise the dataset from the class folders under cells_path and split
    # it into training and validation subsets (15% of the samples, rounded
    # down, go to validation).
    ds = ImageFolder(cells_path)
    # Seeded generator so the shuffle, and therefore the split, is repeatable.
    split_rng = torch.Generator().manual_seed(42)
    indices = torch.randperm(len(ds), generator=split_rng).tolist()
    n_val = math.floor(len(indices) * .15)
    # Slice at a positive boundary. With n_val == 0 the negative slices
    # indices[:-n_val] / indices[-n_val:] yield an empty training set and put
    # every sample into validation.
    n_train = len(indices) - n_val
    train_ds = torch.utils.data.Subset(ds, indices[:n_train])
    val_ds = torch.utils.data.Subset(ds, indices[n_train:])

In [17]:
# Lookup tables between class names and string-encoded class ids;
# ids follow the order of ds.classes.
label2id = {class_name: str(idx) for idx, class_name in enumerate(ds.classes)}
id2label = {str(idx): class_name for idx, class_name in enumerate(ds.classes)}

print(label2id)
print(id2label)

{'IAZDPXIOMUYVGZ-UHFFFAOYSA-N': '0', 'SRVFFFJZQVENJC-UHFFFAOYSA-N': '1'}
{'0': 'IAZDPXIOMUYVGZ-UHFFFAOYSA-N', '1': 'SRVFFFJZQVENJC-UHFFFAOYSA-N'}


In [18]:
class ImageClassificationCollator:
    """Collate (image, label) samples into a single batch.

    The images are handed to ``feature_extractor`` and the labels are attached
    to its result under the ``'labels'`` key as a ``torch.long`` tensor.
    """

    def __init__(self, feature_extractor):
        # Callable invoked as feature_extractor(images, return_tensors='pt').
        self.feature_extractor = feature_extractor

    def __call__(self, batch):
        """Return the extractor output for ``batch`` with the labels added."""
        images = [sample[0] for sample in batch]
        targets = [sample[1] for sample in batch]
        encodings = self.feature_extractor(images, return_tensors='pt')
        encodings['labels'] = torch.tensor(targets, dtype=torch.long)
        return encodings

In [19]:
# Image processor and collator that turn (image, label) samples into model inputs.
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
collator = ImageClassificationCollator(feature_extractor)

# Classifier loaded from the same checkpoint, with one output per class in label2id.
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224-in21k',
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label2id),
)

# Both loaders share these settings; only the training loader shuffles.
loader_options = dict(batch_size=8, collate_fn=collator, num_workers=0)
train_loader = DataLoader(train_ds, shuffle=True, **loader_options)
val_loader = DataLoader(val_ds, **loader_options)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Number of batches in the training loader, then in the validation loader.
print(len(train_loader), len(val_loader), sep="\n")

59
11


In [21]:
class Classifier(pl.LightningModule):
    """LightningModule that fine-tunes a wrapped classification model with Adam.

    Args:
        model: Model whose forward output exposes ``loss`` and ``logits`` and
            whose ``config.num_labels`` gives the number of classes.
        lr: Learning rate handed to the Adam optimizer.
        **kwargs: Extra hyperparameters; their names are recorded alongside ``lr``.
    """

    def __init__(self, model, lr: float = 2e-5, **kwargs):
        super().__init__()
        self.save_hyperparameters('lr', *kwargs.keys())
        self.model = model
        # Calling this module dispatches directly to the wrapped model's forward.
        self.forward = self.model.forward
        num_labels = model.config.num_labels
        if num_labels > 2:
            task = 'multiclass'
        else:
            task = 'binary'
        self.val_acc = Accuracy(task=task, num_classes=num_labels)

    def training_step(self, batch, batch_idx):
        """Run the model on ``batch``, log the loss as ``train_loss`` and return it."""
        loss = self(**batch).loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_acc`` for ``batch`` and return the loss."""
        outputs = self(**batch)
        loss = outputs.loss
        self.log("val_loss", loss)
        # Predicted class = index of the largest logit along dimension 1.
        predictions = outputs.logits.argmax(1)
        accuracy = self.val_acc(predictions, batch['labels'])
        self.log("val_acc", accuracy, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using the saved ``lr``."""
        learning_rate = self.hparams.lr
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

In [22]:
# Seed the random number generators, wrap the model and fine-tune it for four epochs.
pl.seed_everything(42)
classifier = Classifier(model, lr=2e-5)
# GPU-only alternative:
# trainer = pl.Trainer(accelerator='gpu', devices=1, precision='16-mixed', max_epochs=4)
# '16-mixed' is the spelling Lightning asks for; the bare `precision=16` is
# only kept for historical reasons and emits a warning on every run.
trainer = pl.Trainer(accelerator='auto', devices=1, precision='16-mixed', max_epochs=4)

trainer.fit(classifier, train_loader, val_loader)

Seed set to 42
/Users/abestroka/anaconda3/envs/cellprofiler/lib/python3.8/site-packages/lightning_fabric/connector.py:558: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/Users/abestroka/anaconda3/envs/cellprofiler/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:551: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/abestroka/anaconda3/envs/cellprofiler/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potent

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/Users/abestroka/anaconda3/envs/cellprofiler/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
[W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.
/Users/abestroka/anaconda3/envs/cellprofiler/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |                                               | 0/? [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '/Users/abestroka/Argonne/git_repos/JUMP_vision_model/segmented_image_temp/SRVFFFJZQVENJC-UHFFFAOYSA-N/Cells_69.png'