Check whether a protein always has the same location labels in a cell line across different images

In [1]:
from collections import defaultdict
import pickle

In [2]:
# Load the pickled list of per-image metadata records.
# NOTE(review): pickle.load can execute arbitrary code, so only point this at a
# trusted file; the absolute path is also specific to the original machine.
with open("/data/wei/hpa-webdataset-all-composite/HPACombineDatasetInfo.pickle", "rb") as pickle_file:
    data_info = pickle.load(pickle_file)

# Display the number of records together with the first record as a sample.
len(data_info), data_info[0]

(247678,
 {'filename': '/archive/1680/1680_F4_5_',
  'if_plate_id': 1680,
  'position': 'F4',
  'sample': 5,
  'status': 35,
  'Image status name': 'Annotated / Proteinatlas',
  'locations': 'Cell Junctions',
  'staining characteristics': nan,
  'unspecific': 0.0,
  'antibody': 'HPA077995',
  'ensembl_ids': 'ENSG00000253537',
  'gene_names': 'PCDHGA7',
  'atlas_name': 'SH-SY5Y',
  'versions': '16.0,16.1,17.0',
  'earliest_version': 16.0,
  'first_released': '2016-12-04 14:00:00',
  'latest_version': 17.0,
  'Spatial cell cycle': nan,
  'Intensity cell cycle': nan,
  'Annotated cell cycle': 0.0,
  'gain': 800.0,
  'Ab state': 'IF_FAILED',
  'Max tpm': 19.76,
  'Finished in genes': nan,
  'Protocol': 'PFA',
  'Gene reliability (in release)': nan,
  'Gene reliability (lims)': nan,
  'Cell count': 24.0,
  'well_location_predictions_all': nan,
  'image_id': '77995_1680_F4_5',
  'sequences': ['>sp|Q9Y5G6|PCDG7_HUMAN Protocadherin gamma-A7 OS=Homo sapiens OX=9606 GN=PCDHGA7 PE=2 SV=1\nMAAQPRG

In [3]:
# Map every (gene names, cell line) pair to the set of distinct location
# strings found across its images.
protcl2locs = defaultdict(set)
for record in data_info:
    gene = record["gene_names"]
    cell_line = record["atlas_name"]
    # Records whose gene name stringifies to "nan" (missing value) are skipped.
    if str(gene) == "nan":
        continue
    # Locations are stored via str(), so a missing location is kept as the
    # literal label "nan" and counts as a distinct entry in the set.
    protcl2locs[(gene, cell_line)].add(str(record["locations"]))

# Number of distinct (gene names, cell line) pairs.
len(protcl2locs)

75988

In [5]:
# Print every (gene names, cell line) pair that was recorded with more than one
# distinct location string, together with those strings.
for pair, labels in protcl2locs.items():
    if len(labels) <= 1:
        continue
    print(pair, labels)

('CEL', 'U-2 OS') {'Nucleoplasm', 'nan'}
('KIF14', 'A-431') {'Cytosol,Plasma membrane', 'Cytosol'}
('NCOA7', 'U-2 OS') {'nan', 'Nuclear bodies', 'Cytosol'}
('SLC35A1', 'MCF7') {'Nucleoplasm,Vesicles', 'Nucleoli', 'nan'}
('EEF1D', 'MCF7') {'Endoplasmic reticulum', 'Centrosome', 'nan', 'Nucleoli fibrillar center,Nucleoplasm'}
('PRKACA,PRKACB,PRKACG', 'U-2 OS') {'Golgi apparatus,Nucleoli', 'Cytosol', 'nan'}
('NFKBIB', 'CACO-2') {'Cell Junctions,Centrosome,Nucleoplasm', 'nan'}
('DUSP1', 'HeLa') {'Cytosol', 'nan'}
('TRPM8', 'U-251 MG') {'Cytosol,Nucleoplasm', 'Nucleoplasm'}
('STIM2', 'A-431') {'Vesicles', 'Nucleoplasm,Vesicles', 'Cytosol', 'Cell Junctions,Nucleoplasm'}
('INO80', 'U-2 OS') {'Intermediate filaments,Nucleoplasm', 'Centriolar satellite,Microtubules', 'Aggresome', 'Nucleoplasm', 'nan', 'Nucleoplasm,Plasma membrane'}
('GLE1', 'U-2 OS') {'Cytosol,Plasma membrane', 'Golgi apparatus', 'Nuclear membrane,Nucleoli'}
('SLC5A1', 'CACO-2') {'Nucleoplasm,Vesicles', 'Nucleoli,Nucleoplasm', 

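Conclusion: a protein can have different location labels in the same cell line across different images (e.g. KIF14 in A-431 is labelled both 'Cytosol' and 'Cytosol,Plasma membrane'). Note that images with a missing location annotation appear as the label 'nan', so some of the pairs listed above differ only because an annotation is missing (e.g. CEL in U-2 OS has 'Nucleoplasm' and 'nan').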