In [3]:
import pandas as pd
import matplotlib.pyplot as plt


# 1. Load parquet file, check protein ID duplication

In [19]:
fn_parquet = '/data100t1/share/CCHC/CCHC_proteomics_dec2025/McCormic_Chatzi_Below_Explore_HT_DEC_25_NPX_2025-12-05.parquet'
df_data = pd.read_parquet(fn_parquet)
print(df_data.shape)
display(df_data.head(2))
print(df_data.columns)


(9400320, 24)


Unnamed: 0,SampleID,SampleType,WellID,PlateID,DataAnalysisRefID,OlinkID,UniProt,Assay,AssayType,Panel,...,Normalization,PCNormalizedNPX,AssayQC,SampleQC,SoftwareVersion,SoftwareName,PanelDataArchiveVersion,PreProcessingVersion,PreProcessingSoftware,InstrumentType
0,10Y0078,SAMPLE,A1,Rack_1,D10010,OID45511,EXT1,Extension control 1,ext_ctrl,Explore_HT,...,Intensity,0.0,,PASS,1.3.0,NPX Map,1.5.0,1.3.0,NPX Map CLI,Ultima Genomics UG100
1,10Y0221,SAMPLE,A2,Rack_1,D10010,OID45511,EXT1,Extension control 1,ext_ctrl,Explore_HT,...,Intensity,0.0,,PASS,1.3.0,NPX Map,1.5.0,1.3.0,NPX Map CLI,Ultima Genomics UG100


Index(['SampleID', 'SampleType', 'WellID', 'PlateID', 'DataAnalysisRefID',
       'OlinkID', 'UniProt', 'Assay', 'AssayType', 'Panel', 'Block', 'Count',
       'ExtNPX', 'NPX', 'Normalization', 'PCNormalizedNPX', 'AssayQC',
       'SampleQC', 'SoftwareVersion', 'SoftwareName',
       'PanelDataArchiveVersion', 'PreProcessingVersion',
       'PreProcessingSoftware', 'InstrumentType'],
      dtype='object')


## 1.1 Check ID duplication

In [22]:
print('# Check protein ID duplication')
test_sample_id = df_data['SampleID'].iloc[0]
print('# Number of proteins per sample:', len(df_data[df_data['SampleID']==test_sample_id]))
print('# N Unique Olink IDs:', df_data[df_data['SampleID']==test_sample_id]['OlinkID'].nunique())
print('# N Unique UniProt IDs:', df_data[df_data['SampleID']==test_sample_id]['UniProt'].nunique())
print('# N Unique Assasy IDs:', df_data[df_data['SampleID']==test_sample_id]['Assay'].nunique())

# No duplication this time, can use any ID in the reformated file
# But to keep everything the same, still save OlinkID and a ID mapping file

# Check protein ID duplication
# Number of proteins per sample: 5440
# N Unique Olink IDs: 5440
# N Unique UniProt IDs: 5440
# N Unique Assasy IDs: 5440


## 2. Remove control samples and reformat

In [64]:
# for v in df_data.loc[df_data['SampleID']==test_sample_id, '']:
print('# Remove assay control samples:', df_data['AssayType'].unique())
print('# Remove sample control samples:', df_data['SampleType'].unique())
df_data = df_data[(df_data['AssayType']=='assay') & (df_data['SampleType']=='SAMPLE')]

# Remove assay control samples: ['assay']
# Remove sample control samples: ['SAMPLE' 'NEGATIVE_CONTROL' 'SAMPLE_CONTROL' 'PLATE_CONTROL']


In [65]:
# output_path = '/vgipiper04/CCHC/proteomics/batch3'
output_path = '../output'
for col_name in ['NPX', 'PCNormalizedNPX', 'ExtNPX']:
    print(f'# Process {col_name}')
    protein_id = 'OlinkID'
    lst_dfs = []
    for labid, df in df_data.groupby('SampleID'):
        df = df[[protein_id, col_name]].set_index(keys=protein_id).T.reset_index().rename(columns={'index':'LABID'})
        df['LABID'] = labid
        lst_dfs.append(df)
    df_merged = pd.concat(lst_dfs)
    print('# -', df_merged.shape)
    output_fn = f'{output_path}/20251212_CCHC_proteomics.batch3.OlinkID.control_reomved.{col_name}.csv'
    df_merged.to_csv(output_fn, index=False)


# Create an ID mapping file
test_sample_id = df_data['SampleID'].iloc[0]
output_fn = f'{output_path}/olink_uniprot_assay_id_mapping.csv'
df_data.loc[df_data['SampleID']==test_sample_id, ['OlinkID', 'UniProt', 'Assay']].to_csv(output_fn, index=False)

# Process NPX
# - (1548, 5417)
# Process PCNormalizedNPX
# - (1548, 5417)
# Process ExtNPX
# - (1548, 5417)


In [70]:
# There are some samples not belong to BHRC
df_merged['LABID']

0                      10Y0010
0                      10Y0022
0                      10Y0036
0                      10Y0039
0                      10Y0041
               ...            
0                R12-sol-00409
0                R12-sol-00410
0                R12-sol-00411
0                R12-sol-00413
0    Removal.1 DAY 14- 8-31-23
Name: LABID, Length: 1548, dtype: object

In [77]:
print(len([v for v in df_merged['LABID'] if 'sol' in v.lower()]))
print(len([v for v in df_merged['LABID'] if 'glp' in v.lower()]))
print(len([v for v in df_merged['LABID'] if len(v)<6]))
print(len([v for v in df_merged['LABID'] if len(v)>20]))

277
112
88
4
