# RANDOM SAMPLING OF 5000 Files

In [1]:
import pandas as pd
import numpy as np

# Define a function to extract the raw file name
def extract_raw_file_name(filename):
    """Return the part of ``filename`` that precedes its first '.'.

    When ``filename`` contains no '.', the whole string is returned
    unchanged.
    """
    raw_name, _, _ = filename.partition('.')
    return raw_name

# Number of distinct raw files to sample. Defined once so the sampling call
# and the output file names cannot drift apart.
N_RAWFILES = 5000

# Read in the target PSM table.
# NOTE: despite the .tsv extension the file is parsed as comma-separated.
target = pd.read_csv('subset_750_target_FDR_001_psm.tsv', sep=',')

# Derive the raw file name from each PSMId and store it in a 'rawfile' column
target['rawfile'] = target['PSMId'].apply(extract_raw_file_name)

# Get the unique rawfile names
unique_rawfiles = target['rawfile'].unique()

# Sampling without replacement needs at least N_RAWFILES candidates; fail
# early with the actual counts instead of NumPy's generic message.
if len(unique_rawfiles) < N_RAWFILES:
    raise ValueError(
        f'Cannot sample {N_RAWFILES} raw files: the target table contains '
        f'only {len(unique_rawfiles)} unique raw files'
    )

# Randomly select N_RAWFILES rawfile names without replacement.
# No seed is set here, so the selection differs between runs.
selected_rawfiles = np.random.choice(
    unique_rawfiles, size=N_RAWFILES, replace=False
)

# Create a subset of the target data with the selected raw files; the helper
# column is dropped again so the output keeps the input's columns.
target_subset = target[target['rawfile'].isin(selected_rawfiles)]
target_subset = target_subset.drop(columns=['rawfile'])

# Read in the decoy PSM table (also parsed as comma-separated)
decoy = pd.read_csv('subset_750_decoy_FDR_001_psm.tsv', sep=',')

# Derive the raw file name from each PSMId and store it in a 'rawfile' column
decoy['rawfile'] = decoy['PSMId'].apply(extract_raw_file_name)

# Keep only the decoy rows that belong to the raw files selected from the
# target table, so both subsets cover the same raw files.
decoy_subset = decoy[decoy['rawfile'].isin(selected_rawfiles)]
decoy_subset = decoy_subset.drop(columns=['rawfile'])

# Print the first rows of each subset as a quick visual spot check.
print(decoy_subset.head())
print(target_subset.head())


# Write the target subset to a new file (to_csv's default separator is a
# comma, matching how the inputs were read).
target_subset.to_csv(f'subset_{N_RAWFILES}_target_random_rawfiles.tsv', index=False)

# Write the decoy subset to a new file
decoy_subset.to_csv(f'subset_{N_RAWFILES}_decoy_random_rawfiles.tsv', index=False)

                           PSMId     score   q-value  posterior_error_prob  \
0  366_6__H_Koll_1.7788.7788.2_1  0.498612  0.000351              0.011317   
1  366_6__H_Koll_1.6984.6984.3_1  0.452610  0.000523              0.015460   
2  366_6__H_Koll_1.8694.8694.4_1  0.444234  0.000694              0.016360   
3  366_6__H_Koll_1.6746.6746.2_1  0.437302  0.000866              0.017144   
4  366_6__H_Koll_1.3350.3350.3_1  0.430924  0.001007              0.017897   

                              peptide  \
0                         R.IPILVAR.M   
1             K.DAITSNLEITK[6.0201].F   
2           R.WHHNELVSMNQYLNALHHNTK.I   
3                      K.LGALEEELAR.L   
4  R.VC[57.0215]SALDLGEAK[6.0201]RR.F   

                                       proteinIds  
0                       rev_sp|Q17RN3|FA98C_HUMAN  
1  rev_sp|Q9Y6V0-6|PCLO_;rev_sp|Q9Y6V0|PCLO_HUMAN  
2                       rev_sp|Q9NP56|PDE7B_HUMAN  
3  rev_sp|O95347-2|SMC2_;rev_sp|O95347|SMC2_HUMAN  
4                       