### Collect data to input into Gibbs Cluster 2.0
- Do not consider peptides that have a placeholder X in their amino acid sequence, because Gibbs Cluster cannot handle it.
- Use only the monoallelic peptides.

In [1]:
import pandas as pd
import os

In [2]:
class DataSet:
    """
    Holds one table of MHC binder peptides merged from several input files.

    The table has the columns Allele, Peptide and Source (plus any extra columns
    present in the input files).
    """

    def __init__(self):
        """
        Initializes the dataset object.
        The mode is set to ABC by default meaning to only look at MHC alleles starting with A, B or C (not E, G).
        Every character of the mode string is treated as one allowed first letter of the allele name.

        """
        self.dataframe = pd.DataFrame({'Allele': [], 'Peptide': [], 'Source': []})
        self.mode = "ABC"

    def get_dataframe(self):
        """
        Returns the dataframe currently stored in the dataset object.

        """
        return self.dataframe

    def add_dfs_to_dataset(self, paths, sources, seps):
        """
        Adds dataframes to dataset object, assumes datasets are annotated with Allele(separator)Peptide.
        Gets you a dataset with Allele, Peptide and Source column.

        Duplicated (Allele, Peptide) pairs are removed, first inside each file and then
        across files; the first occurrence (in the order of ``paths``) is kept, so its
        Source annotation wins. Afterwards only rows whose allele name starts with one
        of the letters in ``self.mode`` are kept.

        Args:
            paths (list): list of paths to dataframes
            sources (list): list of Sources of dataframes (if you want to annotate where the data comes from)
            seps (list): list of separators of dataframes

        Raises:
            AssertionError: if the three lists differ in length, if no path is given,
                or if the dataset already contains rows.

        """
        assert len(paths) == len(sources) == len(seps), "paths, sources and seps must have the same length"
        assert len(self.dataframe) == 0, "dataset is already filled"
        assert len(paths) > 0, "at least one path is required"

        df_list = []
        for path, source, sep in zip(paths, sources, seps):
            df = pd.read_csv(path, sep=sep)
            # remove duplicates inside the input file
            df = df.drop_duplicates(subset=['Allele', 'Peptide'], keep='first')
            # assign() returns a new frame, so no chained-assignment warning can be triggered
            df_list.append(df.assign(Source=source))

        # ignore_index avoids repeated row labels coming from the individual files
        joined_df = pd.concat(df_list, ignore_index=True)
        joined_df = joined_df.drop_duplicates(subset=['Allele', 'Peptide'], keep='first')

        # The first character decides the locus; missing or empty allele names give NaN
        # here and are therefore dropped by isin() instead of breaking the boolean mask.
        wanted_locus = joined_df['Allele'].str[0].isin(list(self.mode))
        self.dataframe = joined_df[wanted_locus].reset_index(drop=True)

    def cleanup_peptide(self):
        """
        removes peptide annotation: AYSSY(Oxidation)KIIK -> AYSSYKIIK.

        Each parenthesised annotation is removed on its own, so residues between two
        annotations are kept: AM(Oxidation)KLM(Oxidation)R -> AMKLMR.
        Nested parentheses are handled as well: AM(Oxidation (M))K -> AMK.

        """
        peptides = self.dataframe["Peptide"]
        # The pattern only matches innermost "(...)" groups; repeating the replacement
        # until nothing changes peels nested annotations from the inside out.
        while True:
            stripped = peptides.str.replace(r"\([^()]*\)", "", regex=True)
            if stripped.equals(peptides):
                break
            peptides = stripped
        self.dataframe["Peptide"] = stripped

In [4]:
# Every readout file is paired with the label that ends up in the Source column,
# so a path and its label cannot get out of step.
binder_inputs = [
    ('../../../DATA/data_readout_final/1_readout_data_mhcmotifatlas_classI_MS_Peptides_all_peptides.csv', 'mhcmotifatlas'),
    ('../../../DATA/data_readout_final/2_1readout_data_MSV000084172_2016.csv', 'msv84172_2016'),
    ('../../../DATA/data_readout_final/2_2readout_data_MSV000084172_2017.csv', 'msv84172_2017'),
    ('../../../DATA/data_readout_final/3_1readout_data_MSV000090323_batch12.csv', 'msv90323_12'),
    ('../../../DATA/data_readout_final/3_2readout_data_MSV000090323_batch15.csv', 'msv90323_15'),
    ('../../../DATA/data_readout_final/4_readout_data_PXD009531.csv', 'pxd9531'),
    ('../../../DATA/data_readout_final/5_1breadout_data_netmhcpan_binder.csv', 'netmhcpan_binder'),
    ('../../../DATA/data_readout_final/mhcflurry_data.csv', 'mhcflurry'),
]
binder_paths = [path for path, _ in binder_inputs]
binder_sources = [source for _, source in binder_inputs]

# All readout files are tab separated.
binders = DataSet()
binders.add_dfs_to_dataset(
    paths=binder_paths,
    sources=binder_sources,
    seps=['\t'] * len(binder_inputs),
)
# Strip modification annotations such as "(Oxidation)" from the peptide strings.
binders.cleanup_peptide()

In [None]:
binder_df = binders.get_dataframe()

# open() does not create missing folders, so make sure the target directory exists.
os.makedirs('gibbs-input', exist_ok=True)

# One input file per allele, one peptide per line.
for allele, group_df in binder_df.groupby('Allele'):
    # Stripping the annotations can turn formerly distinct entries into the same
    # peptide, so de-duplicate again here (first occurrence kept, order preserved).
    # Missing peptides are skipped because they cannot be written as text.
    peptides = group_df['Peptide'].dropna().drop_duplicates().tolist()

    # Write to a txt file
    with open(f'gibbs-input/{allele}.txt', 'w') as file:
        file.write('\n'.join(peptides))

### Run Gibbs Cluster 2.0 for every allele

Cite: Andreatta M, Alvarez B, Nielsen M. GibbsCluster: unsupervised clustering and alignment of peptide sequences. Nucleic Acids Res. 2017 Jul 3;45(W1):W458-W463. doi: 10.1093/nar/gkx248. PMID: 28407089; PMCID: PMC5570237.

In [None]:
# Allele whose peptide file is referenced in the command below.
allele_name = 'A0101'
# Build the GibbsCluster command line for that allele: the input file is
# ../gibbs-input/<allele>.txt and the run name passed to -P is <allele>_final.
# NOTE(review): the "../" paths suggest the command is meant to be run from a
# subdirectory next to the gibbscluster binary - confirm.
command = f'../gibbscluster -f ../gibbs-input/{allele_name}.txt -g 1 -D 4 -I 1 -S 5 -j 2 -T -C -P {allele_name}_final'
# The command is only printed here, not executed.
print(command)

../gibbscluster -f ../gibbs-input/A0101.txt -g 1 -D 4 -I 1 -S 5 -j 2 -T -C -P A0101_final


# Generate filtered peptides dictionary from Gibbs Output
Move this into the Gibbs Output folder.

In [None]:
OUTPUT_FOLDER = '../../../DATA/gibbs-output'

# Directory that is scanned for Gibbs run folders. Every later path is built from
# this one, so listing and reading always refer to the same location regardless of
# the current working directory.
gibbs_output_dir = os.path.join('..', OUTPUT_FOLDER)


def _read_trash_peptides(result_path):
    """
    Returns the second whitespace-separated field of every line in result_path
    that starts with '#Trash'.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    trash_peptides = []
    with open(result_path, 'r') as file:
        for line in file:
            if line.startswith('#Trash'):
                fields = line.split()
                # a '#Trash' line without a second field carries no peptide
                if len(fields) > 1:
                    trash_peptides.append(fields[1])
    return trash_peptides


filtered_peptides = {}

# collect the run folders; sorted because os.listdir returns entries in arbitrary order
run_folders = sorted(
    name for name in os.listdir(gibbs_output_dir)
    if os.path.isdir(os.path.join(gibbs_output_dir, name))
)

# iterating through outputs and collecting filtered peptides in dictionary
for run_folder in run_folders:
    # Run folders are named "<allele>_final..."; anything else in the directory is
    # not a Gibbs run and would only produce a meaningless dictionary key.
    if '_final' not in run_folder:
        continue
    corresponding_allele = run_folder.split('_final')[0]
    result_path = os.path.join(gibbs_output_dir, run_folder, 'res', 'gibbs.1g.out')
    try:
        peptides = _read_trash_peptides(result_path)
    except OSError:
        # Best effort: a run without a result file still gets an (empty) entry.
        print(f'No output file for this allele: {corresponding_allele}')
        peptides = []
    filtered_peptides[corresponding_allele] = peptides

# write filtered peptides to a file to use as input in the allele associated dataset run
# (stored as the dict's Python repr, i.e. str(filtered_peptides))
with open('filtered_peptides.txt', 'w') as file:
    file.write(str(filtered_peptides))