# AlphaCross-XL Format Converter

AlphaCross-XL uses 'Peptide-centric Format' for parsing the position of the cross-linking residues of the protein.

This script provides certain functionality that allows conversion of 'Protein-centric Format' to 'Peptide-centric Format' for use in AlphaCross-XL

This script is given as a foundation and uses the following assumptions.
- The cross-linking position is the 1-indexed position of the residue in the protein, prefixed with the residue letter (e.g. `K68`).
- The sequences of both cross-linked peptides are provided.
- Reference Protein Sequence for the chosen species is from UniProt
- Multiple cross-links between the same two peptides are given on a single line, delimited by a unique character termed the "separator" (e.g. `;`).


### Dependencies
You need the following Python Packages
- Pandas
- ProDy (installed in the following block)
- PyFastx (installed in the following block)
- requests, os, shutil, json (should come along with your Python distribution)

### NOTE
- If Colab asks you to restart the session, you can do so.
- If you are running this script locally, you only need to run the pip install once!
- If you are running locally, comment out the line "os.chdir('/content/')" in the second code block and in the bulk-conversion block.



In [None]:
!pip install prody pyfastx

In [None]:
import requests, json
import os, pathlib, shutil
import re
import pandas as pd

# Only for Google Colab: make '/content/' the working directory so that
# os.getcwd() (stored as FormatXLChange.base_dir and used to locate the bulk
# input file) points there.
# Comment this out if not on Colab -- the path is Colab-specific.
os.chdir('/content/')
######################

class FormatXLChange:
    """Map protein-centric cross-link sites onto the two cross-linked peptides.

    Typical usage calls, in order:

    1. ``process_input_uniprot_entry()`` -- download the reference sequence
       of the UniProt entry.
    2. ``process_peptide_sequences_for_given_uniprot_entry_sequence()`` --
       locate both peptides in that sequence.
    3. ``get_relative_link_sites()`` -- convert the link sites.

    Link sites are written as ``K<position>``, where ``<position>`` is the
    1-based residue number in the protein; several sites of one peptide are
    joined with ``input_xl_peptide_link_site_separator``.  The peptide
    start/end attributes are 0-based offsets with an exclusive end.

    Progress and problems are reported with ``print`` rather than exceptions,
    except where a link site is inconsistent with its peptide.
    """

    # Seconds to wait for UniProt before giving up, so a stalled connection
    # cannot hang the notebook forever.
    UNIPROT_REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self,
                 input_uniprot_entry,
                 input_xl_peptide_a,
                 input_xl_peptide_b,
                 input_xl_peptide_link_site_data_a,
                 input_xl_peptide_link_site_data_b,
                 input_xl_peptide_link_site_separator,
                ):
        """Store the inputs and initialise all derived attributes to ``None``.

        Args:
            input_uniprot_entry: UniProt accession of the reference protein.
            input_xl_peptide_a: Sequence of cross-linked peptide A.
            input_xl_peptide_b: Sequence of cross-linked peptide B.
            input_xl_peptide_link_site_data_a: Link site(s) of peptide A,
                e.g. ``"K68;K74"``.
            input_xl_peptide_link_site_data_b: Link site(s) of peptide B.
            input_xl_peptide_link_site_separator: Character separating
                multiple link sites in the two fields above.
        """
        # Input Data
        self.input_uniprot_entry = input_uniprot_entry
        self.input_xl_peptide_a = input_xl_peptide_a
        self.input_xl_peptide_b = input_xl_peptide_b
        self.input_xl_peptide_link_site_a = input_xl_peptide_link_site_data_a
        self.input_xl_peptide_link_site_b = input_xl_peptide_link_site_data_b
        self.input_xl_peptide_link_site_separator = input_xl_peptide_link_site_separator

        self.base_dir = os.getcwd()

        # Input UniProt Entry Data
        self.uniprot_entry_type = None

        # UniProt Entry Sequence Data
        self.uniprot_entry_sequence_data = None
        self.uniprot_entry_sequence_value = None
        self.uniprot_entry_sequence_length = None

        # XL Peptides Data
        self.xl_peptide_a_found_unique = None
        self.xl_peptide_b_found_unique = None
        self.xl_peptide_a_found_multiple = None
        self.xl_peptide_b_found_multiple = None
        self.xl_peptide_a_start_residue_num = None
        self.xl_peptide_b_start_residue_num = None
        self.xl_peptide_a_end_residue_num = None
        self.xl_peptide_b_end_residue_num = None

    # Helper Functions
    def _get_peptide_match_start_list(self, xl_peptide_sequence, reference_sequence):
        """Return the 0-based start offsets of every occurrence of the peptide.

        The peptide is matched literally (not as a regular expression) and
        overlapping occurrences are all reported, so a peptide that occurs
        more than once is never mistaken for a unique one.  An empty peptide
        yields no matches.
        """
        if not xl_peptide_sequence:
            return []

        peptide_match_start_list = []
        match_start = reference_sequence.find(xl_peptide_sequence)
        while match_start != -1:
            peptide_match_start_list.append(match_start)
            # Resume one residue later so overlapping occurrences are found.
            match_start = reference_sequence.find(xl_peptide_sequence, match_start + 1)
        return peptide_match_start_list

    def _check_peptide_uniqueness(self, xl_peptide_seq, xl_peptide_id, xl_peptide_match_start_list):
        """Classify a peptide by its number of matches and report the outcome.

        Returns:
            Tuple ``(found_unique, found_multiple)``; both are ``False`` when
            the peptide does not occur in the protein sequence.
        """
        match_count = len(xl_peptide_match_start_list)
        xl_peptide_found_unique = match_count == 1
        xl_peptide_found_multiple = match_count > 1

        if match_count == 0:
            print(f'XL Peptide {xl_peptide_id}: {xl_peptide_seq} not found in the protein sequence!')
        elif xl_peptide_found_unique:
            print(f'XL Peptide {xl_peptide_id}: {xl_peptide_seq} found uniquely in the protein sequence!')
        else:
            print(f'XL Peptide {xl_peptide_id}: {xl_peptide_seq} found multiple times in the protein sequence!')
        return xl_peptide_found_unique, xl_peptide_found_multiple

    def _set_peptide_start_end_residue_num(self, xl_peptide_id, xl_peptide_match_start_list, xl_peptide_length):
        """Store the start/end offsets of peptide ``'A'`` or ``'B'``.

        Relies on the ``found_unique`` / ``found_multiple`` flags of that
        peptide having been set beforehand.  A unique peptide gets scalar
        offsets, a repeated peptide gets lists of offsets, and a missing
        peptide gets ``None``.  The end offset is exclusive.

        Raises:
            Exception: If ``xl_peptide_id`` is neither ``'A'`` nor ``'B'``.
        """
        if xl_peptide_id not in ('A', 'B'):
            raise Exception(f'XL Peptide ID can only be A or B, got ID: {xl_peptide_id}')

        suffix = xl_peptide_id.lower()
        if getattr(self, f'xl_peptide_{suffix}_found_unique'):
            start_residue_num = xl_peptide_match_start_list[0]
            end_residue_num = start_residue_num + xl_peptide_length
        elif getattr(self, f'xl_peptide_{suffix}_found_multiple'):
            start_residue_num = xl_peptide_match_start_list
            end_residue_num = [start + xl_peptide_length for start in xl_peptide_match_start_list]
        else:
            start_residue_num = None
            end_residue_num = None

        setattr(self, f'xl_peptide_{suffix}_start_residue_num', start_residue_num)
        setattr(self, f'xl_peptide_{suffix}_end_residue_num', end_residue_num)

    def process_input_uniprot_entry(self):
        """Fetch the sequence of ``input_uniprot_entry`` from the UniProt REST API.

        On success ``uniprot_entry_type`` and ``uniprot_entry_sequence_data``
        (a dict with at least ``'value'`` and ``'length'``) are populated.
        Every failure is reported with ``print`` and leaves
        ``uniprot_entry_sequence_data`` untouched; nothing is raised.
        """
        uniprot_search_url = f'https://rest.uniprot.org/uniprotkb/search?query=accession:{self.input_uniprot_entry}&fields=sequence'
        try:
            response = requests.get(uniprot_search_url, timeout=self.UNIPROT_REQUEST_TIMEOUT_SECONDS)
            response_data = json.loads(response.text)
            search_results = response_data['results']
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
            # Network failure, non-JSON body, or a payload without 'results'.
            print("UniProt Endpoint Unavailable/API Path Malformed!")
            print(f"Execution Stopped")
            return

        if len(search_results) == 0:
            print(f"The UniProt ID: {self.input_uniprot_entry} didn't return any results in the UniProtKB Database")
            if 'error' in response_data.keys():
                print(f"UniProt Error: {response_data['error']}")
            return

        if len(search_results) > 1:
            print(f"The The UniProt ID: {self.input_uniprot_entry} returned multiple primary_accessions in the UniProtKB Database")
            print(f"Execution Terminated. Multiple UniProt Entries")
            return

        print(f"The UniProt ID: {self.input_uniprot_entry} returned a unique entry in the database!")
        uniprot_id_entry_data = search_results[0]
        self.uniprot_entry_type = uniprot_id_entry_data.get("entryType")
        print(f"The UniProt ID: {self.input_uniprot_entry}'s UniProtKB Entry Type is: {self.uniprot_entry_type}")

        sequence_data = uniprot_id_entry_data.get("sequence")
        # Only accept sequence data that the later steps can actually use.
        if (not isinstance(sequence_data, dict)
                or 'value' not in sequence_data
                or 'length' not in sequence_data):
            print(f"The UniProt ID: {self.input_uniprot_entry} returned no sequence data!")
            print(f"Execution Terminated. No sequence Data found for {self.input_uniprot_entry}.")
            return

        self.uniprot_entry_sequence_data = sequence_data
        print(f"The UniProt ID: {self.input_uniprot_entry}'s Sequence Length is: {self.uniprot_entry_sequence_data['length']}")

    def process_peptide_sequences_for_given_uniprot_entry_sequence(self):
        """Locate both peptides in the fetched protein sequence.

        Sets the ``found_unique`` / ``found_multiple`` flags and the start/end
        offsets of peptides A and B.  If no sequence data is available (for
        example because the UniProt lookup failed) a message is printed and
        all peptide attributes stay ``None``.
        """
        if not self.uniprot_entry_sequence_data:
            print(f"No sequence data available for {self.input_uniprot_entry}. Peptides were not mapped.")
            return

        self.uniprot_entry_sequence_value = self.uniprot_entry_sequence_data['value']
        self.uniprot_entry_sequence_length = self.uniprot_entry_sequence_data['length']

        # Peptide A
        peptide_match_start_list_a = self._get_peptide_match_start_list(
            self.input_xl_peptide_a, self.uniprot_entry_sequence_value)
        self.xl_peptide_a_found_unique, self.xl_peptide_a_found_multiple = self._check_peptide_uniqueness(
            self.input_xl_peptide_a,
            'A',
            peptide_match_start_list_a)
        self._set_peptide_start_end_residue_num(
            'A',
            peptide_match_start_list_a,
            len(self.input_xl_peptide_a)
        )

        # Peptide B
        peptide_match_start_list_b = self._get_peptide_match_start_list(
            self.input_xl_peptide_b, self.uniprot_entry_sequence_value)
        self.xl_peptide_b_found_unique, self.xl_peptide_b_found_multiple = self._check_peptide_uniqueness(
            self.input_xl_peptide_b,
            'B',
            peptide_match_start_list_b)
        self._set_peptide_start_end_residue_num(
            'B',
            peptide_match_start_list_b,
            len(self.input_xl_peptide_b)
        )

    def get_relative_link_sites(self):
        """Convert the protein-level link sites into peptide-level positions.

        Returns:
            Dict with keys ``'peptide_a'`` and ``'peptide_b'``.  Each value is
            a list of 1-based positions within the peptide, or ``None`` when
            that peptide was not found uniquely in the protein sequence.

        Raises:
            ValueError: If a link site is not an integer, falls outside its
                peptide, or does not point at a lysine (``K``).
        """
        def _process_link_sites(link_site_data, separator):
            # Remove 'K' prefix and split if multiple sites; blank tokens
            # (e.g. from a trailing separator) are ignored.
            raw_tokens = str(link_site_data).replace('K', '').split(separator)
            sites = [token.strip() for token in raw_tokens if token.strip()]
            print(f"The link sites are: {sites}")
            return [int(site) for site in sites]

        def _calculate_relative_positions(link_sites, start_pos, peptide_seq):
            relative_positions = []
            for site in link_sites:
                # Link sites are 1-based protein positions; start_pos is 0-based.
                relative_pos = (site - 1) - start_pos
                # Reject out-of-range sites explicitly: a negative index would
                # otherwise silently wrap around to the end of the peptide.
                if not 0 <= relative_pos < len(peptide_seq):
                    raise ValueError(
                        f"Link site {site} lies outside peptide {peptide_seq} "
                        f"(protein residues {start_pos + 1}-{start_pos + len(peptide_seq)})")
                # Verify the link site corresponds to a lysine
                if peptide_seq[relative_pos] != 'K':
                    raise ValueError(f"Link site {site} does not correspond to a lysine in peptide {peptide_seq}")
                relative_positions.append(relative_pos + 1)  # Convert back to 1-based
            return relative_positions

        results = {
            'peptide_a': None,
            'peptide_b': None
        }

        peptides = (
            ('peptide_a', self.xl_peptide_a_found_unique, self.xl_peptide_a_start_residue_num,
             self.input_xl_peptide_link_site_a, self.input_xl_peptide_a),
            ('peptide_b', self.xl_peptide_b_found_unique, self.xl_peptide_b_start_residue_num,
             self.input_xl_peptide_link_site_b, self.input_xl_peptide_b),
        )
        for result_key, found_unique, start_residue_num, link_site_data, peptide_seq in peptides:
            # Only a uniquely placed peptide has an unambiguous start offset.
            if found_unique and start_residue_num is not None:
                link_sites = _process_link_sites(link_site_data,
                                                 self.input_xl_peptide_link_site_separator)
                results[result_key] = _calculate_relative_positions(
                    link_sites,
                    start_residue_num,
                    peptide_seq
                )

        return results


def process_crosslinks(data_file):
    """Convert every cross-link listed in a CSV/XLSX file.

    The file must contain the columns ``Peptide-A``, ``Link-Site-A``,
    ``Peptide-B``, ``Link-Site-B``, ``uniprotID`` and ``X-link type``.
    Multiple link sites in one cell are expected to be separated by ``;``.
    For each row a ``FormatXLChange`` instance looks up the UniProt entry,
    locates both peptides and converts the link sites.

    Args:
        data_file: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file.

    Returns:
        A list with one dict per input row, holding ``row_index``,
        ``uniprot_id``, both peptides, the absolute link sites as given, and
        the relative link sites (a list of positions, or ``None`` when the
        peptide could not be placed uniquely or no sequence was retrieved).

    Raises:
        Exception: If the file has an unsupported extension or cannot be read.
        KeyError: If a required column is missing.
    """
    # First parse the file
    ## Get the file extension in lowercase
    file_extension = os.path.splitext(data_file)[1].lower()

    try:
        if file_extension == '.csv':
            df = pd.read_csv(data_file)
        elif file_extension in ('.xlsx', '.xls'):
            df = pd.read_excel(data_file)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")
    except Exception as e:
        raise Exception(f"Error reading file {data_file}: {str(e)}") from e

    # Fail early, with a clear message, instead of part-way through the rows.
    required_columns = ['Peptide-A', 'Link-Site-A', 'Peptide-B', 'Link-Site-B',
                        'uniprotID', 'X-link type']
    missing_columns = [column for column in required_columns if column not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required column(s) in {data_file}: {missing_columns}")

    # Initialize list to store processed results
    results = []

    for index, row in df.iterrows():
        # Extract data from row
        xl_data = {
            'peptide_a': row['Peptide-A'],
            'link_site_a': row['Link-Site-A'],
            'peptide_b': row['Peptide-B'],
            'link_site_b': row['Link-Site-B'],
            'uniprot_id': row['uniprotID'],
            'xl_type': row['X-link type']
        }

        # Create FormatXLChange instance for this cross-link
        xlc = FormatXLChange(
            input_uniprot_entry=xl_data['uniprot_id'],
            input_xl_peptide_a=xl_data['peptide_a'],
            input_xl_peptide_b=xl_data['peptide_b'],
            input_xl_peptide_link_site_data_a=xl_data['link_site_a'],
            input_xl_peptide_link_site_data_b=xl_data['link_site_b'],
            input_xl_peptide_link_site_separator=';'
        )

        # Process the UniProt entry
        xlc.process_input_uniprot_entry()

        # Get peptide status -- only possible when a sequence was retrieved;
        # otherwise the row is kept with relative link sites set to None.
        if xlc.uniprot_entry_sequence_data is not None:
            xlc.process_peptide_sequences_for_given_uniprot_entry_sequence()

        # Get relative link sites
        relative_positions = xlc.get_relative_link_sites()

        # Store results
        results.append({
            'row_index': index,
            'uniprot_id': xl_data['uniprot_id'],
            'peptide_a': xl_data['peptide_a'],
            'peptide_b': xl_data['peptide_b'],
            'absolute_link_site_a': xl_data['link_site_a'],
            'absolute_link_site_b': xl_data['link_site_b'],
            'relative_link_site_a': relative_positions['peptide_a'],
            'relative_link_site_b': relative_positions['peptide_b']
        })

    return results

def _pair_link_sites(absolute_field, relative_field, separator=';'):
    """Pair each absolute link site of one peptide with its relative position.

    Returns a list of ``(absolute_site, relative_site)`` tuples in which
    ``absolute_site`` is the site number as a string (``'K'`` prefix removed)
    and ``relative_site`` is ``None`` when no relative position is known.
    """
    tokens = str(absolute_field).replace('K', '').split(separator)
    absolute_sites = [token.strip() for token in tokens if token.strip()]

    if isinstance(relative_field, list):
        relative_sites = list(relative_field)
    else:
        relative_sites = [relative_field]
    # Pad with None so that absolute sites lacking a relative position
    # (peptide not placed uniquely) are still written out instead of dropped.
    relative_sites += [None] * (len(absolute_sites) - len(relative_sites))

    return list(zip(absolute_sites, relative_sites))


def save_crosslink_results(results, output_file="processed_crosslinks.csv"):
    """Write one CSV row per (site on peptide A, site on peptide B) combination.

    Args:
        results: List of dicts as produced by ``process_crosslinks`` (keys
            ``row_index``, ``uniprot_id``, ``peptide_a``, ``peptide_b``,
            ``absolute_link_site_a/b`` and ``relative_link_site_a/b``).
        output_file: Destination CSV path.

    Returns:
        ``output_file``.

    Link sites are written with their residue letter (e.g. ``K68`` / ``K8``).
    When the relative position of a site is unknown, the relative column is
    left empty and the residue letter defaults to ``K``.  An empty ``results``
    list produces a CSV containing only the header row.
    """
    # Define column order
    columns = [
        'row_index',
        'uniprot_id',
        'peptide_a',
        'peptide_b',
        'absolute_link_site_a',
        'absolute_link_site_b',
        'relative_link_site_a',
        'relative_link_site_b'
    ]

    # Initialize list to store expanded results
    expanded_results = []

    for result in results:
        # Get link sites for peptides A and B
        site_pairs_a = _pair_link_sites(result['absolute_link_site_a'], result['relative_link_site_a'])
        site_pairs_b = _pair_link_sites(result['absolute_link_site_b'], result['relative_link_site_b'])

        # Create cartesian product of link sites
        for abs_site_a, rel_site_a in site_pairs_a:
            for abs_site_b, rel_site_b in site_pairs_b:
                # Get amino acids at link sites
                aa_a = result['peptide_a'][rel_site_a - 1] if rel_site_a else 'K'
                aa_b = result['peptide_b'][rel_site_b - 1] if rel_site_b else 'K'

                expanded_results.append({
                    'row_index': result['row_index'],
                    'uniprot_id': result['uniprot_id'],
                    'peptide_a': result['peptide_a'],
                    'peptide_b': result['peptide_b'],
                    'absolute_link_site_a': f'{aa_a}{abs_site_a}',
                    'absolute_link_site_b': f'{aa_b}{abs_site_b}',
                    'relative_link_site_a': f'{aa_a}{rel_site_a}' if rel_site_a else '',
                    'relative_link_site_b': f'{aa_b}{rel_site_b}' if rel_site_b else ''
                })

    # Passing ``columns`` fixes the column order and keeps the header even
    # when there are no rows at all.
    df = pd.DataFrame(expanded_results, columns=columns)
    df.to_csv(output_file, index=False)

    return output_file


## Single pair usage

The following code block demonstrates the script's ability to work on a single pair of cross-linked peptides with multiple cross-links, separated using the separator ';'.

The output shows the relative positions of the cross-linking residues using the peptide-centric approach.

In [None]:
# Single Usage
# Link sites are 1-based residue numbers in the protein, prefixed with 'K';
# peptide A carries two link sites joined by the separator.
xlc = FormatXLChange(
    input_uniprot_entry="P63104",
    input_xl_peptide_a="VVSSIEQKTEGAEK",
    input_xl_peptide_b="KQQMAR",
    input_xl_peptide_link_site_data_a="K68;K74",
    input_xl_peptide_link_site_data_b="K75",
    input_xl_peptide_link_site_separator=";"
)
# The three steps must run in this order: fetch the sequence from UniProt
# (needs network access), locate both peptides in it, then convert the sites.
xlc.process_input_uniprot_entry()
xlc.process_peptide_sequences_for_given_uniprot_entry_sequence()
# Dict with keys 'peptide_a' / 'peptide_b'; each value is a list of 1-based
# positions within the peptide, or None if the peptide was not placed uniquely.
relative_positions = xlc.get_relative_link_sites()

# Bare expression: displayed as the cell output in a notebook.
relative_positions

## Bulk conversion using a XLSX/CSV File
You can also bulk convert multiple pairs of cross-links using the following code snippet (make sure you edit the file name).

An example file for the format converter is given in the "example" subdirectory of the AlphaCross-XL GitHub

### File Format
The file should have the following columns, which are self-explanatory (the column names must match exactly!)
- Peptide-A
- Link-Site-A
- Peptide-B
- Link-Site-B
- uniprotID
- X-link type

#### IMPORTANT
Any other columns of the input file are not carried over to the output! (Use the `row_index` column of the output file to recover the lost information from the original file.)

In [None]:
# Only for Colab -- comment this line out when running locally.
os.chdir('/content/')
# The input file is looked up in the current working directory.
main_base_dir = os.getcwd()
csv_file = "input_csv_formatxlchange.csv"
# One result dict per input row; each row triggers its own UniProt lookup.
results = process_crosslinks(os.path.join(main_base_dir, csv_file))

# Print first few results
for result in results[:3]:
    print(f"\nProcessing cross-link for {result['uniprot_id']}:")
    print(f"Peptide A: {result['peptide_a']}")
    print(f"Absolute link site A: {result['absolute_link_site_a']}")
    print(f"Relative link site A: {result['relative_link_site_a']}")
    print(f"Peptide B: {result['peptide_b']}")
    print(f"Absolute link site B: {result['absolute_link_site_b']}")
    print(f"Relative link site B: {result['relative_link_site_b']}")
# Save the file (one CSV row per combination of link sites on A and B)
save_crosslink_results(results, output_file="processed_crosslinks.csv")