Pre-processing Notebook -- Create a dataset for prediction

In [14]:
import pandas as pd
from Bio import SeqIO

# Collect one plain dict per FASTA record and build the DataFrame once at the
# end. Calling pd.concat inside the loop copies the whole frame for every
# record, which is quadratic in the number of sequences.
rows = []

# Open the fasta file
with open('/home/aarya/Downloads/GTseq/galA.reformat.newline.a2m', 'r') as file:
    for record in SeqIO.parse(file, 'fasta'):
        # Header fields are '|'-separated: first the ID, then (optionally)
        # the family. A header without a '|' yields a missing family.
        header_parts = record.description.split('|')
        uniprot_id = header_parts[0].strip()
        family = header_parts[1].strip() if len(header_parts) > 1 else None

        rows.append({
            'Uniprot_ID': uniprot_id,
            'Family': family,
            'Sequence': str(record.seq),
        })

# Passing `columns` keeps the column order fixed and still produces the
# expected (empty) columns when the FASTA file contains no records.
df = pd.DataFrame(rows, columns=['Uniprot_ID', 'Family', 'Sequence'])

# Save as CSV for training
df.to_csv('/home/aarya/Downloads/GTseq/galA.reformat.newline.8.csv', index=False)


Create a dataset CSV that also includes the donor column

In [1]:
import pandas as pd
from Bio import SeqIO

# Collect one plain dict per FASTA record and build the DataFrame once at the
# end. Calling pd.concat inside the loop copies the whole frame for every
# record, which is quadratic in the number of sequences.
rows = []

# Open the fasta file
with open('/home/aarya/Documents/paper3/DL_donor_specificity_class/training4/testing3v1_8435_n.9.fa', 'r') as file:
    for record in SeqIO.parse(file, 'fasta'):
        # Header fields are '|'-separated in the order ID, family, donor.
        # Family and donor are optional; a missing field is stored as None.
        header_parts = record.description.split('|')
        uniprot_id = header_parts[0].strip()
        family = header_parts[1].strip() if len(header_parts) > 1 else None
        donor = header_parts[2].strip() if len(header_parts) > 2 else None

        rows.append({
            'Uniprot_ID': uniprot_id,
            'Donor': donor,
            'Family': family,
            'Sequence': str(record.seq),
        })

# Passing `columns` keeps the column order fixed and still produces the
# expected (empty) columns when the FASTA file contains no records.
df = pd.DataFrame(rows, columns=['Uniprot_ID', 'Donor', 'Family', 'Sequence'])

# Save as CSV for training
df.to_csv('/home/aarya/Documents/paper3/DL_donor_specificity_class/training4/testing3v1_8435_n.9.csv', index=False)


Parse the FASTA headers and append the family (the `profile=` value) to each sequence ID

In [2]:
import re

def parse_header(header):
    """Reduce a FASTA header line to the form ``>ID | family``.

    The ID is the first whitespace-separated token of ``header`` with its
    first character (the leading ``>``) removed. The family is the run of
    word characters and hyphens that follows ``profile=``; when the header
    has no such field the family is reported as ``'Unknown'``.
    """
    first_token = header.split()[0]
    accession = first_token[1:]

    profile_hit = re.search(r'profile=([\w-]+)', header)
    if profile_hit is None:
        family = 'Unknown'
    else:
        family = profile_hit.group(1)

    return f">{accession} | {family}"

# Rewrite every header line as "<first token> | <profile>" and copy all other
# lines through unchanged. The input is streamed line by line instead of being
# loaded with readlines(), so memory use does not grow with the file size.
# The source file is opened first so that a missing input fails before the
# output file is created or truncated.
with open('/home/aarya/Documents/paper3/family_prediction/uniref50.231020.nogaps.trimmed.v2.a2m', 'r') as src, \
        open('/home/aarya/Documents/paper3/family_prediction/uniref50.231020.nogaps.trimmed.reformatted.v2.a2m', 'w') as dst:
    for line in src:
        if line.startswith('>'):
            header = line.strip()
            # A header without a "profile=" field used to crash with
            # AttributeError (None.group); fall back to 'Unknown', matching
            # what parse_header does for the same situation.
            profile_match = re.search(r'profile=(\S+)', header)
            profile = profile_match.group(1) if profile_match else 'Unknown'
            # Keep only the first space-separated token (">ID").
            truncated_header = header.split(' ')[0]
            dst.write(f"{truncated_header} | {profile}\n")
        else:
            dst.write(line)



Trim Inserts from Original Sequence

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
import re

def trim_fasta_sequence(record, max_insert=50):
    """Shorten over-long lowercase (insert) regions of a record's sequence.

    The sequence is split into alternating runs: runs of uppercase letters
    and '-' (the conserved regions) and everything in between (the insert
    regions). Any all-lowercase run longer than ``max_insert`` is cut down to
    its last ``max_insert`` characters; every other run is kept as is.

    Args:
        record: Object with a ``seq`` attribute (e.g. a SeqRecord). It is
            modified in place: ``record.seq`` is replaced by a new ``Seq``.
        max_insert: Maximum number of characters to keep per insert region.
            ``0`` removes over-long inserts entirely.

    Returns:
        The same ``record`` object, with its sequence trimmed.

    Raises:
        ValueError: If ``max_insert`` is negative.
    """
    if max_insert < 0:
        raise ValueError("max_insert must be non-negative")

    # The capturing group makes re.split keep the conserved runs in the
    # result, so joining the pieces back together reproduces the sequence.
    regions = re.split(r'([A-Z\-]+)', str(record.seq))

    trimmed_regions = []
    for region in regions:
        if region.islower() and len(region) > max_insert:
            # Slice from an explicit start index: region[-max_insert:] would
            # return the *whole* region when max_insert == 0.
            region = region[len(region) - max_insert:]
        trimmed_regions.append(region)

    # Update the record's sequence
    record.seq = Seq(''.join(trimmed_regions))
    return record

def count_uppercase_chars(sequence):
    """Return how many characters of ``sequence`` are uppercase."""
    uppercase_total = 0
    for residue in sequence:
        if residue.isupper():
            uppercase_total += 1
    return uppercase_total

# Minimum number of uppercase characters a sequence must contain to be kept.
# (The previous comment said 100, but the filter has always used 150.)
MIN_UPPERCASE_CHARS = 150

# Read the sequences from a fasta file
fasta_sequences = list(SeqIO.parse('/home/aarya/Documents/paper3/family_prediction/uniref50.231020.nogaps.fasta.cfa', 'fasta'))

# Filter out sequences with fewer than MIN_UPPERCASE_CHARS uppercase characters
filtered_sequences = [
    record for record in fasta_sequences
    if count_uppercase_chars(record.seq) >= MIN_UPPERCASE_CHARS
]

# Trim the sequences (trim_fasta_sequence modifies each record in place)
trimmed_sequences = [trim_fasta_sequence(record) for record in filtered_sequences]

# Write the trimmed sequences to a new fasta file. SeqIO.write returns the
# number of records written; as the last expression it is shown as the cell output.
SeqIO.write(trimmed_sequences, '/home/aarya/Documents/paper3/family_prediction/uniref50.231020.nogaps.trimmed.v2.cfa', 'fasta')


210781

Randomly sample 5K sequences from the original sequence set, then produce training and testing datasets

In [16]:
from Bio import SeqIO
import random

# Number of sequences to place in the training set.
TRAINING_SIZE = 5000
# Fixed seed so the train/test split is reproducible across kernel restarts.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

sequences = list(SeqIO.parse('/home/aarya/Documents/paper3/DL_donor_specificity_class/training2/33k_donors.fasta', 'fasta'))

# Group the records by family, i.e. the second '|'-separated header field.
# A header without that field raises IndexError here.
families = {}
for seq in sequences:
    header = seq.description
    family_name = header.split('|')[1]
    families.setdefault(family_name, []).append(seq)

if not families:
    # Previously this surfaced as a bare ZeroDivisionError further down.
    raise ValueError("no sequences were read from the input FASTA file")

random_sample = []
picked_sequences_ids = set()
samples_per_family = TRAINING_SIZE // len(families)

# First pass: take an equal share from every family (or the whole family
# when it is smaller than its share).
for family_name, seqs in families.items():
    if len(seqs) > samples_per_family:
        sample = rng.sample(seqs, samples_per_family)
    else:
        sample = seqs
    random_sample.extend(sample)
    picked_sequences_ids.update(seq.id for seq in sample)

# Second pass: top up from everything not picked yet. This used to be a
# `while len(random_sample) < 5000` loop, which never terminated when the
# input held fewer sequences than the target; a single pass is enough because
# it either reaches the target or uses up every remaining sequence.
remaining_samples = TRAINING_SIZE - len(random_sample)
if remaining_samples > 0:
    additional_sequences = [seq for family in families.values() for seq in family if seq.id not in picked_sequences_ids]
    if len(additional_sequences) > remaining_samples:
        additional_sequences = rng.sample(additional_sequences, remaining_samples)
    random_sample.extend(additional_sequences)
    picked_sequences_ids.update(seq.id for seq in additional_sequences)

with open('/home/aarya/Documents/paper3/DL_donor_specificity_class/training2/trainingv2_5000.fa', 'w') as f:
    SeqIO.write(random_sample, f, 'fasta')

# Create the testing dataset with the remaining sequences
remaining_sequences = [seq for seq in sequences if seq.id not in picked_sequences_ids]

with open('/home/aarya/Documents/paper3/DL_donor_specificity_class/training2/testingv2_30000.fa', 'w') as f:
    SeqIO.write(remaining_sequences, f, 'fasta')


Variant of the cell above that also stratifies the sample by donor within each family:

In [1]:
from Bio import SeqIO
import random

# Number of sequences to place in the training set.
TRAINING_SIZE = 1000
# Fixed seed so the train/test split is reproducible across kernel restarts.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

sequences = list(SeqIO.parse('/home/aarya/Documents/paper3/DL_donor_specificity_class/training2/33k_donors.fasta', 'fasta'))

# Group the records as families[family][substrate] -> list of records, using
# the second and third '|'-separated header fields. A header with fewer than
# three fields raises IndexError here.
families = {}
for seq in sequences:
    header = seq.description.split('|')
    family_name = header[1]
    substrate = header[2]
    families.setdefault(family_name, {}).setdefault(substrate, []).append(seq)

if not families:
    # Previously this surfaced as a bare ZeroDivisionError further down.
    raise ValueError("no sequences were read from the input FASTA file")

random_sample = []
picked_sequences_ids = set()
samples_per_family = TRAINING_SIZE // len(families)

# First pass: split each family's share evenly across its substrates and take
# that many from every substrate group (or the whole group when it is smaller).
for family_name, substrates in families.items():
    samples_per_substrate = samples_per_family // len(substrates)
    for substrate, seqs in substrates.items():
        if len(seqs) > samples_per_substrate:
            sample = rng.sample(seqs, samples_per_substrate)
        else:
            sample = seqs
        random_sample.extend(sample)
        picked_sequences_ids.update(seq.id for seq in sample)

# Second pass: top up from everything not picked yet. This used to be a
# `while len(random_sample) < 1000` loop, which never terminated when the
# input held fewer sequences than the target; a single pass is enough because
# it either reaches the target or uses up every remaining sequence.
remaining_samples = TRAINING_SIZE - len(random_sample)
if remaining_samples > 0:
    additional_sequences = [seq for family in families.values() for substrate in family.values() for seq in substrate if seq.id not in picked_sequences_ids]
    if len(additional_sequences) > remaining_samples:
        additional_sequences = rng.sample(additional_sequences, remaining_samples)
    random_sample.extend(additional_sequences)
    picked_sequences_ids.update(seq.id for seq in additional_sequences)

with open('/home/aarya/Documents/paper3/DL_donor_specificity_class/training3/training3v1_1000.fa', 'w') as f:
    SeqIO.write(random_sample, f, 'fasta')

# Create the testing dataset with the remaining sequences
remaining_sequences = [seq for seq in sequences if seq.id not in picked_sequences_ids]

with open('/home/aarya/Documents/paper3/DL_donor_specificity_class/training3/testing3v1_36000.fa', 'w') as f:
    SeqIO.write(remaining_sequences, f, 'fasta')


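Convert the 33K-sequence CSV into a FASTA file. The header currently contains only the GT-A family; the fuller `uniprot|GT-A Family|donor` header is left commented out in the code.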

In [1]:
import pandas as pd

# Read the table of sequences
data = pd.read_csv("./train_test_v3.csv")

# Emit one two-line FASTA entry per CSV row: a header line holding the
# 'GT-A Family' value, then the 'sequence' value.
# Fuller header variant, currently unused: {uniprot}|{GT-A Family}|{donor}
with open("./train_test_v3.fasta", "w") as fasta_out:
    for family, sequence in zip(data['GT-A Family'], data['sequence']):
        fasta_out.write(f">{family}\n")
        fasta_out.write(f"{sequence}\n")

In [9]:
# import pandas as pd
# from Bio import SeqIO

# # Define a new DataFrame with the required columns
# df = pd.DataFrame(columns=['Family', 'Sequence'])

# # Specify the path to your fasta file
# fasta_file_path = '/home/aarya/Documents/paper3/family_prediction/uniref50.231020.nogaps.trimmed.reformatted.v2.80.a2m'

# # Open the fasta file
# with open(fasta_file_path, 'r') as file:
#     for record in SeqIO.parse(file, 'fasta'):
#         # The header is just the family name
#         family = record.id.strip()

#         # Append a new row to the DataFrame
#         new_row = pd.DataFrame({
#             'Family': [family],
#             'Sequence': [str(record.seq)]
#         })
#         df = pd.concat([df, new_row], ignore_index=True)

# # Specify the path where you want to save your CSV file
# csv_file_path = '/home/aarya/Documents/paper3/family_prediction/uniref50.231020.80gaps.v2.csv'

# # Save as CSV
# df.to_csv(csv_file_path, index=False)

import pandas as pd
from Bio import SeqIO

# Initialize a list to hold the data
data = []

# Specify the path to your fasta file
fasta_file_path = '/home/aarya/Documents/paper3/family_prediction/uniref50.231020.nogaps.trimmed.reformatted.v2.80.a2m'

# Open the fasta file
with open(fasta_file_path, 'r') as file:
    for record in SeqIO.parse(file, 'fasta'):
        # Parse the full header line (record.description), not record.id:
        # record.id stops at the first whitespace, so a header written as
        # ">ID | family" would lose its family and every row would end up
        # as 'Unknown'.
        header_parts = record.description.split('|')

        # Use the first whitespace-separated token of each field, so that
        # "ID|family", "ID | family" and "ID|family extra text" all give the
        # same (ID, family) pair. An empty or missing field becomes 'Unknown'.
        id_tokens = header_parts[0].split()
        family_tokens = header_parts[1].split() if len(header_parts) > 1 else []
        uniref_id = id_tokens[0] if id_tokens else 'Unknown'
        family = family_tokens[0] if family_tokens else 'Unknown'

        # Append the data to the list
        data.append({'UniRef ID': uniref_id, 'Family': family, 'Sequence': str(record.seq)})

# Create a DataFrame from the list
df = pd.DataFrame(data, columns=['UniRef ID', 'Family', 'Sequence'])

# Specify the path where you want to save your CSV file
csv_file_path = '/home/aarya/Documents/paper3/family_prediction/uniref50.231020.80gaps.v2.csv'

# Save as CSV
df.to_csv(csv_file_path, index=False)

