In [1]:
import sys
import pandas as pd
from Bio import SeqIO
from Bio import SearchIO
import os
import glob
from pathlib import Path
from Bio import AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import numpy as np
import networkx as nx
from itertools import combinations
from pybedtools import BedTool
import re

In [2]:
import logomaker
import matplotlib.pyplot as plt

In [3]:
# Make the MAT data directory the working directory; the relative file names
# used by the later cells are resolved against it.
mat_dir = '../../data/MAT'
os.chdir(mat_dir)

In [4]:
# Read every FASTA record from candidate_filtered.faa into a list so the
# records can be iterated more than once (SeqIO.parse itself is a one-shot
# iterator).
candidate_records = SeqIO.parse("candidate_filtered.faa", "fasta")
faa = list(candidate_records)

In [5]:
# Second filtering pass: keep only records whose sequence contains a
# C-[AILGV]-[AILGV]-X stretch (X = any upper-case letter).
# `aliphatic_aa` lists the same residues as the character class in the pattern;
# the pattern itself is written out literally.
# NOTE(review): the search is not anchored, so the motif may occur anywhere in
# the sequence, not only at the C-terminus - confirm that this is intended.
aliphatic_aa = ['A', 'I', 'L', 'G', 'V']
caax_pattern = re.compile(r'C[AILGV]{2}[A-Z]')
CAAX_candidates = []
for record in faa:
    if caax_pattern.search(str(record.seq)) is None:
        continue
    # Blank the description (in place, so the records in `faa` are affected
    # too); only the id is parsed by the later cells.
    record.description = ''
    CAAX_candidates.append(record)

In [19]:
# Select the candidates whose id contains 'HapB_CHR05' and, separately, those
# whose id contains 'HapA_CHR05'. Input order is preserved in both lists.
CHR5B_mfa = [rec for rec in CAAX_candidates if re.search(r'HapB_CHR05', rec.id)]
CHR5A_mfa = [rec for rec in CAAX_candidates if re.search(r'HapA_CHR05', rec.id)]

In [25]:
# Order the HapB CHR05 candidates from longest to shortest sequence
# (records of equal length keep their relative order).
def _seq_length(rec):
    """Return the number of characters in the record's sequence."""
    return len(rec.seq)


CHR5B_mfa = sorted(CHR5B_mfa, key=_seq_length, reverse=True)

In [41]:
# Bin the HapB CHR05 candidates by len(record.seq):
#   <= 32      -> CHR5B_mfa_2
#   33 .. 70   -> CHR5B_mfa_1
#   > 70       -> CHR5B_mfa_3
# The order of CHR5B_mfa is preserved inside each bin.
CHR5B_mfa_1, CHR5B_mfa_2, CHR5B_mfa_3 = [], [], []
for rec in CHR5B_mfa:
    n_chars = len(rec.seq)
    if n_chars > 70:
        bucket = CHR5B_mfa_3
    elif n_chars > 32:
        bucket = CHR5B_mfa_1
    else:
        bucket = CHR5B_mfa_2
    bucket.append(rec)

In [57]:
# Build the HapB CHR05 coordinate table: one row per record, with the size
# groups emitted in the order mfa1, mfa2, mfa3.
#
# Record ids are expected to look like '<contig>:<start>-<end>(<strand>)',
# e.g. 'APSI_AU3_HapA_CHR05:20835253-20835349(+)'. start/end are kept as the
# strings found in the id (no integer conversion).
#
# Rows are collected in a list and the DataFrame is built once, instead of
# growing it with pd.concat inside the loop (quadratic, and concatenating onto
# an empty DataFrame is deprecated in recent pandas). Passing `columns`
# explicitly also gives an empty result the expected columns.
chr5b_rows = []
for group_name, group_records in (
    ('mfa1', CHR5B_mfa_1),
    ('mfa2', CHR5B_mfa_2),
    ('mfa3', CHR5B_mfa_3),
):
    for record in group_records:
        header = record.id
        locus = header.split(':')[1]  # '<start>-<end>(<strand>)'
        chr5b_rows.append({
            'chrom': 'Chr05B',
            'start': locus.split('-')[0],
            # Cut at '(' so a '-' strand sign is not mistaken for the separator.
            'end': locus.split('-')[1].split('(')[0],
            'strand': header.split('(')[1].split(')')[0],
            'name': group_name,
        })

CHR5B_bed = pd.DataFrame(
    chr5b_rows,
    columns=['chrom', 'start', 'end', 'strand', 'name'],
)

In [59]:
# Write the HapB table as tab-separated text with a header row and without the
# index column.
# NOTE(review): a column-header line is written; the BED format itself has no
# header row, so confirm that downstream consumers of this file tolerate it.
CHR5B_bed.to_csv(
    path_or_buf='CHR5B_mfa.bed',
    sep='\t',
    index=False,
    header=True,
)

In [6]:
# Display the CAAX-filtered records as the cell output (visual sanity check of
# the ids and sequences that the later cells parse).
CAAX_candidates

[SeqRecord(seq=Seq('MIWFTFLFWEQVLIGKAENFPIFSKCSCIIH*'), id='APSI_AU3_HapA_CHR05:20835253-20835349(+)', name='APSI_AU3_HapA_CHR05:20835253-20835349(+)', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MIWFTFLFWEQVLIGKAEFFPMFSKCSCIIY*'), id='APSI_AU3_HapA_CHR05:33855798-33855894(+)', name='APSI_AU3_HapA_CHR05:33855798-33855894(+)', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MIWFTFLFWEQVLIGKAEIFPMFRQCICIIR*'), id='APSI_AU3_HapA_CHR05:41092739-41092835(+)', name='APSI_AU3_HapA_CHR05:41092739-41092835(+)', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MIWFNFLFWEQVLIGKAEFFPMFRKCSCIIY*'), id='APSI_AU3_HapA_CHR05:60158164-60158260(+)', name='APSI_AU3_HapA_CHR05:60158164-60158260(+)', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MILFTFLFWEQVFIGKAEFFPMFRKCICIIH*'), id='APSI_AU3_HapA_CHR05:51237010-51237106(+)', name='APSI_AU3_HapA_CHR05:51237010-51237106(+)', description='', dbxrefs=[]),
 SeqRecord(seq=Seq('MIWFTFLFWEQELIGKAECFPMFSKCCCIIY*'), id='APSI_AU3_HapB_CHR12:1623877-1623973

In [15]:
# Build one table row per CAAX candidate: chrom, start, end, strand, type, seq.
#
# Record ids are expected to look like '<prefix>_Hap<X>_<chrom>:<start>-<end>(<strand>)',
# e.g. 'APSI_AU3_HapA_CHR05:20835253-20835349(+)' -> chrom 'CHR05A',
# start '20835253', end '20835349', strand '+'. start/end stay strings.
#
# Changes from the previous version of this cell:
#   * the size class is stored in `mfa_type`, not `type`, so the builtin
#     `type` is no longer shadowed for every later cell;
#   * rows are collected in a list and the DataFrame is built once instead of
#     calling pd.concat per row (quadratic, and concatenating onto an empty
#     DataFrame is deprecated in recent pandas);
#   * `columns` is passed explicitly so an empty input still yields the
#     expected columns.
mfa_rows = []
for record in CAAX_candidates:
    locus = record.id.split(':')[1]  # '<start>-<end>(<strand>)'
    # Text between 'Hap<X>_' and ':' in the id, e.g. 'CHR05'.
    chrom = record.id.split('Hap')[1].split(':')[0].split('_')[1]

    # Append a haplotype suffix: 'B' for ids containing '_ab', 'A' for 'HapA',
    # 'B' for 'HapB'. The three checks are independent.
    # NOTE(review): an id matching both '_ab' and 'HapA'/'HapB' gets two
    # suffix letters (e.g. '...BB') - confirm whether that is intended.
    if re.search(r'_ab', record.id):
        chrom = chrom + 'B'
    if re.search(r'HapA', record.id):
        chrom = chrom + 'A'
    if re.search(r'HapB', record.id):
        chrom = chrom + 'B'

    # Size class from len(record.seq); this length counts every character of
    # the sequence, including a trailing '*' when one is present.
    seq_len = len(record.seq)
    if seq_len <= 32:
        mfa_type = 'mfa2'
    elif seq_len <= 70:
        mfa_type = 'mfa1'
    else:
        mfa_type = 'mfa3'

    mfa_rows.append({
        'chrom': chrom,
        'start': locus.split('-')[0],
        # Cut at '(' so a '-' strand sign is not mistaken for the separator.
        'end': locus.split('-')[1].split('(')[0],
        'strand': locus.split('(')[1].split(')')[0],
        'type': mfa_type,
        'seq': str(record.seq),
    })

mfa_bed = pd.DataFrame(
    mfa_rows,
    columns=['chrom', 'start', 'end', 'strand', 'type', 'seq'],
)

In [16]:
# Write the all-candidate table as tab-separated text with a header row and
# without the index column.
# NOTE(review): a column-header line is written; the BED format itself has no
# header row, so confirm that downstream consumers of this file tolerate it.
mfa_bed.to_csv(
    path_or_buf='mfa.bed',
    sep='\t',
    index=False,
    header=True,
)

In [63]:
# Build the HapA CHR05 coordinate table, one row per record in CHR5A_mfa.
#
# Record ids are expected to look like '<contig>:<start>-<end>(<strand>)',
# e.g. 'APSI_AU3_HapA_CHR05:20835253-20835349(+)'. start/end are kept as the
# strings found in the id (no integer conversion).
#
# Rows are collected in a list and the DataFrame is built once, instead of
# growing it with pd.concat inside the loop (quadratic, and concatenating onto
# an empty DataFrame is deprecated in recent pandas). Passing `columns`
# explicitly also gives an empty result the expected columns.
#
# NOTE(review): every row is labelled 'mfa2' regardless of sequence length,
# whereas the HapB table is binned by length - confirm that this is intended.
chr5a_rows = []
for record in CHR5A_mfa:
    header = record.id
    locus = header.split(':')[1]  # '<start>-<end>(<strand>)'
    chr5a_rows.append({
        'chrom': 'Chr05A',
        'start': locus.split('-')[0],
        # Cut at '(' so a '-' strand sign is not mistaken for the separator.
        'end': locus.split('-')[1].split('(')[0],
        'strand': header.split('(')[1].split(')')[0],
        'name': "mfa2",
    })

CHR5A_bed = pd.DataFrame(
    chr5a_rows,
    columns=['chrom', 'start', 'end', 'strand', 'name'],
)

In [65]:
# Write the HapA table as tab-separated text with a header row and without the
# index column.
# NOTE(review): a column-header line is written; the BED format itself has no
# header row, so confirm that downstream consumers of this file tolerate it.
CHR5A_bed.to_csv(
    path_or_buf='CHR5A_mfa.bed',
    sep='\t',
    index=False,
    header=True,
)