In [1]:
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [2]:
# Load the effector table and reduce it to the shared
# class / seqid / seq / set column layout.
eff = pd.read_csv("raw/fungal_effectors.tsv", sep="\t")

eff = (
    eff
    .assign(**{
        # Rows whose "Validated" field is "no" are labelled homolog,
        # everything else effector.
        "class": np.where(eff["Validated"] == "no", "homolog", "effector"),
        # Blank the set label wherever the EffectorP2 flag is False.
        "Set": eff["Set"].mask(~eff["EffectorP2"]),
    })
    .loc[:, ["class", "UniqueID", "Sequence", "Set"]]
    .rename(columns={"UniqueID": "seqid", "Sequence": "seq", "Set": "set"})
)
eff.head()

Unnamed: 0,class,seqid,seq,set
0,homolog,KEQ67658.1,MLFNSILATAMLFAASAVALPVELEARQSSTTCGNTYYSASQVSAA...,
1,homolog,XP_013431971.1,MLFKSIFASAVLFAASSIALPTDLEARQQATTCGSTSYSASQVRSA...,
2,homolog,KEQ81621.1,MLFQSVFTTAVLFVASTIALPTDLESRQSATTCGSTSYTAAQVRSA...,
3,homolog,XP_013344128.1,MLFQSIFTTAVLFAASAIALPTDLEARQQATRCGSTSYTAAQVRSA...,
4,homolog,XP_007678837.1,MRFLFATAAFFAAAAFAFPLTQRQSSTTCGKNRYTTKQVNAALNQG...,


In [3]:
# Column names for the headerless homologue search table.
columns = [
    "target", "query", "evalue", "pident", "bits",
    "qstart", "qend", "qlen",
    "tstart", "tend", "tlen",
    "theader", "sequence",
]

# Keep only the hit id and its sequence, label every hit as a homolog,
# and collapse repeated hits of the same target.
eff_hom = (
    pd.read_csv(
        "processed/effector_homologues.tsv",
        sep="\t",
        names=columns,
        skipinitialspace=True,
    )
    .assign(**{"class": "homolog"})
    .loc[:, ["class", "target", "sequence"]]
    .rename(columns={"target": "seqid", "sequence": "seq"})
    .drop_duplicates()
)

eff_hom.head()

Unnamed: 0,class,seqid,seq
0,homolog,UniRef90_A0A010PZM0,MKFSTALFTIATLATSVFSTALSPLDNSAVGAVNMDVREVAPLDSS...
1,homolog,UniRef90_A0A010Q8R0,MQLSSILLSVGLMAAGADAKLHNAGVCVTNRQEMPVGGTGWSVSYT...
2,homolog,UniRef90_A0A010QGI7,MVTFNSLFLAFASVTSVLSAPGELMKRQSTPSSTGTHNGYYYSWWT...
3,homolog,UniRef90_A0A010QLX3,MLAKLSLLPLLSAVVSASPLLDARAPVAALDERAVTVSSADLSNFE...
4,homolog,UniRef90_A0A010QUB8,MQFSNLVTILSSVAAAAAVSVSYDTGYDDGSRSLTAVSCSDGANGL...


In [4]:
localized = pd.read_csv(
    "processed/localised.tsv",
    sep="\t",
    names=["class", "seqid", "seq"],
)

# A small number of ids appear in both the secreted and non-secreted lists.
# We keep the secreted record and remove them from the non-secreted.
is_secreted = localized["class"] == "secreted"
secreted_ids = set(localized.loc[is_secreted, "seqid"])

# Keep a row if it is secreted, or if its id never occurs as secreted.
localized = localized.loc[is_secreted | ~localized["seqid"].isin(secreted_ids)]

localized.head()

Unnamed: 0,class,seqid,seq
0,secreted,sp|A0A024RXP8|GUX1_HYPJR,MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...
1,secreted,sp|A0A024SH20|GUN2_HYPJR,MNKSVAPLLLAASILYGGAAAQQTVWGQCGGIGWSGPTNCAPGSAC...
2,secreted,sp|A0A024SH76|GUX2_HYPJR,MIVGILTTLATLATLAASVPLEERQACSSVWGQCGGQNWSGPTCCA...
3,secreted,sp|A0A024SIB3|XYN3_HYPJR,MKANVILCLLAPLVAALPTETIHLDPELAALRANLTERTADLWDRQ...
4,secreted,sp|A0A024SNB7|GUN1_HYPJR,MAPSVTLPLTTAILAIARLVAAQQPGTSTPEVHPKLTTYKCTKSGG...


In [5]:
# Proteome records, read from a headerless class / seqid / seq table.
proteomes = pd.read_csv(
    "processed/proteomes.tsv",
    sep="\t",
    names=["class", "seqid", "seq"],
)
proteomes.head()

Unnamed: 0,class,seqid,seq
0,proteome,Bgrahor_DH14_BLGH_07057-mRNA-1,MQDYFHTIVQMEQFYEYTSRNPLHPILVQYRDSPAFQAGSMEEIDS...
1,proteome,Bgrahor_DH14_BLGH_07058-mRNA-1,MTPEKRSTTTLMWNVAEQPDYDSTREALQEALKAELTYYTYKSVDK...
2,proteome,Bgrahor_DH14_BLGH_07059-mRNA-1,MKISLFASIIAFLSNFMPTLALDGYRCNAQTISREILMREVNKSYD...
3,proteome,Bgrahor_DH14_BLGH_06565-mRNA-1,MSLFEELYEYTDQHPLTNLRAEFIQNYDNKIDPMIAIKLLLAKLLI...
4,proteome,Bgrahor_DH14_BLGH_06566-mRNA-1,MSLFEELYEYTDQHPLTNLRAEFIQNYDNKIDPMIAIKLLLAKLLI...


In [6]:
# Stack the four sources into one table with a fresh 0..n-1 index.
combined = pd.concat(
    [eff, eff_hom, localized, proteomes],
    ignore_index=True
)

# mmseqs extracts only part of the SwissProt-style ids (`sp|...|...`), so
# those records need a separate join key. Start the key as a plain copy of
# seqid; the following cell rewrites the `sp|` entries.
# (A dangling `.loc[...]` expression used to sit here; its result was
# discarded, so it has been removed.)
combined["key"] = combined["seqid"]

combined.head()

Unnamed: 0,class,seqid,seq,set,key
0,homolog,KEQ67658.1,MLFNSILATAMLFAASAVALPVELEARQSSTTCGNTYYSASQVSAA...,,KEQ67658.1
1,homolog,XP_013431971.1,MLFKSIFASAVLFAASSIALPTDLEARQQATTCGSTSYSASQVRSA...,,XP_013431971.1
2,homolog,KEQ81621.1,MLFQSVFTTAVLFVASTIALPTDLESRQSATTCGSTSYTAAQVRSA...,,KEQ81621.1
3,homolog,XP_013344128.1,MLFQSIFTTAVLFAASAIALPTDLEARQQATRCGSTSYTAAQVRSA...,,XP_013344128.1
4,homolog,XP_007678837.1,MRFLFATAAFFAAAAFAFPLTQRQSSTTCGKNRYTTKQVNAALNQG...,,XP_007678837.1


In [7]:
# For keys of the form `sp|X|Y`, keep only the second `|`-separated field.
is_swissprot = combined["key"].str.startswith('sp|')
combined.loc[is_swissprot, "key"] = (
    combined.loc[is_swissprot, "key"].str.split('|').str[1]
)
combined

Unnamed: 0,class,seqid,seq,set,key
0,homolog,KEQ67658.1,MLFNSILATAMLFAASAVALPVELEARQSSTTCGNTYYSASQVSAA...,,KEQ67658.1
1,homolog,XP_013431971.1,MLFKSIFASAVLFAASSIALPTDLEARQQATTCGSTSYSASQVRSA...,,XP_013431971.1
2,homolog,KEQ81621.1,MLFQSVFTTAVLFVASTIALPTDLESRQSATTCGSTSYTAAQVRSA...,,KEQ81621.1
3,homolog,XP_013344128.1,MLFQSIFTTAVLFAASAIALPTDLEARQQATRCGSTSYTAAQVRSA...,,XP_013344128.1
4,homolog,XP_007678837.1,MRFLFATAAFFAAAAFAFPLTQRQSSTTCGKNRYTTKQVNAALNQG...,,XP_007678837.1
...,...,...,...,...,...
201881,proteome,Ztri_IPO323_XP_003857810.1,MFTQQLLALTALAAFTSAHSWVEEYQVISDNGSYTGPLGYTRGFVP...,,Ztri_IPO323_XP_003857810.1
201882,proteome,Ztri_IPO323_XP_003857811.1,MKPHRIRMAHSLIMNYGLYKKLEIYRAKPASKYEMTQFHTDEYVDF...,,Ztri_IPO323_XP_003857811.1
201883,proteome,Ztri_IPO323_XP_003857812.1,MNWYGISVLCLALANVTRAQADPLVDFCRRWSHQTTVVDNKLFIYG...,,Ztri_IPO323_XP_003857812.1
201884,proteome,Ztri_IPO323_XP_003857813.1,MSVQQTINEERMENRATLSLQCESRLITSGHTTPDFINRMNSLLQD...,,Ztri_IPO323_XP_003857813.1


In [8]:
# Cluster assignments: a headerless two-column (cluster, member) table.
clusters = pd.read_csv(
    "processed/clusters.tsv",
    sep="\t",
    names=["cluster", "member"],
)
clusters

Unnamed: 0,cluster,member
0,Q7Z9I5,Q7Z9I5
1,Q8SQH8,Q8SQH8
2,Q8SQH8,Q8SS09
3,Q8SQH8,P40994
4,Q8SQH8,Pterter_W11_CAA9977015.1
...,...,...
201886,Q7SDY6,Bgrahor_DH14_BLGH_05980-mRNA-1
201887,Q7SDY6,Q6C9T0
201888,Q7SDY6,Mlin_CH5_jgi|Melli1|196253|MELLI_sc_1023.2
201889,Q7SDY6,Q9Y7Y2


In [9]:
# Attach every cluster member to its sequence record through the join key,
# then discard the two join columns.
combined_clusters = (
    clusters
    .merge(combined, how="outer", left_on="member", right_on="key")
    .drop(columns=["member", "key"])
    # The clustering was run with the localized duplicates still present,
    # so the join re-duplicates them.
    # If you redo this, double check what is actually duplicated before
    # removing things.
    .drop_duplicates(keep="first")
)
combined_clusters.head()

Unnamed: 0,cluster,class,seqid,seq,set
0,Q7Z9I5,non_secreted,sp|Q7Z9I5|WTF7_SCHPO,MLKMSGSYAPIEDSADELSVHSGNDNEIDLEKGLLPKCNTGNGGTT...,
1,Q8SQH8,non_secreted,sp|Q8SQH8|ARF_ENCCU,MGNMMSKVNNLLYTKLRGLFSGQSERSITMIGLDGAGKTTLLLYLQ...,
2,Q8SQH8,non_secreted,sp|Q8SS09|SAR1_ENCCU,MLDNIQEYLGVVKAKLTEFYEKVFQNFVKSLFGKPSSILFLGIDNA...,
3,Q8SQH8,non_secreted,sp|P40994|ARF3_YEAST,MGNSISKVLGKLFGSKEMKILMLGLDKAGKTTILYKLKLNKIKTST...,
4,Q8SQH8,proteome,Pterter_W11_CAA9977015.1,MLSILRKARLKDKEMRILMLGLDNAGKTTIVKKIMNEDVNSVSPTL...,


In [10]:
# Preview the first five rows of the merged cluster table.
combined_clusters.head(n=5)

Unnamed: 0,cluster,class,seqid,seq,set
0,Q7Z9I5,non_secreted,sp|Q7Z9I5|WTF7_SCHPO,MLKMSGSYAPIEDSADELSVHSGNDNEIDLEKGLLPKCNTGNGGTT...,
1,Q8SQH8,non_secreted,sp|Q8SQH8|ARF_ENCCU,MGNMMSKVNNLLYTKLRGLFSGQSERSITMIGLDGAGKTTLLLYLQ...,
2,Q8SQH8,non_secreted,sp|Q8SS09|SAR1_ENCCU,MLDNIQEYLGVVKAKLTEFYEKVFQNFVKSLFGKPSSILFLGIDNA...,
3,Q8SQH8,non_secreted,sp|P40994|ARF3_YEAST,MGNSISKVLGKLFGSKEMKILMLGLDKAGKTTILYKLKLNKIKTST...,
4,Q8SQH8,proteome,Pterter_W11_CAA9977015.1,MLSILRKARLKDKEMRILMLGLDNAGKTTIVKKIMNEDVNSVSPTL...,


In [11]:
# Show every row whose class is "effector".
combined_clusters.loc[combined_clusters["class"].eq("effector")]

Unnamed: 0,cluster,class,seqid,seq,set
6042,UniRef90_A0A482LME3,effector,FolSix7,MQVMKYLYLLFHFALFASAIPMLDLFPRQGQCFSTTGSTPPRPPPA...,train
6130,UniRef90_Q0C912,effector,FGL1,MRLLSLLSVVTLAVASPLSVEEYAKALDERAVSVSTTDFGNFKFYI...,train
7467,FolSix11,effector,FolSix11,MMFSKAIPISLLISTSHAINICCSSFAGHTCTKDQYNNHRQNVILN...,
7475,PST_Pec6,effector,PST_Pec6,MNITYLGTCFLVIATMLGNSDASGASTPKKCKKTIMHEKDKCWTIG...,train
7478,UmSee1,effector,UmSee1,MLFTTFVSLLLVILCLVHVSAHPLQSFRSSSAIGKQKHKIKSRQFE...,train
...,...,...,...,...,...
189890,UniRef90_A0A1B2CW13,effector,MlAvrL2,MGKGNNIQTPCFRASQLRSFCLIAFLLCQSLQSIVSLPALSSKVEL...,train
192536,UniRef90_A0A2J8D6X4,effector,VdSCP7,MKTCVIATLVGVAMSAPAMRTSMDAPMMEMANSRPMDMDMGSSTPA...,train
197244,UniRef90_A0A139HPX7,effector,CfEcp2,MLFNAAAAAVFAPLLVMGNVLPRNAGNSPGSNRCDASTFNNGQDFD...,train
197252,UniRef90_A0A1E1KN96,effector,FolSix3,MRFLLLIAMSMTWVCSIAGLPVEDADSSVGQLQGRGNPYCVFPGRR...,train


In [12]:
# Numeric rank per class label; looked up when a cluster is given its
# "priority" value.
class_order = dict(
    effector=5,
    homolog=4,
    secreted=3,
    proteome=2,
    non_secreted=1,
)

In [13]:
def appl(table):
    """Pick a representative name, priority and set label for one cluster.

    Members are ranked by sequence length, longest first. The representative
    is the longest effector if the cluster has one, otherwise the longest
    secreted member, otherwise the longest non_secreted member. If none of
    those classes is present, the existing "cluster" value of the longest
    member is kept and the priority is taken from that member's class.

    The "set" label is made uniform across the cluster: "test" if any member
    is "test", otherwise "train" if any member is "train", otherwise the
    column is left as it is.

    Parameters
    ----------
    table : pandas.DataFrame
        Rows of a single cluster with at least the columns "cluster",
        "class", "seqid", "seq" and "set". Must not be empty.

    Returns
    -------
    pandas.DataFrame
        A new frame (rows ordered longest sequence first) with the added
        columns "new_cluster" and "priority". ``table`` itself is not
        modified.

    Notes
    -----
    Reads the module-level ``class_order`` mapping for priorities.
    """
    # Build a fresh frame instead of editing `table`: pandas documents that
    # mutating the group handed to groupby.apply is not supported.
    # A stable sort keeps equal-length members in their incoming order, so the
    # chosen representative does not depend on the sort implementation.
    ranked = (
        table
        .assign(length=table["seq"].str.len())
        .sort_values("length", ascending=False, kind="mergesort")
        .drop(columns=["length"])
    )

    # First class in this order that has any member supplies the name.
    for label in ("effector", "secreted", "non_secreted"):
        members = ranked.loc[ranked["class"] == label]
        if len(members) > 0:
            new_cluster = members.iloc[0]["seqid"]
            priority = class_order[label]
            break
    else:
        # None of the preferred classes present: keep the current cluster name.
        first_row = ranked.iloc[0]
        new_cluster = first_row["cluster"]
        priority = class_order[first_row["class"]]

    ranked["new_cluster"] = new_cluster
    ranked["priority"] = priority

    # One label per cluster so that a cluster never straddles train and test.
    sets = set(ranked["set"].to_list())
    if "test" in sets:
        ranked["set"] = "test"
    elif "train" in sets:
        ranked["set"] = "train"

    return ranked

# Run `appl` on each cluster, replace the cluster name with the chosen
# representative name, and put the rows into a fixed order.
combined_cluster_renamed = (
    combined_clusters
    .groupby("cluster", as_index=False)
    .apply(appl)
    .assign(cluster=lambda frame: frame["new_cluster"])
    .drop(columns="new_cluster")
    .sort_values(["cluster", "class", "seqid"], ignore_index=True)
)
combined_cluster_renamed.head()

Unnamed: 0,cluster,class,seqid,seq,set,priority
0,AKM21218.1,homolog,AKM21218.1,MFFHASEYLLSAILMVSVASSAALPSSQALEPRQGACISALFRPNP...,,4
1,ALVi.Vi1.11,homolog,ALVi.1389.10,MKPFQFLLISILAASSASAKKHRLCCCAGFDACGLFSCEKDSTQSV...,,4
2,ALVi.Vi1.11,homolog,ALVi.1389.11,MKLSYPLLVALLAASVSAKKHSLCCCAGFNACNQFVCDDKSTQNIV...,,4
3,ALVi.Vi1.11,homolog,ALVi.1389.12,MKLLYSLFVSLLAASVSARKHRLCCCAGFNACNQFVCDDYSTQSIV...,,4
4,ALVi.Vi1.11,homolog,ALVi.1389.13,MMLSYSLLFALLAASVSAKKHRLCCCAGFNACNQFVCDDVHTQNLV...,,4


In [14]:
# Rows whose seqid equals their cluster name.
combined_cluster_renamed[
    combined_cluster_renamed["cluster"].eq(combined_cluster_renamed["seqid"])
]

Unnamed: 0,cluster,class,seqid,seq,set,priority
0,AKM21218.1,homolog,AKM21218.1,MFFHASEYLLSAILMVSVASSAALPSSQALEPRQGACISALFRPNP...,,4
10,ALVi.Vi1.11,homolog,ALVi.Vi1.11,MKLFYPLLVYLLTTSVSARRHRLCCCAGFNACNQFVCDDKSTQSII...,,4
19,ALVi.Vi1.12,homolog,ALVi.Vi1.12,MKATLISAILLTLSAVDAKKHRLCCCYGIDEDAPGKWSDKSAVCVQ...,,4
20,ALVi.Vi1.23,homolog,ALVi.Vi1.23,MKLTFTLLAVSLATSVSANSYTLCCCTKPTNIQDLKDPQYWSGTPP...,,4
42,ALVp_11032.1,homolog,ALVp_11032.1,MKLTPALVSSVLISFVSATCYDNCCCTKPQQPSNNWGCDDKAGQAV...,,4
...,...,...,...,...,...,...
201861,sp|W7MS18|FUB10_GIBM7,non_secreted,sp|W7MS18|FUB10_GIBM7,MAGDFSNRAPWKRSACDRCRAQKLRCHRDSGHSTDACLRCLKSGIE...,,1
201863,sp|W7MX26|FUS4_GIBM7,secreted,sp|W7MX26|FUS4_GIBM7,MLTIATLHVALQVFGAFSPSHAAAVTLEHRSARDGNSVAVPANWDV...,,3
201866,sp|W7N2B2|FUB2_GIBM7,non_secreted,sp|W7N2B2|FUB2_GIBM7,MASELKEYLVIIPDLPDVLAKRQVLLKPHNQDAAPLVKAGRVPFFG...,,1
201870,sp|W7N2C1|FUB8_GIBM7,non_secreted,sp|W7N2C1|FUB8_GIBM7,MQKIAKQALSSLSSLAKSPANAMGSISHLPAYGHRLLPVLIDEISR...,,1


In [15]:
# One row per cluster, taken from the rows whose seqid is the cluster name,
# ordered from highest to lowest priority.
rep = (
    combined_cluster_renamed
    .loc[lambda frame: frame["cluster"] == frame["seqid"]]
    .sort_values(by=["cluster", "priority"], ascending=[False, False])
    .groupby("cluster")
    .first()
    .sort_values(by="priority", ascending=False)
    .reset_index()
)

def map_fun(df, test_proportion=0.2, random_state=None):
    """Fill in the missing train/test labels of the "set" column.

    A random ``test_proportion`` of the rows whose "set" is null is labelled
    "test"; every other null "set" becomes "train". Rows that already carry a
    label keep it, and no column other than "set" is changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "set" column.
    test_proportion : float, default 0.2
        Fraction of the unlabelled rows to label "test".
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for a repeatable
        split. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A labelled copy of ``df``; ``df`` itself is not modified.
    """
    labelled = df.copy()
    # Object dtype so string labels can be written even when "set" is all-NaN.
    labelled["set"] = labelled["set"].astype(object)

    test_index = (
        labelled.loc[labelled["set"].isnull()]
        .sample(frac=test_proportion, random_state=random_state)
        .index
    )
    labelled.loc[test_index, "set"] = "test"

    # Only "set" is filled: a frame-wide fillna would also overwrite missing
    # values in unrelated columns with the string "train".
    labelled["set"] = labelled["set"].fillna("train")
    return labelled

# Seed NumPy's global random generator so the random test split below comes
# out the same on every run; DataFrame.sample draws from it when it is not
# given an explicit random_state.
np.random.seed(0)

# Label the representatives train/test separately within each class, then
# reduce the result to a {cluster: set} lookup.
rep = (
    rep
    .groupby("class", as_index=False)
    .apply(map_fun)
    [["cluster", "set"]]
    .set_index("cluster", drop=True)
    ["set"]
    .to_dict()
)

In [16]:
# Give every row the train/test label recorded for its cluster, then write
# the full table out as TSV.
combined_cluster_renamed["set"] = combined_cluster_renamed["cluster"].map(rep.__getitem__)
combined_cluster_renamed.to_csv("processed/representative.tsv", sep="\t", index=False)

In [17]:
# Representative rows are those whose seqid equals the cluster name.
is_representative = combined_cluster_renamed["cluster"] == combined_cluster_renamed["seqid"]
representatives = combined_cluster_renamed.loc[is_representative, ["cluster", "seq"]]

# One FASTA record per representative, using the cluster name as id, name
# and description.
representative_seqs = [
    SeqRecord(Seq(sequence), id=cluster_id, name=cluster_id, description=cluster_id)
    for cluster_id, sequence in zip(representatives["cluster"], representatives["seq"])
]

SeqIO.write(representative_seqs, "processed/representative.fasta", "fasta")

76662

In [18]:
# Representative rows (seqid equals cluster name) labelled "test".
is_test_representative = (
    (combined_cluster_renamed["cluster"] == combined_cluster_renamed["seqid"])
    & (combined_cluster_renamed["set"] == "test")
)
test = combined_cluster_renamed.loc[is_test_representative, ["cluster", "seq"]]

# One FASTA record per row, using the cluster name as id, name and description.
test_seqs = [
    SeqRecord(Seq(sequence), id=cluster_id, name=cluster_id, description=cluster_id)
    for cluster_id, sequence in zip(test["cluster"], test["seq"])
]

SeqIO.write(test_seqs, "processed/test.fasta", "fasta")

15330

In [19]:
# Representative rows (seqid equals cluster name) labelled "train".
is_train_representative = (
    (combined_cluster_renamed["cluster"] == combined_cluster_renamed["seqid"])
    & (combined_cluster_renamed["set"] == "train")
)
train = combined_cluster_renamed.loc[is_train_representative, ["cluster", "seq"]]

# One FASTA record per row, using the cluster name as id, name and description.
train_seqs = [
    SeqRecord(Seq(sequence), id=cluster_id, name=cluster_id, description=cluster_id)
    for cluster_id, sequence in zip(train["cluster"], train["seq"])
]

SeqIO.write(train_seqs, "processed/train.fasta", "fasta")

61332

In [20]:
# NOTE(review): bare reference to the builtin `setattr`; the cell only echoes
# the function's repr and nothing in the visible cells depends on it. Looks
# like a leftover scratch cell that can be deleted — confirm before removing.
setattr

<function setattr(obj, name, value, /)>