In [2]:
import re
import os

from Bio import AlignIO
import pandas as pd

In [32]:
# Консенсусные мотивы

# # v0
# walker_a_pattern = re.compile(r"(G[A-Z]N[A-Z]{2}GK[ST]|GGG[A-Z]HGK[ST]|GET[A-Z]GKS|G[A-Z]N[A-Z]{2}GKTT|G[A-Z]NG[A-Z]GKST|G[A-Z]N[A-Z]QGK[A-Z]ST|G[A-Z]N[A-Z]QGKTS|G[A/M]N[A-Z]{2}GKT|G[A-Z]N[A-Z]{2}GKSN|G[A-Z]{2}G[A-Z]GK[ST]|G[A-Z]{4}GKTS|G[A-Z]{4}GK)", re.IGNORECASE)

# #v1
# walker_a_pattern = re.compile(r"(G[A-Z]N[A-Z]{2}GK[ST]|GGG[A-Z]HGK[ST]|GET[A-Z]GKS|G[A-Z]N[A-Z]{2}GKTT|G[A-Z]NG[A-Z]GKST|G[A-Z]N[A-Z]QGK[A-Z]ST|G[A-Z]N[A-Z]QGKTS|G[A/M]N[A-Z]{2}GKT|G[A-Z]N[A-Z]{2}GKSN|G[A-Z]{2}G[A-Z]GK[ST]|G[A-Z]{4}GKTS|G[A-Z]{5}GK)", re.IGNORECASE)

# #v2
# walker_a_pattern = re.compile(r"(G[A-Z]N[A-Z]{2}GK[ST]|GGG[A-Z]HGK[ST]|GET[A-Z]GKS|G[A-Z]N[A-Z]{2}GKTT|G[A-Z]NG[A-Z]GKST|G[A-Z]N[A-Z]QGK[A-Z]ST|G[A-Z]N[A-Z]QGKTS|G[A/M]N[A-Z]{2}GKT|G[A-Z]N[A-Z]{2}GKSN|G[A-Z]{2}G[A-Z]GK[ST]|G[A-Z]{4}GKTS|G[A-Z]{5}GK|G[A-Z]{3}GK)", re.IGNORECASE)  # GXXXXGK[T/S]

# #v3
# walker_a_pattern = re.compile(r"(G[A-Z]N[A-Z]{2}GK[ST]|GGG[A-Z]HGK[ST]|GET[A-Z]GKS|G[A-Z]N[A-Z]{2}GKTT|G[A-Z]NG[A-Z]GKST|G[A-Z]N[A-Z]QGK[A-Z]ST|G[A-Z]N[A-Z]QGKTS|G[A/M]N[A-Z]{2}GKT|G[A-Z]N[A-Z]{2}GKSN|G[A-Z]{2}G[A-Z]GK[ST]|G[A-Z]{4}GKTS|G[A-Z]{5}GK|G[A-Z]{3}GK|G[A-Z-]{3,5}GK)", re.IGNORECASE)  # GXXXXGK[T/S]

# #v4
# walker_a_pattern = re.compile(r"(G[A-Z]N[A-Z]{2}GK[ST]|GGG[A-Z]HGK[ST]|GET[A-Z]GKS|G[A-Z]N[A-Z]{2}GKTT|G[A-Z]NG[A-Z]GKST|G[A-Z]N[A-Z]QGK[A-Z]ST|G[A-Z]N[A-Z]QGKTS|G[A/M]N[A-Z]{2}GKT|G[A-Z]N[A-Z]{2}GKSN|G[A-Z]{2}G[A-Z]GK[ST]|G[A-Z]{4}GKTS|G[A-Z]{5}GK|G[A-Z]{3}GK|G[A-Z-]{2,6}GK)", re.IGNORECASE)  # GXXXXGK[T/S]

# #v5
# walker_a_pattern = re.compile(r"(G[A-Z-]{3,10}GK)", re.IGNORECASE)  # GXXXXGK[T/S]

#v6
walker_a_pattern = re.compile(r"(G[A-Z-]{3,15}GK)", re.IGNORECASE)  # GXXXXGK[T/S]


# walker B (не менялся для всех версий)
walker_b_pattern = re.compile(r"(DE|DED|DEPTN[A-Z]{2}D[A-Z]E|DEPTN[A-Z]{2}LD|DTPLGRLD|DS|DQ|D[A-Z]xD|DD|DEGFG|D\(E\)ExE)", re.IGNORECASE)

In [4]:
def find_motifs(alignment_file, format="fasta"):
    alignments = AlignIO.read(alignment_file, format)
    results = []
    
    for record in alignments:
        seq_id = record.id
        # sequence = str(record.seq).replace("-", "")
        sequence = str(record.seq)
        
        # Поиск Walker A
        walker_a_matches = []
        for match in walker_a_pattern.finditer(sequence):
            start, end = match.span()
            walker_a_matches.append(f"{start+1}-{end+1}")
        
        # Поиск Walker B
        walker_b_matches = []
        valid_walker_b = []
        for match in walker_b_pattern.finditer(sequence):
            start, end = match.span()
            walker_b_matches.append(f"{start+1}-{end+1}")
        
        results.append({
            "ID": seq_id,
            "Walker_A": "Yes" if walker_a_matches else "No",
            "Walker_A_Pos": ", ".join(walker_a_matches) if walker_a_matches else "-",
            "Walker_B": "Yes" if walker_b_matches else "No",
            "Walker_B_Pos": ", ".join(walker_b_matches) if walker_b_matches else "-"
        })
    
    return pd.DataFrame(results)

In [33]:
folder_path = "../data/processed/trimmed_sequences/"

In [34]:
for filename in os.listdir(folder_path):
    if filename.endswith(".fasta"):
        fasta_file = os.path.join(folder_path, filename)
        
        df = find_motifs(fasta_file)
        print(f"Найдены мотивы в {filename}")
        
        crosstab = pd.crosstab(df['Walker_B'], df['Walker_A'])
        print(crosstab.head())

        print()

print("Обработка всех FASTA файлов завершена.")

Найдены мотивы в rep_0_clipkit_auto.fasta
Walker_A     No  Yes
Walker_B            
No         1753   32
Yes       10918  685

Найдены мотивы в clustalo_clipkit_auto.fasta
Walker_A    No   Yes
Walker_B            
No         534   124
Yes       4370  8370

Найдены мотивы в clustalo-2_trimmed_auto1.fasta
Walker_A    No
Walker_B      
No        3244
Yes       6373

Найдены мотивы в clustalo_2_clipkit_1.fasta
Walker_A    No   Yes
Walker_B            
No         458    80
Yes       6776  6084

Найдены мотивы в rep_lp_mafft_clipkit_auto.fasta
Walker_A     No  Yes
Walker_B            
No         1757   21
Yes       11170  450

Найдены мотивы в rep_0_clipkit_1.fasta
Walker_A     No  Yes
Walker_B            
No         1753   32
Yes       10918  685

Найдены мотивы в clustalo_clipkit_1.fasta
Walker_A    No   Yes
Walker_B            
No         534   124
Yes       4370  8370

Найдены мотивы в muscle_clipkit_1.fasta
Walker_A    No  Yes
Walker_B           
No        3293   70
Yes       9446  589


## Подробнее смотрим на лучший результат

In [35]:
df = find_motifs("../data/processed/trimmed_sequences/clustalo_clipkit_auto.fasta")
df.head()

Unnamed: 0,ID,Walker_A,Walker_A_Pos,Walker_B,Walker_B_Pos
0,CP036455.1_1856,Yes,3198-3209,Yes,"3766-3768, 5307-5309, 5514-5516, 6223-6225"
1,CP007192.1_1625,No,-,Yes,2449-2451
2,CP019600.1_1988,No,-,Yes,"2110-2112, 2710-2712, 3203-3205"
3,CP011280.1_1120,Yes,3198-3209,Yes,"3791-3793, 3957-3959, 5156-5158, 5307-5309"
4,CP019430.1_1569,Yes,3202-3209,Yes,"3766-3768, 3991-3993, 5307-5309"


In [36]:
pd.crosstab(df['Walker_B'], df['Walker_A'])

Walker_A,No,Yes
Walker_B,Unnamed: 1_level_1,Unnamed: 2_level_1
No,534,124
Yes,4370,8370


In [37]:
# сохраняем список нужных ID

filtered_df = df[(df['Walker_A'] == 'Yes') & (df['Walker_B'] == 'Yes')]

filtered_df['ID'].to_csv('../data/processed/seq_for_tree/seq_for_tree_v6.csv', 
                        index=False, 
                        header=False, 
                        sep='\t', 
                        encoding='utf-8')