In [1]:
import os
import pandas as pd

In [24]:
path_before_clu_0 = '../data/raw/from_lab'
path_before_clu_1 = '../data/processed/fasta_sequences/'

path_after_clu = '../data/processed/trimmed_sequences/clustalo_clipkit_auto.fasta'

path_after_filter = '../data/processed/seq_for_tree/seq_for_tree_v6.fasta'

In [3]:
def get_ids_from_file(file_path):
    """Извлекает ID последовательностей из FASTA файла."""
    ids = []
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('>'):
                seq_id = line[1:].split()[0].strip()
                ids.append(seq_id)
    return ids

In [4]:
def parse_source_folders(path0, path1):
    """Парсит файлы .faa из path0 и .fasta из path1, возвращает DataFrame с ID и источниками."""
    data = []
    # Обрабатываем файлы из первой папки
    for filename in os.listdir(path0):
        if filename.endswith('.faa'):
            full_path = os.path.join(path0, filename)
            ids = get_ids_from_file(full_path)
            for seq_id in ids:
                data.append({'ID': seq_id, 'source': filename})
    
    # Обрабатываем файлы из второй папки
    for filename in os.listdir(path1):
        if filename.endswith('.fasta'):
            full_path = os.path.join(path1, filename)
            ids = get_ids_from_file(full_path)
            for seq_id in ids:
                data.append({'ID': seq_id, 'source': filename})
    
    return pd.DataFrame(data)


In [5]:
def read_single_file_ids(file_path):
    """Читает ID из одиночного FASTA файла и возвращает множество ID."""
    return set(get_ids_from_file(file_path))

In [25]:
# Создаем таблицу соответствия ID и источников
source_df = parse_source_folders(path_before_clu_0, path_before_clu_1)

# Собираем ID из файлов
after_clu_ids = read_single_file_ids(path_after_clu)
after_filter_ids = read_single_file_ids(path_after_filter)

# Группируем по источникам и создаем множества ID
result = (source_df.groupby('source')['ID']
          .agg(**{
              'before_clu': 'count',
              'after_clu': lambda x: x.isin(after_clu_ids).sum(),
              'after_filter': lambda x: x.isin(after_filter_ids).sum()
          })
          .reset_index())

# result.to_csv("result_statistics.csv", index=False)
# print("Результат сохранен в файл result_statistics.csv")

result

Unnamed: 0,source,before_clu,after_clu,after_filter
0,AAA_ATPase_15-21_family.fasta,261,67,57
1,ABC_ATPase_PARIS.fasta,456,406,360
2,AbiL.faa,8042,2086,955
3,GajA.faa,11200,1996,1140
4,HEC-02.faa,1248,419,179
5,HEC-03.faa,540,270,116
6,HEC-04.faa,1348,143,136
7,LmuB.faa,8395,2009,290
8,PDC-M01A.faa,1722,469,451
9,Ppl.faa,927,198,50


In [21]:
# Добавляем колонки с процентами
numeric_cols = result.columns[1:] 
for col in numeric_cols:
    result[f'{col}_pct'] = (result[col] / result[col].sum()) * 100

pct_columns = [col for col in result.columns if col.endswith('_pct')]
final_result = result[['source'] + pct_columns]

Округляем и добавляем '%'
final_result[pct_columns] = final_result[pct_columns].round(2)
for col in pct_columns:
    final_result[col] = final_result[col].astype(str) + '%'


final_result

# final_result.to_csv('result_percentages.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_result[pct_columns] = final_result[pct_columns].round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_result[col] = final_result[col].astype(str) + '%'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_result[col] = final_result[col].astype(str) + '%'
A value is trying to be set on

Unnamed: 0,source,before_clu_pct,after_clu_pct,after_filter_pct
0,AAA_ATPase_15-21_family.fasta,0.51%,0.48%,0.64%
1,ABC_ATPase_PARIS.fasta,0.9%,2.88%,4.04%
2,AbiL.faa,15.83%,14.79%,10.97%
3,GajA.faa,22.05%,14.16%,13.42%
4,HEC-02.faa,2.46%,2.97%,2.04%
5,HEC-03.faa,1.06%,1.91%,1.34%
6,HEC-04.faa,2.65%,1.01%,1.59%
7,LmuB.faa,16.53%,14.25%,1.6%
8,PDC-M01A.faa,3.39%,3.33%,5.27%
9,Ppl.faa,1.82%,1.4%,0.48%
