In [1]:
import pandas as pd

In [2]:
def process_gene_hit(gene_series):
    return gene_series.apply(lambda x: x.split('*')[0])

In [12]:
def process_file(filename):
    data = pd.read_csv(filename, sep='\t', low_memory=False)
    data = data[['Clone count', 'All V hits','All J hits', 'AA. Seq. CDR3']]
    # data = data[data['Clone count'] > 1]
    data['v_call'] = process_gene_hit(data['All V hits'])
    data['j_call'] = process_gene_hit(data['All J hits'])
    data = data.rename(
        columns={'AA. Seq. CDR3': 'junction_aa', 'Clone count': 'count'}).drop(
        columns=['All V hits', 'All J hits'])
    data['locus'] = 'beta'
    data = data[data.junction_aa.str.isalpha()]
    data = data[data.junction_aa.str.len() > 7].drop_duplicates()
    print(len(data))
    if len(data) > 350000:
        data = data.sample(350000)
    return data

In [13]:
import os
import re
name_to_data = {}
def process_all_f1_day0_15_files(directory, process_file):
    """
    Проходит по всем файлам в директории и вызывает process_file
    для файлов с F1 и днем 0 или 15.
    """
    pattern = re.compile(r'^[A-Z0-9]+_0_F(1|2)_\.txt\.gz$')
    
    for filename in os.listdir(directory):
        if pattern.match(filename):
            file_path = os.path.join(directory, filename)
            print(f"Processing: {file_path}")
            new_file_name = filename.split('.')[0][:-1] + '_with_1_downsampled.txt'
            data = process_file(file_path)
            name_to_data[filename] = data
            data.to_csv(f'{directory}/airr_format/{new_file_name}', sep='\t')

In [14]:
data_dir = "/projects/immunestatus/pogorelyy"
process_all_f1_day0_15_files(data_dir, process_file)

Processing: /projects/immunestatus/pogorelyy/S1_0_F1_.txt.gz
547582
Processing: /projects/immunestatus/pogorelyy/S2_0_F1_.txt.gz
727464
Processing: /projects/immunestatus/pogorelyy/Q1_0_F1_.txt.gz
371114
Processing: /projects/immunestatus/pogorelyy/P2_0_F2_.txt.gz
637709
Processing: /projects/immunestatus/pogorelyy/Q1_0_F2_.txt.gz
402000
Processing: /projects/immunestatus/pogorelyy/P2_0_F1_.txt.gz
628977
Processing: /projects/immunestatus/pogorelyy/S2_0_F2_.txt.gz
722436
Processing: /projects/immunestatus/pogorelyy/P1_0_F1_.txt.gz
598661
Processing: /projects/immunestatus/pogorelyy/S1_0_F2_.txt.gz
481993
Processing: /projects/immunestatus/pogorelyy/Q2_0_F1_.txt.gz
701995
Processing: /projects/immunestatus/pogorelyy/P1_0_F2_.txt.gz
541743
Processing: /projects/immunestatus/pogorelyy/Q2_0_F2_.txt.gz
719043


In [37]:
names = pd.Series(name_to_data.keys())

In [40]:
sizes = pd.Series(map(lambda x: len(x), name_to_data.values()))

In [42]:
df = pd.DataFrame({'name': names, 'size': sizes})

In [44]:
df['patient'] = df.name.apply(lambda x: x.split('_')[0])

In [46]:
df['day'] = df.name.apply(lambda x: x.split('_')[1])

In [48]:
df['replica'] = df.name.apply(lambda x: x.split('_')[2])

In [50]:
df = df.drop(columns=['name'])

In [53]:
df.sort_values(by=['patient', 'day', 'replica'])

Unnamed: 0,size,patient,day,replica
12,118243,P1,0,F1
16,101514,P1,0,F2
15,311112,P1,15,F1
9,161846,P2,0,F1
5,170699,P2,0,F2
2,362331,P2,15,F1
4,66723,Q1,0,F1
7,73604,Q1,0,F2
8,86750,Q1,15,F1
14,193601,Q2,0,F1
