In [73]:
from pathlib import Path
import pandas as pd

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# from Bio.Align import MultipleSeqAlignment
# import Bio.Align.Applications

In [93]:
def build_path_clones(sample_name, base_dir=None):
    """Return the path of the clone table (TSV) belonging to ``sample_name``.

    The layout is
    ``<base_dir>/<seq_batch>/<sample_name>/clones/<sample_name>_novj_with_clones.tsv``
    where ``seq_batch`` is the part of ``sample_name`` before its first ``-``
    (the whole name when it contains no ``-``).

    Parameters
    ----------
    sample_name : str
        Sample identifier, e.g. ``"S3987Nr2-RAMOS_light"``.
    base_dir : str or pathlib.Path, optional
        Root directory of the sample tree. Defaults to
        ``/data/samples/AIRR-Seq/OURS`` so existing callers are unaffected.

    Returns
    -------
    pathlib.Path
        Path to the clone table; its existence is not checked here.
    """
    root = Path("/data/samples/AIRR-Seq/OURS") if base_dir is None else Path(base_dir)
    # Everything before the first dash names the sequencing-batch directory.
    seq_batch = sample_name.partition("-")[0]
    return root / seq_batch / sample_name / "clones" / f"{sample_name}_novj_with_clones.tsv"

def read_clones(path):
    """Load a tab-separated clone table ordered by ``duplicate_count``, largest first.

    ``path`` is handed to ``pandas.read_csv`` unchanged. The original row
    labels are kept, so the returned index reflects the file order.
    """
    table = pd.read_csv(path, sep="\t")
    return table.sort_values(by=["duplicate_count"], ascending=False)

def write_fasta(fpath, seq_records):
    """Write ``seq_records`` to ``fpath`` in FASTA format using ``SeqIO.write``."""
    output_format = "fasta"
    SeqIO.write(seq_records, fpath, output_format)

def sequenceRecords_from_clones(clones_df, sequence_colname, num_seqs, sample_name):
    """Build one ``SeqRecord`` for each of the first ``num_seqs`` rows of ``clones_df``.

    Parameters
    ----------
    clones_df : pandas.DataFrame
        Clone table providing ``duplicate_count``, ``clone_id`` and the column
        named by ``sequence_colname``. Rows are taken in their current order,
        so sort the frame beforehand if the most abundant clones are wanted.
    sequence_colname : str
        Column whose value becomes the record's sequence.
    num_seqs : int
        Number of leading rows to convert.
    sample_name : str
        Label embedded in every record header.

    Returns
    -------
    list of SeqRecord
        Each record uses ``<row label>|<clone_id>|<sample_name>|<fraction>``
        as both ``id`` and ``description``; ``fraction`` is the row's
        ``duplicate_count`` divided by the sum over the whole table
        (a value in [0, 1], not a percentage).
    """
    # Read from the argument. The previous body referenced a notebook-level
    # variable named `clones`, which ignored the caller's frame entirely.
    total_counts = clones_df.duplicate_count.sum()
    records = []

    for id_, series in clones_df.iloc[:num_seqs].iterrows():
        fraction = series.duplicate_count / total_counts
        header = f"{id_}|{series.clone_id}|{sample_name}|{fraction}"
        records.append(
            SeqRecord(
                Seq(series[sequence_colname]),
                id=header,
                description=header,
            )
        )
    return records

In [166]:
# Samples to process. Commented-out entries are kept for reference and are
# skipped by any loop over this list. The part before the first "-" is used
# by build_path_clones as the sequencing-batch directory name.
SAMPLE_NAMES = [
#     "S3987Nr1-PBMC1_heavy",
#     "S3987Nr1-PBMC1_light",
#     "S3987Nr1-RAMOS_heavy",
#     "S3987Nr1-RAMOS_light",
#     "S3987Nr2-PBMC1_heavy",
#     "S3987Nr2-PBMC1_light",
#     "S3987Nr2-RAMOS_heavy",
    "S3987Nr2-RAMOS_light",
]

In [167]:
# clones = read_clones(build_path_clones(SAMPLE_NAMES[2]))

# print(clones.columns)
# clones.head(5)

# seq_records = sequenceRecords_from_clones(clones, "sequence", 10, "S3987Nr1-RAMOS-heavy")
# write_fasta("test.fasta", seq_records)

In [169]:
# Column of the clone table that is exported as the FASTA sequence.
seq_feature = "cdr3"  # sequence, cdr3, Translation
# For every selected sample: load its clone table (read_clones sorts it by
# duplicate_count, descending), convert the first 10 rows to records and
# write them to "<sample_name>-<seq_feature>.fasta" (a relative path).
for sample_name in SAMPLE_NAMES:
    clones = read_clones(build_path_clones(sample_name))
    seq_records = sequenceRecords_from_clones(clones, seq_feature, 10, sample_name)
    write_fasta(f"{sample_name}-{seq_feature}.fasta", seq_records)

### Analyze guides

In [124]:
def analyze_effect(effect_str):
    """Classify an indel label as ``"frameshift"``, ``"no-frameshift"`` or ``"unknown"``.

    The first character selects the label kind:

    * ``"D"`` - the size is the integer between the ``D`` and the first space.
    * ``"I"`` - the size is the integer between the ``I`` and the first ``+``.
    * anything else - ``"unknown"``.

    A size that is a multiple of three is ``"no-frameshift"``; every other
    size is ``"frameshift"``. A size that does not parse as an integer raises
    ``ValueError``; an empty string raises ``IndexError``.
    """
    kind = effect_str[0]
    if kind == 'D':
        separator = " "
    elif kind == 'I':
        separator = "+"
    else:
        return "unknown"

    # Leading token looks like "<kind><size>"; drop the kind letter.
    size_token = effect_str.split(separator)[0]
    indel_size = int(size_token[1:])
    return "no-frameshift" if indel_size % 3 == 0 else "frameshift"

In [158]:
# Fetch the page over the network and keep the first HTML table pandas finds.
# NOTE(review): the URL embeds a batchId, presumably tied to one specific
# CRISPOR job - confirm it is still valid before re-running this cell.
lindel = pd.read_html("http://crispor.tefor.net/crispor.py?batchId=QM7zw8eNpm9eL2a3w5IZ&pamId=s283%2B&showMh=lindel")[0]
# NOTE(review): not the last expression of the cell, so with default notebook
# display settings this preview is presumably not rendered - confirm.
lindel.head()

# Classify every entry of the 'Effect' column with analyze_effect.
effects = pd.Series(map(analyze_effect, lindel['Effect']))
# Drop the last character of each 'Probability' string (presumably a '%'
# sign - confirm), parse as float32 and scale by 10**-2.
probabilities = lindel['Probability'].str[:-1].astype('float32') * 10**(-2)

In [165]:
# Summed probability of all outcomes classified as "frameshift".
frameshift_rows = effects == "frameshift"
probabilities[frameshift_rows].sum()

0.71409994