In [None]:
import glob
import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plots import pointplot
from tqdm.notebook import tqdm

## process mutated results

In [None]:
# Collect every per-position inference shard for the mutated dataset.
# The order in which glob.glob() returns paths depends on the file system, so
# sort them: everything built from `files` then has a reproducible row order.
files = sorted(glob.glob('./results/mutated_per_pos_inference/results/*.parquet'))
files

In [None]:
# Read every shard and stack them into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each shard keeps its own index, so index labels can repeat across shards.
df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)
len(df)

In [None]:
# Columns that the long-format conversion further down never reads.
drop_columns = [
    "antibody_datatype",
    "dataset_columns",
    "tokenized_sequence",
]
df = df.drop(drop_columns, axis="columns")

In [None]:
# Sanity check: True only when every sequence_id occurs exactly 5 times
# (presumably once per model -- TODO confirm what the 5 stands for).
rows_per_sequence = df['sequence_id'].value_counts()
(rows_per_sequence == 5).all()

#### load original dataset for gene segment masks

In [None]:
# Only the join key and the two gene segment mask columns are needed.
keep = ['sequence_id', 'gene_segment_mask_aa_heavy', 'gene_segment_mask_aa_light']
# usecols makes read_csv parse just these columns instead of loading the whole
# file and discarding the rest; the trailing [keep] pins the column order to
# the order listed above (usecols alone follows the order in the file).
ref = pd.read_csv('./data/mutated_donors-combined.csv', usecols=keep)[keep]

In [None]:
# Attach the gene segment masks to every inference row (inner join on
# sequence_id, so rows without a match in `ref` are dropped).
# validate='many_to_one' makes pandas raise a MergeError if `ref` contains a
# sequence_id more than once, instead of silently duplicating inference rows.
df = df.merge(ref, on='sequence_id', how='inner', validate='many_to_one')

#### convert to long format

In [None]:
# Explode every inference row into one record per token position.
# NOTE(review): the unmutated section of this notebook repeats the same
# conversion; a shared helper function would remove the duplication.
long_data = []
for ppi in tqdm(df.itertuples(index=False), total=df.shape[0]):
    # Heavy and light chains are concatenated, so the heavy-chain length marks
    # the boundary between "H" and "L" positions.
    hlen = len(ppi.sequence_aa_heavy)
    sequence = ppi.sequence_aa_heavy + ppi.sequence_aa_light
    # Assumes the mask columns are strings, so `+` concatenates them -- TODO confirm.
    cdr_mask = list(ppi.cdr_mask_aa_heavy + ppi.cdr_mask_aa_light)
    gene_segment_mask = list(ppi.gene_segment_mask_aa_heavy + ppi.gene_segment_mask_aa_light)

    tokenized = ppi.tokenized_seq_wo_special
    losses = ppi.loss
    perplexities = ppi.perplexity
    probabilities = ppi.probabilities
    predictions = ppi.prediction
    prediction_tokens = ppi.prediction_tokens

    # Fail fast, naming the offending row, when any per-position input is
    # shorter than the tokenized sequence. Otherwise the mismatch only shows up
    # as a bare IndexError after part of the sequence has already been appended.
    # Inputs longer than `tokenized` are allowed: only the first
    # len(tokenized) positions are emitted.
    n_positions = len(tokenized)
    per_position_inputs = {
        "sequence": sequence,
        "cdr_mask": cdr_mask,
        "gene_segment_mask": gene_segment_mask,
        "loss": losses,
        "perplexity": perplexities,
        "probabilities": probabilities,
        "prediction": predictions,
        "prediction_tokens": prediction_tokens,
    }
    too_short = [
        name for name, values in per_position_inputs.items()
        if len(values) < n_positions
    ]
    if too_short:
        raise ValueError(
            f"sequence_id={ppi.sequence_id!r}, model={ppi.model!r}: "
            f"{n_positions} tokens but fewer entries in {too_short}"
        )

    # `position` indexes the concatenated heavy+light sequence.
    for position, tok in enumerate(tokenized):
        prob = probabilities[position]
        pred = predictions[position]
        pred_tok = prediction_tokens[position]
        res = sequence[position]
        long_data.append({
            "sequence_id": ppi.sequence_id,
            "model": ppi.model,
            "donor": ppi.donor,
            "position": position,
            "chain": "H" if position < hlen else "L",
            "label": res,
            "label_token": tok,
            # Probability at the index of the label token.
            "label_probability": prob[tok],
            # 1.0 when the predicted value equals the label residue, else 0.0.
            "accuracy": float(res == pred),
            "predicted": pred,
            "predicted_token": pred_tok,
            "predicted_probability": prob[pred_tok],
            "loss": losses[position],
            "perplexity": perplexities[position],
            "cdr_mask": cdr_mask[position],
            "gene_segment_mask": gene_segment_mask[position],
        })

In [None]:
# Build the long-format table from the per-position records (one dict per row).
long_df = pl.DataFrame(data=long_data)

In [None]:
# Persist the long-format mutated results as a parquet file.
mutated_long_path = './results/mutated_per_pos_inference/mutated_long-results.parquet'
long_df.write_parquet(mutated_long_path)

## process unmutated results

In [None]:
# Collect every per-position inference shard for the unmutated dataset.
# The order in which glob.glob() returns paths depends on the file system, so
# sort them: everything built from `files` then has a reproducible row order.
files = sorted(glob.glob('./results/unmutated_per_pos_inference/results/*.parquet'))
files

In [None]:
# Read every shard and stack them into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each shard keeps its own index, so index labels can repeat across shards.
df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)
len(df)

In [None]:
# Columns that the long-format conversion further down never reads.
drop_columns = [
    "antibody_datatype",
    "dataset_columns",
    "tokenized_sequence",
]
df = df.drop(drop_columns, axis="columns")

In [None]:
# Sanity check: True only when every sequence_id occurs exactly 5 times
# (presumably once per model -- TODO confirm what the 5 stands for).
rows_per_sequence = df['sequence_id'].value_counts()
(rows_per_sequence == 5).all()

#### load original dataset for gene segment masks

In [None]:
# Only the join key and the two gene segment mask columns are needed.
keep = ['sequence_id', 'gene_segment_mask_aa_heavy', 'gene_segment_mask_aa_light']
# usecols makes read_csv parse just these columns instead of loading the whole
# file and discarding the rest; the trailing [keep] pins the column order to
# the order listed above (usecols alone follows the order in the file).
ref = pd.read_csv('./data/unmutated_donors-combined.csv', usecols=keep)[keep]

In [None]:
# Attach the gene segment masks to every inference row (inner join on
# sequence_id, so rows without a match in `ref` are dropped).
# validate='many_to_one' makes pandas raise a MergeError if `ref` contains a
# sequence_id more than once, instead of silently duplicating inference rows.
df = df.merge(ref, on='sequence_id', how='inner', validate='many_to_one')

#### convert to long format

In [None]:
# Explode every inference row into one record per token position.
# NOTE(review): the mutated section of this notebook runs the same
# conversion; a shared helper function would remove the duplication.
long_data = []
for ppi in tqdm(df.itertuples(index=False), total=df.shape[0]):
    # Heavy and light chains are concatenated, so the heavy-chain length marks
    # the boundary between "H" and "L" positions.
    hlen = len(ppi.sequence_aa_heavy)
    sequence = ppi.sequence_aa_heavy + ppi.sequence_aa_light
    # Assumes the mask columns are strings, so `+` concatenates them -- TODO confirm.
    cdr_mask = list(ppi.cdr_mask_aa_heavy + ppi.cdr_mask_aa_light)
    gene_segment_mask = list(ppi.gene_segment_mask_aa_heavy + ppi.gene_segment_mask_aa_light)

    tokenized = ppi.tokenized_seq_wo_special
    losses = ppi.loss
    perplexities = ppi.perplexity
    probabilities = ppi.probabilities
    predictions = ppi.prediction
    prediction_tokens = ppi.prediction_tokens

    # Fail fast, naming the offending row, when any per-position input is
    # shorter than the tokenized sequence. Otherwise the mismatch only shows up
    # as a bare IndexError after part of the sequence has already been appended.
    # Inputs longer than `tokenized` are allowed: only the first
    # len(tokenized) positions are emitted.
    n_positions = len(tokenized)
    per_position_inputs = {
        "sequence": sequence,
        "cdr_mask": cdr_mask,
        "gene_segment_mask": gene_segment_mask,
        "loss": losses,
        "perplexity": perplexities,
        "probabilities": probabilities,
        "prediction": predictions,
        "prediction_tokens": prediction_tokens,
    }
    too_short = [
        name for name, values in per_position_inputs.items()
        if len(values) < n_positions
    ]
    if too_short:
        raise ValueError(
            f"sequence_id={ppi.sequence_id!r}, model={ppi.model!r}: "
            f"{n_positions} tokens but fewer entries in {too_short}"
        )

    # `position` indexes the concatenated heavy+light sequence.
    for position, tok in enumerate(tokenized):
        prob = probabilities[position]
        pred = predictions[position]
        pred_tok = prediction_tokens[position]
        res = sequence[position]
        long_data.append({
            "sequence_id": ppi.sequence_id,
            "model": ppi.model,
            "donor": ppi.donor,
            "position": position,
            "chain": "H" if position < hlen else "L",
            "label": res,
            "label_token": tok,
            # Probability at the index of the label token.
            "label_probability": prob[tok],
            # 1.0 when the predicted value equals the label residue, else 0.0.
            "accuracy": float(res == pred),
            "predicted": pred,
            "predicted_token": pred_tok,
            "predicted_probability": prob[pred_tok],
            "loss": losses[position],
            "perplexity": perplexities[position],
            "cdr_mask": cdr_mask[position],
            "gene_segment_mask": gene_segment_mask[position],
        })

In [None]:
# Build the long-format table from the per-position records (one dict per row).
long_df = pl.DataFrame(data=long_data)

In [None]:
# Persist the long-format unmutated results as a parquet file.
unmutated_long_path = './results/unmutated_per_pos_inference/unmutated_long-results.parquet'
long_df.write_parquet(unmutated_long_path)