In [1]:
# motif analysis
import os
import pandas as pd
from utils.motif import donor_pattern, acceptor_pattern, default_window_size
from utils.utils import kmer, merge_kmer
from tqdm import tqdm

# Input splits and output location for the motif extraction.
data_dir = os.path.join("workspace", "seqlab-latest")
motif_dir = os.path.join("motif_analysis", "seqlab-latest")
train_data = os.path.join(data_dir, "train.csv")
validation_data = os.path.join(data_dir, "validation.csv")
test_data = os.path.join(data_dir, "test.csv")
paths = [train_data, validation_data, test_data]

# Fail early, and say which split is missing, before any extraction starts.
missing_paths = [p for p in paths if not os.path.exists(p)]
is_exists = not missing_paths
if not is_exists:
    raise FileNotFoundError(f"Missing input file(s): {', '.join(missing_paths)}")

# Create the destination up front so the long extraction cannot fail at write time.
os.makedirs(motif_dir, exist_ok=True)

# Label pattern that marks a window as belonging to each motif kind.
motif_patterns = {"donor": donor_pattern, "acceptor": acceptor_pattern}
motif_columns = ["sequence", "target", "sequence_tokens", "target_tokens"]

for p in paths:
    df = pd.read_csv(p)
    dest_filename = os.path.splitext(os.path.basename(p))[0]
    # One list of records per motif kind; a single window may be recorded under both kinds.
    motif_rows = {motif: [] for motif in motif_patterns}
    for _, r in tqdm(df.iterrows(), total=df.shape[0], desc="Extracting motif"):
        sequence_tokens = r["sequence"].split(" ")
        target_tokens = r["label"].split(" ")

        # Window the tokens and their labels identically so they stay aligned.
        arr_s = kmer(sequence_tokens, default_window_size)
        arr_t = kmer(target_tokens, default_window_size)

        for window_tokens, window_labels in zip(arr_s, arr_t):
            # presumably adjacent label pairs (window 2, stride 1) -- confirm against utils.utils.kmer
            label_pairs = kmer(window_labels, 2, 1)
            for motif, pattern in motif_patterns.items():
                if pattern in label_pairs:
                    motif_rows[motif].append({
                        "sequence": merge_kmer(window_tokens),
                        "target": merge_kmer(window_labels),
                        "sequence_tokens": " ".join(window_tokens),
                        "target_tokens": " ".join(window_labels),
                    })

    # Write "<split>.donor.csv" and "<split>.acceptor.csv"; explicit columns keep the
    # header even when no window matched.
    for motif, rows in motif_rows.items():
        pd.DataFrame(rows, columns=motif_columns).to_csv(
            os.path.join(motif_dir, f"{dest_filename}.{motif}.csv"), index=False
        )



  from .autonotebook import tqdm as notebook_tqdm
Extracting motif: 100%|██████████| 70868/70868 [03:07<00:00, 378.03it/s]
Extracting motif: 100%|██████████| 17717/17717 [00:45<00:00, 389.88it/s]
Extracting motif: 100%|██████████| 6961/6961 [00:17<00:00, 388.71it/s]


In [3]:
# Extract the unique donor and acceptor motifs of every split.
import os
import pandas as pd
from tqdm import tqdm

names = ["train", "validation", "test"]
# Both motif kinds are deduplicated: the cross-reference step reads
# "<split>.donor.unique.csv" as well as "<split>.acceptor.unique.csv".
motif_kinds = ["donor", "acceptor"]
motif_dir = os.path.join("motif_analysis", "seqlab-latest")
motif_files = [
    os.path.join(motif_dir, f"{name}.{kind}.csv")
    for name in names
    for kind in motif_kinds
]
for p in tqdm(motif_files, total=len(motif_files), desc="Extracting unique motif"):
    df = pd.read_csv(p)
    # A motif is identified by its merged sequence together with its merged target.
    udf = df.drop_duplicates(subset=["sequence", "target"])

    # "<split>.<kind>.csv" -> "<split>.<kind>.unique.csv"
    dest_name = f"{os.path.splitext(os.path.basename(p))[0]}.unique.csv"
    dest_file = os.path.join(motif_dir, dest_name)
    udf.to_csv(dest_file, index=False)


Extracting unique motif: 100%|██████████| 3/3 [00:01<00:00,  1.53it/s]


In [8]:
# Cross-reference the splits: flag every unique donor/acceptor motif of the test and
# validation data whose sequence also occurs among the training motifs.
import os
import pandas as pd

dir_path = os.path.join("motif_analysis", "seqlab-latest")
for p in ["donor", "acceptor"]:
    test_path = os.path.join(dir_path, f"test.{p}.unique.csv")
    validation_path = os.path.join(dir_path, f"validation.{p}.unique.csv")
    train_path = os.path.join(dir_path, f"train.{p}.unique.csv")

    train_df = pd.read_csv(train_path)
    validation_df = pd.read_csv(validation_path)
    test_df = pd.read_csv(test_path)

    # Build the lookup once per motif kind instead of scanning the training frame for
    # every row. Missing values are dropped so a NaN sequence is never reported as
    # present, matching an equality comparison.
    train_sequences = set(train_df["sequence"].dropna())

    # The flag is 1 when the sequence is present in the training motifs, else 0; each
    # file is rewritten in place with the extra column.
    for eval_df, eval_path in ((test_df, test_path), (validation_df, validation_path)):
        eval_df["is_motif_in_train_data"] = eval_df["sequence"].isin(train_sequences).astype(int)
        eval_df.to_csv(eval_path, index=False)


In [2]:
import os
import pandas as pd
from transformers import BertTokenizer
from utils.seqlab import Index_Dictionary

# Decode the id columns of the prediction log into readable tokens and save the result
# next to the original log.
log_dir = os.path.join("prediction", "log")
path = os.path.join(log_dir, "prediction_log.csv")
df = pd.read_csv(path)

tokenizer = BertTokenizer.from_pretrained(os.path.join("pretrained", "3-new-12w-0"))


def _parse_ids(text):
    """Turn a space-separated string of ids into a list of ints."""
    return [int(piece) for piece in text.split(" ")]


def _decode_input(row):
    """Map the row's input ids back to tokenizer tokens, joined by spaces."""
    return " ".join(tokenizer.convert_ids_to_tokens(_parse_ids(row["input_ids"])))


def _decode_labels(row, column):
    """Map the ids stored in `column` to their Index_Dictionary entries, joined by spaces."""
    return " ".join([Index_Dictionary[idx] for idx in _parse_ids(row[column])])


df["input_tokens"] = df.apply(_decode_input, axis=1)
df["prediction_tokens"] = df.apply(lambda row: _decode_labels(row, "prediction_ids"), axis=1)
df["target_tokens"] = df.apply(lambda row: _decode_labels(row, "target_ids"), axis=1)
df.to_csv(os.path.join(log_dir, "prediction_log_complete.csv"), index=False)

In [3]:
# os is imported here because this cell builds its path with os.path.join and must not
# depend on an import made by another cell.
import os

import pandas as pd


df = pd.read_csv(os.path.join("prediction", "log", "prediction_log_complete.csv"))
# Target labels whose tokens are collected below.
ss_labels = ["iiE", "iEE", "EEi", "Eii"]

# Keep only the rows whose average F1 score is exactly 1.
all_correct_df = df[df["avg_f1_score"] == 1]
print(all_correct_df.shape)
tokens, predictions, targets = [], [], []
for _, r in all_correct_df.iterrows():
    arr_tokens = r["input_tokens"].split(" ")
    arr_predictions = r["prediction_tokens"].split(" ")
    arr_targets = r["target_tokens"].split(" ")

    # Sanity check: a row with a perfect score must have matching labels at every
    # compared position.
    all_clear = all(a == b for a, b in zip(arr_predictions, arr_targets))
    if not all_clear:
        raise ValueError(f"{arr_predictions}\n{arr_targets}")

    # Collect every position whose target label is one of ss_labels.
    for token, predicted, expected in zip(arr_tokens, arr_predictions, arr_targets):
        if expected in ss_labels:
            tokens.append(token)
            predictions.append(predicted)
            targets.append(expected)

ndf = pd.DataFrame(data={
    "token": tokens,
    "prediction": predictions,
    "target": targets
})
print(ndf.shape)
# Split the collected positions by whether the prediction agrees with the target.
correct_df = ndf[ndf["prediction"] == ndf["target"]]
print(correct_df.shape)
false_df = ndf[ndf["prediction"] != ndf["target"]]
print(false_df.shape)

(45, 31)
(180, 3)
(180, 3)
(0, 3)


In [8]:
# Show the first ten collected rows (token, prediction, target) of ndf.
ndf.head(10)

Unnamed: 0,token,prediction,target
0,AGA,iiE,iiE
1,GAA,iEE,iEE
2,AGG,EEi,EEi
3,GGT,Eii,Eii
4,AGG,iiE,iiE
5,GGA,iEE,iEE
6,AGG,EEi,EEi
7,GGT,Eii,Eii
8,AGG,iiE,iiE
9,GGA,iEE,iEE


In [10]:
# Show the first ten rows of false_df, i.e. rows whose prediction differs from the target.
false_df.head(10)

Unnamed: 0,token,prediction,target
1108,GGT,iii,iEE
2770,GGT,iii,iEE
3156,GGT,iii,iEE
3407,AGG,iii,iiE
3408,GGT,iii,iEE
3982,GGT,iii,iEE
3995,AGG,EEE,iiE
4024,GGT,iii,iEE
4031,AGG,iii,iiE
4032,GGT,iii,iEE
