# Training and testing Applm (RF model)

In [None]:
# Ubuntu                    22.04

# miniconda3                24.1.2

# python                    3.10.9 
# numpy                     1.26.4
# pandas                    1.5.3
# scikit-learn              1.4.1.post1

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import os
import _pickle as cPickle

In [None]:
def read_seq_pickle(seqid, embed="ohe"):
    """
    Loads the stored embedding and label of a single sequence.

    Every sequence has its own pickle file, named "<seqid>.pickle", inside
    the directory that belongs to the chosen embedding type. The file holds
    two objects pickled back to back: first the embedding, then the label
    (as noted by the original authors: a numpy array of shape (D, 1) and a
    numpy array of shape (1, ) containing 0 or 1).

    Pickle files can execute arbitrary code while loading, so only point
    this function at embedding files you trust.

    Args:
        seqid: The sequence identifier, used as the base name of the file.
        embed: The embedding type, either "ohe" or "xtrimopglm_10b".

    Returns:
        A tuple (x, y) holding the embedding and the label, in that order.

    Raises:
        KeyError: If embed is not one of the known embedding types.
    """
    # Directory that stores the per-sequence pickles of each embedding type
    embedding_dirs = {
        "ohe": "../0_embeddings/avgpool_ohe/",
        "xtrimopglm_10b": "../0_embeddings/avgpool_xtrimopglm_10b/",
    }
    pickle_path = os.path.join(embedding_dirs[embed], seqid + ".pickle")
    with open(pickle_path, "rb") as handle:
        # The two objects were dumped one after the other, so they are read
        # back in the same order.
        embedding = cPickle.load(handle)
        label = cPickle.load(handle)
    return embedding, label

def read_fasta(filepath):
    """
    Reads a FASTA file and returns a dictionary of sequences.

    The file is parsed line by line. A line that starts with '>' opens a new
    record; everything after the '>' on that line (the whole header, not
    just its first word) becomes the sequence identifier. All following
    lines, up to the next header, are concatenated into the sequence.

    Only a '>' in the first column starts a record, so a '>' that appears
    inside a header description (e.g. ">seq1 A>G variant") is kept as part
    of the identifier instead of splitting the record in two.

    Any text before the first header line is ignored. If the same identifier
    occurs more than once, the last record wins.

    Args:
        filepath: The path to the FASTA file.

    Returns:
        A dictionary where keys are sequence identifiers (str) and
        values are the corresponding protein or nucleotide sequences (str),
        in the order the records appear in the file.
    """
    fasta_dict = {}
    seq_id = None  # None until the first header line has been seen
    seq_parts = []
    with open(filepath, "r") as file:
        for line in file:
            line = line.rstrip("\n")
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if seq_id is not None:
                    fasta_dict[seq_id] = "".join(seq_parts)
                seq_id = line[1:]
                seq_parts = []
            elif seq_id is not None:
                seq_parts.append(line)
    # Store the last record, which has no following header to close it.
    if seq_id is not None:
        fasta_dict[seq_id] = "".join(seq_parts)
    return fasta_dict

def read_dataset(fasta_filepath, embed="ohe"):
    """
    Reads a FASTA file and loads the stored embedding and label of every
    sequence listed in it.

    Args:
        fasta_filepath: The path to the FASTA file.
        embed: The embedding type, passed on to read_seq_pickle
            (defaults to "ohe").

    Returns:
        A tuple (seqid, x, y). seqid is the list of sequence names in file
        order. x and y are numpy arrays built from the embeddings and the
        labels, with their length-1 axes squeezed out. When the file holds
        exactly one sequence, the leading sample axis is kept, so x is
        2-dimensional and y is 1-dimensional just as for larger datasets.

    """
    seqid = list(read_fasta(fasta_filepath))
    x = []
    y = []
    for name in seqid:
        x_embed, y_label = read_seq_pickle(name, embed)
        x.append(x_embed)
        y.append(y_label)

    x = np.array(x).squeeze()
    y = np.array(y).squeeze()
    if len(seqid) == 1:
        # squeeze() removes every length-1 axis, including the sample axis
        # of a single-sequence dataset. Restore it so that the rows of x and
        # y still line up with seqid.
        x = x.reshape(1, -1)
        y = np.atleast_1d(y)
    return seqid, x, y

## 1. Loading the dataset

In [None]:
# Pick the embedding type, then load the train / validation / test FASTA
# splits stored under ex1. Each call returns the sequence names, the
# embeddings, and the labels of one split.
embed = 'xtrimopglm_10b'
seqid_train, x_train, y_train = read_dataset(
    fasta_filepath='../1_train_splits_fa/ex1/train.fa', embed=embed
)
seqid_valid, x_valid, y_valid = read_dataset(
    fasta_filepath='../1_train_splits_fa/ex1/valid.fa', embed=embed
)
seqid_test, x_test, y_test = read_dataset(
    fasta_filepath='../1_train_splits_fa/ex1/test.fa', embed=embed
)

100%|██████████| 2358/2358 [00:03<00:00, 589.86it/s]


## 2. Training the model

In [None]:
# Fit a random forest on the training split. Both NumPy's global seed and
# the forest's own random_state are fixed to 42.
np.random.seed(42)
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    n_jobs=4,
    random_state=42,
)
clf.fit(x_train, y_train)


## 3. Saving predictions

In [None]:
# Score the validation and test splits. Column 1 of predict_proba is the
# probability of the class at index 1 of clf.classes_ (presumably label 1;
# confirm against the training labels).
y_valid_hat = clf.predict_proba(x_valid)[:, 1]
y_test_hat = clf.predict_proba(x_test)[:, 1]

# Write one CSV per split with the sequence name, the predicted probability
# and the true label.
valid_df = pd.DataFrame(
    {
        "seqid": seqid_valid,
        "y_hat": y_valid_hat.squeeze(),
        "y": np.squeeze(y_valid),
    }
)
valid_df.to_csv("../2_results/ex1/valid_labeled.csv")

test_df = pd.DataFrame(
    {
        "seqid": seqid_test,
        "y_hat": y_test_hat.squeeze(),
        "y": np.squeeze(y_test),
    }
)
test_df.to_csv("../2_results/ex1/test_labeled.csv")