# PREDICTING RSA CLUSTER LABELS FOR ANY SITE

## IMPORTING NECESSARY PACKAGES

In [1]:
from fragsys_ml import *

2024-01-31 11:27:25.835114: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## FUNCTIONS

In [2]:
def generate_rsa_vectors(rsa_profs, max_len = None):
    """
    From an RSA profile dict, generates the RSA feature vectors
    used as input for the MLP.

    Each site is encoded as an 11-element vector: the site size
    relative to max_len, followed by the proportion of the site's
    RSA values falling in each of 10 bins:
    b1 = [0, 10), b2 = [10, 20), ... b10 = [90, MAX).

    Parameters
    ----------
    rsa_profs : dict
        {site_id: [site RSA profile], ... }
    max_len : int, optional
        Site size used to normalise the first feature. If not
        provided, it takes the size of the longest site in rsa_profs.

    Returns
    -------
    pandas.DataFrame
        One row per site, indexed by site ID in the iteration order of
        rsa_profs, with 11 columns (0 to 10). An empty rsa_profs
        gives a DataFrame with no rows.

    Raises
    ------
    ZeroDivisionError
        If a site has an empty RSA profile, or if max_len is 0.
    """
    n_bins = 10
    bin_width = 10

    if max_len is None:
        # default = 0 so an empty dict falls through to an empty DataFrame
        max_len = max((len(v) for v in rsa_profs.values()), default = 0)

    bs_vectors = []
    for rsa_sig in rsa_profs.values():
        rsa_sig_len = len(rsa_sig)
        rsa_range_prop = [0] * n_bins
        for rsa in rsa_sig:
            # Clamp to a valid bin: values of 100 or more go in the last bin,
            # and negative values go in the first bin instead of wrapping
            # around to the end of the list through negative indexing.
            prop_i = min(max(int(rsa / bin_width), 0), n_bins - 1)
            rsa_range_prop[prop_i] += 1
        rsa_range_prop = [round(i / rsa_sig_len, 3) for i in rsa_range_prop]
        rsa_range_prop.insert(0, rsa_sig_len / max_len) # site size relative to max_len
        bs_vectors.append(rsa_range_prop)

    # explicit columns keep the 11-feature layout even when there are no rows
    vector_df = pd.DataFrame(bs_vectors, index = list(rsa_profs.keys()), columns = range(n_bins + 1))
    return vector_df

## INPUT DATA

In [3]:
# Directory that the results directory is resolved against
main_dir = "./../"

# Directory the input pickle is read from
results_dir = os.path.join(main_dir, "results")

# Example input; any {site_id: [site RSA profile], ... } dict can be used instead.
# NOTE(review): load_pickle presumably unpickles the file - only point it at trusted files.
rsa_profs = load_pickle(os.path.join(results_dir, "rsa_profs.pkl"))

In [4]:
# Print the first five sites of the input dict as a quick sanity check
for position, (site_id, rsa_profile) in enumerate(rsa_profs.items()):
    if position >= 5:
        break
    print("{} : {}\n".format(site_id, rsa_profile))

H0Y4R8_0_BS0 : [22.4, 39.2, 41.4, 46.5, 63.0, 77.7]

O43809_0_BS0 : [0.7, 0.7, 0.7, 1.5, 3.0, 4.9, 5.5, 6.8, 8.5, 11.2, 14.4, 15.3, 17.9, 19.4, 19.6, 25.0, 29.9, 30.2, 30.4, 33.0, 33.9, 44.9, 52.1, 78.4]

O43809_0_BS1 : [0.4, 7.1, 8.9, 15.8, 20.7, 21.3, 27.4, 37.5, 47.8, 52.8, 63.2, 76.2, 78.9]

O43809_0_BS2 : [18.9, 21.0, 25.0, 33.5, 60.1, 63.4, 70.7, 72.7]

O43809_0_BS3 : [0.8, 5.9, 12.2, 15.7, 25.0, 33.5, 38.3, 39.7, 43.4, 63.2]



## GENERATING RSA FEATURE VECTORS DATAFRAME

In [5]:
# Build the 11-feature RSA vector of every site (one row per site ID).
# NOTE(review): max_len is not passed, so the site-size feature is scaled by the
# longest site in this input. If the model was trained with a different maximum
# site size, pass that value as max_len here - TODO confirm against training.
vector_df = generate_rsa_vectors(rsa_profs)

In [6]:
# Sort the rows by site ID. X is still a DataFrame, so the binding site IDs are kept as its index.
X = vector_df.sort_index()

# Site IDs in the same order as the rows of X
labels = X.index.tolist()

## RUNNING MODEL ON TEST DATA

In [7]:
# Path of the saved MLP model, built from results_dir so that it follows the
# same results directory as the input data instead of repeating the path.
model_path = os.path.join(results_dir, "79619_2_model_epoch_49_train_acc_0.96.h5")
final_model = keras.models.load_model(model_path)

# Model output for every row of X (rows follow the order of X, i.e. of labels)
final_preds = final_model.predict(x = X, batch_size = 27, verbose = 0)

# Index of the highest value along the last axis of the model output,
# used as the predicted cluster label of each site
rounded_predictions = np.argmax(final_preds, axis = -1)

2024-01-31 11:29:58.592985: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## READ RSA CLUSTER LABELS

In [8]:
# Map every site ID to the prediction found at the same position
predicted_labels_dict = {
    site_id: rounded_predictions[position]
    for position, site_id in enumerate(labels)
}

In [9]:
# Print the first five site ID / predicted label pairs
for site_id, cluster_label in list(predicted_labels_dict.items())[:5]:
    print("{} : {}\n".format(site_id, cluster_label))

H0Y4R8_0_BS0 : 3

O15178_0_BS0 : 1

O15178_0_BS1 : 2

O15178_0_BS10 : 1

O15178_0_BS2 : 2

